diff --git a/.gitignore b/.gitignore index 948b5962eb..d032d3d5dd 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ GPATH include/af/version.h src/backend/version.hpp docs/details/examples.dox +/TAGS diff --git a/.gitmodules b/.gitmodules index 395881a861..c91b7f1585 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "test/gtest"] path = test/gtest url = https://chromium.googlesource.com/external/googletest +[submodule "src/backend/cpu/threads"] + path = src/backend/cpu/threads + url = https://github.com/alltheflops/threads.git diff --git a/CMakeLists.txt b/CMakeLists.txt index c79fbcaab0..0def888f6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -CMAKE_MINIMUM_REQUIRED(VERSION 2.8) +CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12) PROJECT(ARRAYFIRE) SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON) @@ -9,7 +9,6 @@ INCLUDE(AFInstallDirs) OPTION(BUILD_TEST "Build Tests" ON) OPTION(BUILD_EXAMPLES "Build Examples" ON) -OPTION(BUILD_GTEST "Download gtest and check for updates. 
Necessary if you change compilers" ON) OPTION(BUILD_CPU "Build ArrayFire with a CPU backend" ON) @@ -31,9 +30,6 @@ OPTION(BUILD_DOCS "Create ArrayFire Documentation" OFF) OPTION(WITH_COVERAGE "Added code coverage flags" OFF) OPTION(BUILD_NONFREE "Build ArrayFire nonfree algorithms" OFF) -OPTION(BUILD_SIFT "Build ArrayFire nonfree algorithms" OFF) - -MARK_AS_ADVANCED(BUILD_SIFT) OPTION(BUILD_UNIFIED "Build Backend-Independent ArrayFire API" ON) @@ -91,17 +87,18 @@ IF(BUILD_GRAPHICS) ENDIF(BUILD_GRAPHICS) -IF(BUILD_NONFREE) - MESSAGE(WARNING "Building With NONFREE ON requires the following patents") - SET(BUILD_SIFT ON) -ENDIF(BUILD_NONFREE) +IF(${BUILD_NONFREE}) + MESSAGE(WARNING "Building With NONFREE ON requires the following patents") + SET(BUILD_NONFREE_SIFT ON CACHE BOOL "Build ArrayFire with SIFT") + MARK_AS_ADVANCED(BUILD_NONFREE_SIFT) +ELSE(${BUILD_NONFREE}) + UNSET(BUILD_NONFREE_SIFT CACHE) # BUILD_NONFREE_SIFT cannot be built without BUILD_NONFREE +ENDIF(${BUILD_NONFREE}) -IF(BUILD_SIFT) - ADD_DEFINITIONS(-DAF_BUILD_SIFT) +IF(${BUILD_NONFREE_SIFT}) + ADD_DEFINITIONS(-DAF_BUILD_NONFREE_SIFT) - IF (NOT BUILD_NONFREE) - MESSAGE(WARNING "Building with SIFT requires the following patents") - ENDIF() + MESSAGE(WARNING "Building with SIFT requires the following patents") MESSAGE("Method and apparatus for identifying scale invariant features" "in an image and use of same for locating an object in an image,\" David" @@ -110,7 +107,7 @@ IF(BUILD_SIFT) "further details, contact David Lowe (lowe@cs.ubc.ca) or the" "University-Industry Liaison Office of the University of British" "Columbia.") -ENDIF(BUILD_SIFT) +ENDIF(${BUILD_NONFREE_SIFT}) INCLUDE_DIRECTORIES( "${CMAKE_CURRENT_SOURCE_DIR}/include" @@ -154,6 +151,10 @@ ELSE(${UNIX}) #Windows # http://www.kitware.com/blog/home/post/434 SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /Gm-") SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP /Gm-") + + # Builds that contain debug info require /bigobj + SET(CMAKE_CXX_FLAGS_DEBUG 
"${CMAKE_CXX_FLAGS_DEBUG} /bigobj") + SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /bigobj") ENDIF(MSVC) ENDIF() @@ -223,7 +224,7 @@ ENDIF(FORGE_FOUND AND NOT USE_SYSTEM_FORGE) SET(INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include") SET(BACKEND_DIR "src/backend/\${lowerbackend}") CONFIGURE_FILE( - ${CMAKE_CURRENT_SOURCE_DIR}/ArrayFireConfig.cmake.in + ${CMAKE_MODULE_PATH}/ArrayFireConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/ArrayFireConfig.cmake @ONLY) @@ -233,11 +234,11 @@ STRING(REGEX REPLACE "[^/]+" ".." reldir "${AF_INSTALL_CMAKE_DIR}") SET(INCLUDE_DIR "\${CMAKE_CURRENT_LIST_DIR}/${reldir}/include") set(BACKEND_DIR) CONFIGURE_FILE( - ${CMAKE_CURRENT_SOURCE_DIR}/ArrayFireConfig.cmake.in + ${CMAKE_MODULE_PATH}/ArrayFireConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/Install/ArrayFireConfig.cmake @ONLY) CONFIGURE_FILE( - ${CMAKE_CURRENT_SOURCE_DIR}/ArrayFireConfigVersion.cmake.in + ${CMAKE_MODULE_PATH}/ArrayFireConfigVersion.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/ArrayFireConfigVersion.cmake @ONLY) INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/Install/ArrayFireConfig.cmake @@ -265,4 +266,4 @@ ENDIF(APPLE) ## # Packaging ## -include(${CMAKE_CURRENT_SOURCE_DIR}/CPack.cmake) +include(${CMAKE_MODULE_PATH}/CPackConfig.cmake) diff --git a/ArrayFireConfig.cmake.in b/CMakeModules/ArrayFireConfig.cmake.in similarity index 100% rename from ArrayFireConfig.cmake.in rename to CMakeModules/ArrayFireConfig.cmake.in diff --git a/ArrayFireConfigVersion.cmake.in b/CMakeModules/ArrayFireConfigVersion.cmake.in similarity index 100% rename from ArrayFireConfigVersion.cmake.in rename to CMakeModules/ArrayFireConfigVersion.cmake.in diff --git a/CPack.cmake b/CMakeModules/CPackConfig.cmake similarity index 98% rename from CPack.cmake rename to CMakeModules/CPackConfig.cmake index 2e7f1d5a03..de242a99b7 100644 --- a/CPack.cmake +++ b/CMakeModules/CPackConfig.cmake @@ -1,6 +1,6 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8) 
-include("${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/Version.cmake") +INCLUDE("${CMAKE_MODULE_PATH}/Version.cmake") # CPack package generation #SET(CPACK_GENERATOR "TGZ;STGZ") diff --git a/CMakeModules/FindCBLAS.cmake b/CMakeModules/FindCBLAS.cmake index b0cd3bdca0..db1d783e9e 100644 --- a/CMakeModules/FindCBLAS.cmake +++ b/CMakeModules/FindCBLAS.cmake @@ -53,19 +53,48 @@ SET(CBLAS_ROOT_DIR CACHE STRING INCLUDE(CheckTypeSize) CHECK_TYPE_SIZE("void*" SIZE_OF_VOIDP) -SET(CBLAS_LIB_DIR) +IF (NOT INTEL_MKL_ROOT_DIR) + SET(INTEL_MKL_ROOT_DIR $ENV{INTEL_MKL_ROOT}) +ENDIF() -SET(CBLAS_ROOT_DIR "${INTEL_MKL_ROOT_DIR}") +IF(NOT CBLAS_ROOT_DIR) -IF(CBLAS_ROOT_DIR) - IF(INTEL_MKL_ROOT_DIR) - IF ("${SIZE_OF_VOIDP}" EQUAL 8) - SET(CBLAS_LIB_DIR "${INTEL_MKL_ROOT_DIR}/lib/intel64") - ELSE() - SET(CBLAS_LIB_DIR "${INTEL_MKL_ROOT_DIR}/lib/ia32") - ENDIF() + IF (NOT "$ENV{CBLASDIR}" STREQUAL "") # IF(ENV{VAR}) never evaluates true in CMake + SET(CBLAS_ROOT_DIR $ENV{CBLASDIR}) + IF ("${SIZE_OF_VOIDP}" EQUAL 8) + SET(CBLAS_LIB64_DIR "${CBLAS_ROOT_DIR}/lib64") + ELSE() + SET(CBLAS_LIB32_DIR "${CBLAS_ROOT_DIR}/lib") + ENDIF() + ENDIF() + + IF (NOT "$ENV{CBLAS_ROOT_DIR}" STREQUAL "") + SET(CBLAS_ROOT_DIR $ENV{CBLAS_ROOT_DIR}) + IF ("${SIZE_OF_VOIDP}" EQUAL 8) + SET(CBLAS_LIB64_DIR "${CBLAS_ROOT_DIR}/lib64") + ELSE() + SET(CBLAS_LIB32_DIR "${CBLAS_ROOT_DIR}/lib") ENDIF() - SET(CBLAS_INCLUDE_DIR "${INTEL_MKL_ROOT_DIR}/include") + ENDIF() + + IF (INTEL_MKL_ROOT_DIR) + SET(CBLAS_ROOT_DIR ${INTEL_MKL_ROOT_DIR}) + IF(APPLE) + IF ("${SIZE_OF_VOIDP}" EQUAL 8) + SET(CBLAS_LIB64_DIR "${CBLAS_ROOT_DIR}/lib") + ELSE() + SET(CBLAS_LIB32_DIR "${CBLAS_ROOT_DIR}/lib") + ENDIF() + ELSE(APPLE) # Windows and Linux + IF ("${SIZE_OF_VOIDP}" EQUAL 8) + SET(CBLAS_LIB64_DIR "${CBLAS_ROOT_DIR}/lib/intel64") + ELSE() + SET(CBLAS_LIB32_DIR "${CBLAS_ROOT_DIR}/lib/ia32") + ENDIF() + ENDIF(APPLE) + ENDIF() + + SET(CBLAS_INCLUDE_DIR "${CBLAS_ROOT_DIR}/include") ENDIF() # Old CBLAS search @@ -116,14 +145,14 @@ MACRO(CHECK_ALL_LIBRARIES NAMES ${_library} PATHS /usr/local/lib /usr/lib /usr/local/lib64 
/usr/lib64 ENV DYLD_LIBRARY_PATH - "{CBLAS_LIB_DIR}" + "${CBLAS_LIB_DIR}" "${CBLAS_LIB32_DIR}" "${CBLAS_LIB64_DIR}" ) ELSE(APPLE) FIND_LIBRARY(${_prefix}_${_library}_LIBRARY NAMES ${_library} PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV LD_LIBRARY_PATH - "${CBLAS_LIB_DIR}" + "${CBLAS_LIB_DIR}" "${CBLAS_LIB32_DIR}" "${CBLAS_LIB64_DIR}" PATH_SUFFIXES atlas ) IF(NOT ${_prefix}_${library}_LIBRARY) @@ -132,7 +161,7 @@ MACRO(CHECK_ALL_LIBRARIES NAMES ${_library} PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV LD_LIBRARY_PATH - "${CBLAS_LIB_DIR}" + "${CBLAS_LIB_DIR}" "${CBLAS_LIB32_DIR}" "${CBLAS_LIB64_DIR}" PATH_SUFFIXES atlas ) ENDIF(NOT ${_prefix}_${library}_LIBRARY) @@ -194,6 +223,23 @@ MACRO(CHECK_ALL_LIBRARIES ENDIF(NOT _libraries_work) ENDMACRO(CHECK_ALL_LIBRARIES) +# MKL CBLAS library? +IF(NOT CBLAS_LIBRARIES) + CHECK_ALL_LIBRARIES( + CBLAS_LIBRARIES + CBLAS + cblas_dgemm + "" + "mkl_rt" + "mkl_cblas.h" + FALSE, + TRUE) +ENDIF(NOT CBLAS_LIBRARIES) + +IF(CBLAS_LIBRARIES) + SET(MKL_FOUND ON) +ENDIF() + # Apple CBLAS library? 
IF(NOT CBLAS_LIBRARIES) CHECK_ALL_LIBRARIES( diff --git a/CMakeModules/FindFFTW.cmake b/CMakeModules/FindFFTW.cmake index a725f64ecd..3156cec89b 100644 --- a/CMakeModules/FindFFTW.cmake +++ b/CMakeModules/FindFFTW.cmake @@ -24,6 +24,25 @@ IF(NOT FFTW_ROOT AND ENV{FFTWDIR}) SET(FFTW_ROOT $ENV{FFTWDIR}) ENDIF() +IF (NOT INTEL_MKL_ROOT_DIR) + SET(INTEL_MKL_ROOT_DIR $ENV{INTEL_MKL_ROOT}) +ENDIF() + +IF(NOT FFTW_ROOT) + + IF (NOT "$ENV{FFTWDIR}" STREQUAL "") # IF(ENV{VAR}) never evaluates true in CMake + SET(FFTW_ROOT $ENV{FFTWDIR}) + ENDIF() + + IF (NOT "$ENV{FFTW_ROOT_DIR}" STREQUAL "") + SET(FFTW_ROOT $ENV{FFTW_ROOT_DIR}) + ENDIF() + + IF (INTEL_MKL_ROOT_DIR) + SET(FFTW_ROOT ${INTEL_MKL_ROOT_DIR}) + ENDIF() +ENDIF() + # Check if we can use PkgConfig FIND_PACKAGE(PkgConfig) @@ -44,14 +63,14 @@ IF(FFTW_ROOT) #find libs FIND_LIBRARY( FFTW_LIB - NAMES "fftw3" "libfftw3-3" "fftw3-3" + NAMES "fftw3" "libfftw3-3" "fftw3-3" "mkl_rt" PATHS ${FFTW_ROOT} PATH_SUFFIXES "lib" "lib64" NO_DEFAULT_PATH ) FIND_LIBRARY( FFTWF_LIB - NAMES "fftw3f" "libfftw3f-3" "fftw3f-3" + NAMES "fftw3f" "libfftw3f-3" "fftw3f-3" "mkl_rt" PATHS ${FFTW_ROOT} PATH_SUFFIXES "lib" "lib64" NO_DEFAULT_PATH @@ -62,18 +81,18 @@ IF(FFTW_ROOT) FFTW_INCLUDES NAMES "fftw3.h" PATHS ${FFTW_ROOT} - PATH_SUFFIXES "include" + PATH_SUFFIXES "include" "include/fftw" NO_DEFAULT_PATH ) ELSE() FIND_LIBRARY( FFTW_LIB - NAMES "fftw3" + NAMES "fftw3" "mkl_rt" PATHS ${PKG_FFTW_LIBRARY_DIRS} ${LIB_INSTALL_DIR} ) FIND_LIBRARY( FFTWF_LIB - NAMES "fftw3f" + NAMES "fftw3f" "mkl_rt" PATHS ${PKG_FFTW_LIBRARY_DIRS} ${LIB_INSTALL_DIR} ) FIND_PATH( diff --git a/CMakeModules/FindGLEWmx.cmake b/CMakeModules/FindGLEWmx.cmake index b90919eb98..a6da72bbf2 100644 --- a/CMakeModules/FindGLEWmx.cmake +++ b/CMakeModules/FindGLEWmx.cmake @@ -55,7 +55,6 @@ ELSE (WIN32) /sw/lib /opt/local/lib ${GLEW_ROOT_DIR}/lib - NO_DEFAULT_PATH DOC "The GLEWmx library") SET(PX ${CMAKE_STATIC_LIBRARY_PREFIX}) @@ -72,7 +71,6 @@ ELSE (WIN32) /sw/lib /opt/local/lib ${GLEW_ROOT_DIR}/lib - NO_DEFAULT_PATH DOC "The GLEWmx library") UNSET(PX) UNSET(SX) 
diff --git a/CMakeModules/FindLAPACKE.cmake b/CMakeModules/FindLAPACKE.cmake index 3bf8a1f362..0732cfaa83 100644 --- a/CMakeModules/FindLAPACKE.cmake +++ b/CMakeModules/FindLAPACKE.cmake @@ -9,15 +9,33 @@ # LAPACK_INCLUDES ... LAPACKE include directory # -IF(NOT LAPACKE_ROOT AND ENV{LAPACKEDIR}) - SET(LAPACKE_ROOT $ENV{LAPACKEDIR}) +SET(LAPACKE_ROOT_DIR CACHE STRING + "Root directory for custom LAPACK implementation") + +IF (NOT INTEL_MKL_ROOT_DIR) + SET(INTEL_MKL_ROOT_DIR $ENV{INTEL_MKL_ROOT}) +ENDIF() + +IF(NOT LAPACKE_ROOT_DIR) + + IF (NOT "$ENV{LAPACKEDIR}" STREQUAL "") # IF(ENV{VAR}) never evaluates true in CMake + SET(LAPACKE_ROOT_DIR $ENV{LAPACKEDIR}) + ENDIF() + + IF (NOT "$ENV{LAPACKE_ROOT_DIR}" STREQUAL "") + SET(LAPACKE_ROOT_DIR $ENV{LAPACKE_ROOT_DIR}) + ENDIF() + + IF (INTEL_MKL_ROOT_DIR) + SET(LAPACKE_ROOT_DIR ${INTEL_MKL_ROOT_DIR}) + ENDIF() ENDIF() # Check if we can use PkgConfig FIND_PACKAGE(PkgConfig) #Determine from PKG -IF(PKG_CONFIG_FOUND AND NOT LAPACKE_ROOT) +IF(PKG_CONFIG_FOUND AND NOT LAPACKE_ROOT_DIR) PKG_CHECK_MODULES( PC_LAPACKE QUIET "lapacke") ENDIF() @@ -48,40 +66,41 @@ IF(PC_LAPACKE_FOUND) ELSE(PC_LAPACKE_FOUND) - IF(LAPACKE_ROOT) + IF(LAPACKE_ROOT_DIR) #find libs FIND_LIBRARY( LAPACKE_LIB - NAMES "lapacke" "LAPACKE" "liblapacke" - PATHS ${LAPACKE_ROOT} - PATH_SUFFIXES "lib" "lib64" + NAMES "lapacke" "LAPACKE" "liblapacke" "mkl_rt" + PATHS ${LAPACKE_ROOT_DIR} + PATH_SUFFIXES "lib" "lib64" "lib/ia32" "lib/intel64" DOC "LAPACKE Library" NO_DEFAULT_PATH ) FIND_LIBRARY( LAPACK_LIB - NAMES "lapack" "LAPACK" "liblapack" - PATHS ${LAPACKE_ROOT} - PATH_SUFFIXES "lib" "lib64" + NAMES "lapack" "LAPACK" "liblapack" "mkl_rt" + PATHS ${LAPACKE_ROOT_DIR} + PATH_SUFFIXES "lib" "lib64" "lib/ia32" "lib/intel64" DOC "LAPACK Library" NO_DEFAULT_PATH ) FIND_PATH( LAPACKE_INCLUDES - NAMES "lapacke.h" - PATHS ${LAPACKE_ROOT} + NAMES "lapacke.h" "mkl_lapacke.h" + PATHS ${LAPACKE_ROOT_DIR} PATH_SUFFIXES "include" DOC "LAPACKE Include Directory" NO_DEFAULT_PATH ) - ELSE() FIND_LIBRARY( LAPACKE_LIB - NAMES "lapacke" "liblapacke" 
"openblas" + NAMES "lapacke" "liblapacke" "openblas" "mkl_rt" PATHS ${PC_LAPACKE_LIBRARY_DIRS} ${LIB_INSTALL_DIR} + /opt/intel/mkl/lib/ia32 + /opt/intel/mkl/lib/intel64 /usr/lib64 /usr/lib /usr/local/lib64 @@ -92,10 +111,12 @@ ELSE(PC_LAPACKE_FOUND) ) FIND_LIBRARY( LAPACK_LIB - NAMES "lapack" "liblapack" "openblas" + NAMES "lapack" "liblapack" "openblas" "mkl_rt" PATHS ${PC_LAPACKE_LIBRARY_DIRS} ${LIB_INSTALL_DIR} + /opt/intel/mkl/lib/ia32 + /opt/intel/mkl/lib/intel64 /usr/lib64 /usr/lib /usr/local/lib64 @@ -106,21 +127,26 @@ ELSE(PC_LAPACKE_FOUND) ) FIND_PATH( LAPACKE_INCLUDES - NAMES "lapacke.h" + NAMES "lapacke.h" "mkl_lapacke.h" PATHS ${PC_LAPACKE_INCLUDE_DIRS} ${INCLUDE_INSTALL_DIR} + /opt/intel/mkl/include /usr/include /usr/local/include /sw/include /opt/local/include DOC "LAPACKE Include Directory" ) - ENDIF(LAPACKE_ROOT) + ENDIF(LAPACKE_ROOT_DIR) ENDIF(PC_LAPACKE_FOUND) -SET(LAPACK_LIBRARIES ${LAPACKE_LIB} ${LAPACK_LIB}) -SET(LAPACK_INCLUDE_DIR ${LAPACKE_INCLUDES}) +IF(LAPACKE_LIB AND LAPACK_LIB) + SET(LAPACK_LIBRARIES ${LAPACKE_LIB} ${LAPACK_LIB}) +ENDIF() +IF(LAPACKE_INCLUDES) + SET(LAPACK_INCLUDE_DIR ${LAPACKE_INCLUDES}) +ENDIF() INCLUDE(FindPackageHandleStandardArgs) FIND_PACKAGE_HANDLE_STANDARD_ARGS(LAPACK DEFAULT_MSG diff --git a/CMakeModules/Version.cmake b/CMakeModules/Version.cmake index cd5149bd25..8d5b575399 100644 --- a/CMakeModules/Version.cmake +++ b/CMakeModules/Version.cmake @@ -2,8 +2,8 @@ # Make a version file that includes the ArrayFire version and git revision # SET(AF_VERSION_MAJOR "3") -SET(AF_VERSION_MINOR "2") -SET(AF_VERSION_PATCH "2") +SET(AF_VERSION_MINOR "3") +SET(AF_VERSION_PATCH "0") SET(AF_VERSION "${AF_VERSION_MAJOR}.${AF_VERSION_MINOR}.${AF_VERSION_PATCH}") SET(AF_API_VERSION_CURRENT ${AF_VERSION_MAJOR}${AF_VERSION_MINOR}) @@ -32,6 +32,11 @@ EXECUTE_PROCESS( OUTPUT_STRIP_TRAILING_WHITESPACE ) +IF(NOT GIT_COMMIT_HASH) + MESSAGE(STATUS "No git. 
Setting hash to default") + SET(GIT_COMMIT_HASH "default") +ENDIF() + CONFIGURE_FILE( ${CMAKE_MODULE_PATH}/version.h.in ${CMAKE_SOURCE_DIR}/include/af/version.h diff --git a/CMakeModules/build_boost_compute.cmake b/CMakeModules/build_boost_compute.cmake index c0de1cb291..03c20435a8 100644 --- a/CMakeModules/build_boost_compute.cmake +++ b/CMakeModules/build_boost_compute.cmake @@ -1,6 +1,9 @@ -SET(VER 79aa8f9086fdf6ef6db78e889de0273b0eb7bd19) -SET(URL https://github.com/boostorg/compute/archive/${VER}.tar.gz) -SET(MD5 dba3318cbdac912dddce71f2a38ffa43) +# If using a commit, remove the v prefix to VER in URL. +# If using a tag, don't use v in VER +# This is because of how GitHub handles its release tarballs +SET(VER 0.5) +SET(URL https://github.com/boostorg/compute/archive/v${VER}.tar.gz) +SET(MD5 69a52598ac539d3b7f6005a3dd2b6f58) SET(thirdPartyDir "${CMAKE_BINARY_DIR}/third_party") SET(srcDir "${thirdPartyDir}/compute-${VER}") diff --git a/CMakeModules/build_clBLAS.cmake b/CMakeModules/build_clBLAS.cmake index 6cb1ae8aaf..2289c26393 100644 --- a/CMakeModules/build_clBLAS.cmake +++ b/CMakeModules/build_clBLAS.cmake @@ -14,7 +14,7 @@ ENDIF() ExternalProject_Add( clBLAS-ext GIT_REPOSITORY https://github.com/arrayfire/clBLAS.git - GIT_TAG 102c832825e8e4d60ad73ca97e95668463294068 + GIT_TAG af3.3.0 PREFIX "${prefix}" INSTALL_DIR "${prefix}" UPDATE_COMMAND "" diff --git a/CMakeModules/build_clFFT.cmake b/CMakeModules/build_clFFT.cmake index e1dbb3fe1c..2ab9ccc1ea 100644 --- a/CMakeModules/build_clFFT.cmake +++ b/CMakeModules/build_clFFT.cmake @@ -14,7 +14,7 @@ ENDIF() ExternalProject_Add( clFFT-ext GIT_REPOSITORY https://github.com/arrayfire/clFFT.git - GIT_TAG 1597f0f35a644789c7ad77efe79014236cca2fab + GIT_TAG af3.3.0 PREFIX "${prefix}" INSTALL_DIR "${prefix}" UPDATE_COMMAND "" diff --git a/CMakeModules/osx_install/OSXInstaller.cmake b/CMakeModules/osx_install/OSXInstaller.cmake index dc3a8b2491..b2514f8e2a 100644 --- a/CMakeModules/osx_install/OSXInstaller.cmake +++ 
b/CMakeModules/osx_install/OSXInstaller.cmake @@ -8,8 +8,75 @@ SET(BIN2CPP_PROGRAM "bin2cpp") SET(OSX_INSTALL_DIR ${CMAKE_MODULE_PATH}/osx_install) +################################################################################ +## Create Directory Structure +################################################################################ +SET(OSX_TEMP "${CMAKE_BINARY_DIR}/osx_install_files") + +# Common files - libforge, ArrayFireConfig*.cmake +FILE(GLOB COMMONLIB "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_LIB_DIR}/libforge*.dylib") +FILE(GLOB COMMONCMAKE "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_CMAKE_DIR}/ArrayFireConfig*.cmake") + +ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_COMMON) +FOREACH(SRC ${COMMONLIB} ${COMMONCMAKE}) + FILE(RELATIVE_PATH SRC_REL ${CMAKE_INSTALL_PREFIX} ${SRC}) + ADD_CUSTOM_COMMAND(TARGET OSX_INSTALL_SETUP_COMMON PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${SRC} "${OSX_TEMP}/common/${SRC_REL}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Copying Common files to temporary OSX Install Dir" + ) +ENDFOREACH() + +# Backends - CPU, CUDA, OpenCL, Unified +MACRO(OSX_INSTALL_SETUP BACKEND LIB) + FILE(GLOB ${BACKEND}LIB "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_LIB_DIR}/lib${LIB}*.dylib") + FILE(GLOB ${BACKEND}CMAKE "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_CMAKE_DIR}/ArrayFire${BACKEND}*.cmake") + + ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_${BACKEND}) + FOREACH(SRC ${${BACKEND}LIB} ${${BACKEND}CMAKE}) + FILE(RELATIVE_PATH SRC_REL ${CMAKE_INSTALL_PREFIX} ${SRC}) + ADD_CUSTOM_COMMAND(TARGET OSX_INSTALL_SETUP_${BACKEND} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${SRC} "${OSX_TEMP}/${BACKEND}/${SRC_REL}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Copying ${BACKEND} files to temporary OSX Install Dir" + ) + ENDFOREACH() +ENDMACRO(OSX_INSTALL_SETUP) + +OSX_INSTALL_SETUP(CPU afcpu) +OSX_INSTALL_SETUP(CUDA afcuda) +OSX_INSTALL_SETUP(OpenCL afopencl) +OSX_INSTALL_SETUP(Unified af) + +# Headers +ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_INCLUDE + COMMAND 
${CMAKE_COMMAND} -E copy_directory + ${CMAKE_INSTALL_PREFIX}/include "${OSX_TEMP}/include" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Copying header files to temporary OSX Install Dir" + ) + +# Examples +ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_EXAMPLES + COMMAND ${CMAKE_COMMAND} -E copy_directory + "${CMAKE_INSTALL_PREFIX}/share/ArrayFire/examples" "${OSX_TEMP}/examples" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Copying examples files to temporary OSX Install Dir" + ) + +# Documentation +ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_DOC + COMMAND ${CMAKE_COMMAND} -E copy_directory + "${CMAKE_INSTALL_PREFIX}/share/ArrayFire/doc" "${OSX_TEMP}/doc" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Copying documentation files to temporary OSX Install Dir" + ) +################################################################################ + FUNCTION(PKG_BUILD) - CMAKE_PARSE_ARGUMENTS(ARGS "" "INSTALL_LOCATION;IDENTIFIER;PATH_TO_FILES;PKG_NAME;TARGETS;SCRIPT_DIR" "FILTERS" ${ARGN}) + CMAKE_PARSE_ARGUMENTS(ARGS "" "DEPENDS;INSTALL_LOCATION;IDENTIFIER;PATH_TO_FILES;PKG_NAME;TARGETS;SCRIPT_DIR" "FILTERS" ${ARGN}) FOREACH(filter ${ARGS_FILTERS}) LIST(APPEND FILTER_LIST --filter ${filter}) @@ -70,50 +137,69 @@ ENDFUNCTION(PRODUCT_BUILD) PKG_BUILD( PKG_NAME ArrayFireCPU - DEPENDS afcpu + DEPENDS OSX_INSTALL_SETUP_CPU TARGETS cpu_package - INSTALL_LOCATION /usr/local/lib + INSTALL_LOCATION /usr/local SCRIPT_DIR ${OSX_INSTALL_DIR}/cpu_scripts IDENTIFIER com.arrayfire.pkg.arrayfire.cpu.lib - PATH_TO_FILES package/lib + PATH_TO_FILES ${OSX_TEMP}/CPU FILTERS opencl cuda unified) PKG_BUILD( PKG_NAME ArrayFireCUDA - DEPENDS afcuda + DEPENDS OSX_INSTALL_SETUP_CUDA TARGETS cuda_package - INSTALL_LOCATION /usr/local/lib + INSTALL_LOCATION /usr/local SCRIPT_DIR ${OSX_INSTALL_DIR}/cuda_scripts IDENTIFIER com.arrayfire.pkg.arrayfire.cuda.lib - PATH_TO_FILES package/lib + PATH_TO_FILES ${OSX_TEMP}/CUDA FILTERS cpu opencl unified) PKG_BUILD( PKG_NAME ArrayFireOPENCL - DEPENDS afopencl 
+ DEPENDS OSX_INSTALL_SETUP_OpenCL TARGETS opencl_package - INSTALL_LOCATION /usr/local/lib + INSTALL_LOCATION /usr/local IDENTIFIER com.arrayfire.pkg.arrayfire.opencl.lib - PATH_TO_FILES package/lib + PATH_TO_FILES ${OSX_TEMP}/OpenCL FILTERS cpu cuda unified) PKG_BUILD( PKG_NAME ArrayFireUNIFIED - DEPENDS af + DEPENDS OSX_INSTALL_SETUP_Unified TARGETS unified_package - INSTALL_LOCATION /usr/local/lib + INSTALL_LOCATION /usr/local IDENTIFIER com.arrayfire.pkg.arrayfire.unified.lib - PATH_TO_FILES package/lib + PATH_TO_FILES ${OSX_TEMP}/Unified FILTERS cpu cuda opencl) +PKG_BUILD( PKG_NAME ArrayFireCommon + DEPENDS OSX_INSTALL_SETUP_COMMON + TARGETS common_package + INSTALL_LOCATION /usr/local + IDENTIFIER com.arrayfire.pkg.arrayfire.libcommon + PATH_TO_FILES ${OSX_TEMP}/common + FILTERS cpu cuda opencl unified) + PKG_BUILD( PKG_NAME ArrayFireHeaders + DEPENDS OSX_INSTALL_SETUP_INCLUDE TARGETS header_package INSTALL_LOCATION /usr/local/include IDENTIFIER com.arrayfire.pkg.arrayfire.inc - PATH_TO_FILES package/include) - -PKG_BUILD( PKG_NAME ArrayFireExtra - TARGETS extra_package - INSTALL_LOCATION /usr/local/share - IDENTIFIER com.arrayfire.pkg.arrayfire.extra - PATH_TO_FILES package/share) - -PRODUCT_BUILD(DEPENDS ${cpu_package} ${cuda_package} ${opencl_package} ${unified_package} ${header_package} ${extra_package}) + PATH_TO_FILES ${OSX_TEMP}/include) + +PKG_BUILD( PKG_NAME ArrayFireExamples + DEPENDS OSX_INSTALL_SETUP_EXAMPLES + TARGETS examples_package + INSTALL_LOCATION /usr/local/share/ArrayFire/examples + IDENTIFIER com.arrayfire.pkg.arrayfire.examples + PATH_TO_FILES ${OSX_TEMP}/examples + FILTERS cmake) + +PKG_BUILD( PKG_NAME ArrayFireDoc + DEPENDS OSX_INSTALL_SETUP_DOC + TARGETS doc_package + INSTALL_LOCATION /usr/local/share/ArrayFire/doc + IDENTIFIER com.arrayfire.pkg.arrayfire.doc + PATH_TO_FILES ${OSX_TEMP}/doc + FILTERS cmake) + +PRODUCT_BUILD(DEPENDS ${cpu_package} ${cuda_package} ${opencl_package} ${unified_package} ${common_package} 
${header_package} ${examples_package} ${doc_package}) diff --git a/CMakeModules/osx_install/distribution.dist b/CMakeModules/osx_install/distribution.dist index 3dc82379c9..b476bf013f 100644 --- a/CMakeModules/osx_install/distribution.dist +++ b/CMakeModules/osx_install/distribution.dist @@ -17,7 +17,9 @@ ArrayFireOPENCL.pkg ArrayFireUNIFIED.pkg ArrayFireHeaders.pkg - ArrayFireExtra.pkg + ArrayFireExamples.pkg + ArrayFireDoc.pkg + ArrayFireCommon.pkg @@ -25,38 +27,51 @@ + - + + + + + - - + + + + + diff --git a/CMakeModules/osx_install/readme.html b/CMakeModules/osx_install/readme.html index 41d4ab8cf0..482b7add7e 100644 --- a/CMakeModules/osx_install/readme.html +++ b/CMakeModules/osx_install/readme.html @@ -5,18 +5,9 @@

Install Directories

  • Libraries will be installed in /usr/local/lib
  • Headers will be installed in /usr/local/include
  • -
  • Docs and other files will be installed in /usr/local/share
  • -
- -

Major Updates

-
    -
  • ArrayFire is now open source
  • -
  • Major changes to the visualization library
  • -
  • Introducing handle based C API
  • -
  • New backend: CPU fallback available for systems without GPUs
  • -
  • Dense linear algebra functions available for all backends
  • -
  • Support for 64 bit integers
  • +
  • Examples, documentation and CMake config files will be installed in /usr/local/share
+

For complete list of updates, visit ArrayFire Release Notes

diff --git a/README.md b/README.md index 695adbed03..f43b9fd098 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,8 @@ ArrayFire binary installers can be downloaded at the [ArrayFire Downloads](http: ### Build Status | | Linux x86 | Linux armv7l | Linux aarch64 | Windows | OSX | |:-------:|:---------:|:------------:|:-------------:|:-------:|:---:| -| Build | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux/devel)](http://ci.arrayfire.org/job/arrayfire-linux/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows/devel)](http://ci.arrayfire.org/job/arrayfire-windows/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx/devel)](http://ci.arrayfire.org/job/arrayfire-osx/branch/devel/) | -| Test | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux-test/devel)](http://ci.arrayfire.org/job/arrayfire-linux-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1-test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1-test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows-test/devel)](http://ci.arrayfire.org/job/arrayfire-windows-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx-test/devel)](http://ci.arrayfire.org/job/arrayfire-osx-test/branch/devel/) | +| Build | [![Build 
Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux/build/devel)](http://ci.arrayfire.org/job/arrayfire-linux/job/build/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1/build/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1/job/build/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1/build/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1/job/build/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows/build/devel)](http://ci.arrayfire.org/job/arrayfire-windows/job/build/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx/build/devel)](http://ci.arrayfire.org/job/arrayfire-osx/job/build/branch/devel/) | +| Test | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux/test/devel)](http://ci.arrayfire.org/job/arrayfire-linux/job/test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1/test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1/job/test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1/test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1/job/test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows/test/devel)](http://ci.arrayfire.org/job/arrayfire-windows/job/test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx/test/devel)](http://ci.arrayfire.org/job/arrayfire-osx/job/test/branch/devel/) | Test coverage: [![Coverage Status](https://coveralls.io/repos/arrayfire/arrayfire/badge.svg?branch=HEAD)](https://coveralls.io/r/arrayfire/arrayfire?branch=HEAD) diff --git a/assets b/assets index 8030a5c626..f16f8bf74f 160000 --- a/assets +++ b/assets @@ -1 +1 @@ -Subproject commit 8030a5c626777a5b3f46b319dd4d1723eca4b0f9 +Subproject commit 
f16f8bf74fe4a255db05884cfff8f5cb0e6e8e09 diff --git a/docs/arrayfire.css b/docs/arrayfire.css index 75dba64e3a..e4fe2860be 100644 --- a/docs/arrayfire.css +++ b/docs/arrayfire.css @@ -52,12 +52,6 @@ a.codeRef, a.codeRef:visited, a.lineRef, a.lineRef:visited color : #4665A2; } -@font-face -{ - font-family : prototype; - src : url('Prototype.ttf'); -} - /*image and image groups*/ div.image_group { @@ -96,7 +90,6 @@ div.support * #under_logo { - font-family : prototype; font-size : 2em; max-width : 25px; color : #000000; @@ -104,7 +97,6 @@ div.support * #projectbrief { - font-family : prototype; color : #555555 } @@ -121,7 +113,6 @@ div.support * #projectname { - font-family : prototype; font-size : 3em; max-width : 25px; color : #555555 diff --git a/docs/details/arith.dox b/docs/details/arith.dox index 50f82aafed..a75c3a2cc4 100644 --- a/docs/details/arith.dox +++ b/docs/details/arith.dox @@ -448,8 +448,6 @@ Raise an array to a power Exponential of input -\copydoc arith_real_only - \defgroup arith_func_expm1 expm1 diff --git a/docs/details/backend.dox b/docs/details/backend.dox index 4d9cdf6f53..893567b696 100644 --- a/docs/details/backend.dox +++ b/docs/details/backend.dox @@ -71,5 +71,23 @@ The return value specifies which backend the array was created on. ======================================================================= +\defgroup unified_func_getactivebackend getActiveBackend + +\brief Gets the backend enum for the active backend + +\ingroup unified_func +\ingroup arrayfire_func + +======================================================================= + +\defgroup unified_func_getdeviceid getDeviceId + +\brief Gets the id of the device an array was created on. 
+ +\ingroup unified_func +\ingroup arrayfire_func + +======================================================================= + @} */ diff --git a/docs/details/device.dox b/docs/details/device.dox index 230199d583..1aa43e7465 100644 --- a/docs/details/device.dox +++ b/docs/details/device.dox @@ -2,6 +2,22 @@ \addtogroup arrayfire_func @{ +\defgroup device_func_prop deviceInfo +\ingroup device_mat + +\brief Gets the information about device and platform as strings + +\param d_name pointer to a user-allocated char array. Recommended minimum size is 64. +The name of the device is stored in this array. +\param d_platform pointer to a user-allocated char array. Recommended minimum size is 10. +The platform information is stored in this array. +\param d_toolkit pointer to a user-allocated char array. Recommended minimum size is 64. +The toolkit information is stored in this array. +\param d_compute pointer to a user-allocated char array. Recommended minimum size is 10. +The compute version of the device is stored in this array. + +=============================================================================== + \defgroup device_func_count getDeviceCount \ingroup device_mat @@ -62,6 +78,16 @@ allocation =============================================================================== +\defgroup device_func_free free +\ingroup device_mat + +\brief Free device memory allocated by ArrayFire's memory manager + +These calls free the device memory. These functions need to be called on +pointers allocated using alloc function. + +=============================================================================== + \defgroup device_func_pinned pinned \ingroup device_mat @@ -73,12 +99,39 @@ a limited resource. 
=============================================================================== -\defgroup device_func_free free +\defgroup device_func_free_pinned freePinned \ingroup device_mat -\brief Free device memory allocated by ArrayFire's memory manager +\brief Free pinned memory allocated by ArrayFire's memory manager + +These calls free the pinned memory on host. These functions need to be called on +pointers allocated using pinned function. + +=============================================================================== + +\defgroup device_func_alloc_host allocHost +\ingroup device_mat + +\brief Allocate memory on host + +This function is used for allocating regular memory on host. This is useful +where the compiler version of ArrayFire library is different from the +executable's compiler version. + +It does not use ArrayFire's memory manager. + +=============================================================================== + +\defgroup device_func_free_host freeHost +\ingroup device_mat + +\brief Free memory allocated on host internally by ArrayFire + +This function is used for freeing memory on host that was allocated within +ArrayFire. This is useful where the compiler version of ArrayFire library is +different from the executable's compiler version. -These calls free the device or pinned memory. These functions need to be called +It does not use ArrayFire's memory manager. 
=============================================================================== diff --git a/docs/details/image.dox b/docs/details/image.dox index 234f4f72e9..ef6d12a4f0 100644 --- a/docs/details/image.dox +++ b/docs/details/image.dox @@ -430,6 +430,12 @@ Save an array to disk as an image Supported formats include JPG, PNG, PPM and other formats supported by freeimage +\defgroup imageio_func_available isImageIoAvailable +\ingroup imageio_mat + +Returns true if ArrayFire was compiled with ImageIO (FreeImage) support + + \defgroup imagemem_func_load loadImageMem \ingroup imageio_mat @@ -501,10 +507,12 @@ grad(dx, dy, in); Resize an input image -Resizing an input image can be done using either \ref AF_INTERP_NEAREST or -\ref AF_INTERP_BILINEAR interpolations. Nearest interpolation will pick the -nearest value to the location, whereas bilinear interpolation will do a -weighted interpolation for calculate the new size. +Resizing an input image can be done using either \ref AF_INTERP_NEAREST, +\ref AF_INTERP_BILINEAR or \ref AF_INTERP_LOWER, interpolations. Nearest +interpolation will pick the nearest value to the location, bilinear +interpolation will do a weighted interpolation for calculate the new size +and lower interpolation is similar to the nearest, except it will use the +floor function to get the lower neighbor. This function does not differentiate between images and data. As long as the array is defined and the output dimensions are not 0, it will resize any @@ -556,10 +564,10 @@ Rotate an input image The angle theta is in radians. -Rotating an input image can be done using either \ref AF_INTERP_NEAREST or -\ref AF_INTERP_BILINEAR interpolations. Nearest interpolation will pick the -nearest value to the location, whereas bilinear interpolation will do a -weighted interpolation for calculate the new size. +Rotating an input image can be done using \ref AF_INTERP_NEAREST, +\ref AF_INTERP_BILINEAR or \ref AF_INTERP_LOWER interpolations. 
Nearest +interpolation will pick the nearest value to the location, whereas bilinear +interpolation will do a weighted interpolation to calculate the new size. This function does not differentiate between images and data. As long as the array is defined, it will rotate any type or size of array. @@ -659,26 +667,51 @@ Skew is a special case of the \ref af::transform function. Transform an input image -The transform function uses an affine transform matrix to tranform an input +The transform function uses an affine or perspective transform matrix to transform an input image into a new one. -The transform matrix \p tf is a 3x2 matrix of type float. The matrix operation -is applied to each location (x, y) that is then transformed to (x', y') of the +If matrix \p tf is a 3x2 matrix, an affine transformation will be performed. The matrix +operation is applied to each location (x, y) that is then transformed to (x', y') of the new array. Hence the transformation is an element-wise operation. -The operation is as below: -tf = [r00 r10 - r01 r11 +The operation is as below:\n +tf = [r00 r10\n + r01 r11\n t0 t1] -x' = x * r00 + y * r01 + t0; +x' = x * r00 + y * r01 + t0;\n y' = x * r10 + y * r11 + t1; -Interpolation types of \ref AF_INTERP_NEAREST and \ref AF_INTERP_BILINEAR are allowed. +If matrix \p tf is a 3x3 matrix, a perspective transformation will be performed. + +The operation is as below:\n +tf = [r00 r10 r20\n + r01 r11 r21\n + t0 t1 t2] + +x' = (x * r00 + y * r01 + t0) / (x * r20 + y * r21 + t2);\n +y' = (x * r10 + y * r11 + t1) / (x * r20 + y * r21 + t2); + +The transformation matrix \p tf should always be of type f32. + +Interpolation types of \ref AF_INTERP_NEAREST, \ref AF_INTERP_BILINEAR and +\ref AF_INTERP_LOWER are allowed. Affine transforms can be used for various purposes. \ref af::translate, \ref af::scale and \ref af::skew are specializations of the transform function.
+ +\defgroup transform_func_coordinates transformcoordinates +\ingroup transform_mat + +Transform input coordinates + +The transform function uses a perspective transform matrix to transform input +coordinates (given as two dimensions) into a coordinates matrix. + +The output is a 4x2 matrix, indicating the coordinates of the 4 bidimensional +transformed points. + ======================================================================= \defgroup image_func_sat SAT diff --git a/docs/details/internal.dox b/docs/details/internal.dox new file mode 100644 index 0000000000..5ac06422ca --- /dev/null +++ b/docs/details/internal.dox @@ -0,0 +1,29 @@ +/** +\addtogroup internal_func +@{ + +\defgroup internal_func_create createStridedArray + +Create an array with specified strides and offset. + + +\defgroup internal_func_strides getStrides + +Get strides of underlying data. + + +\defgroup internal_func_offset getOffset + +Get offset of the underlying data. + + +\defgroup internal_func_linear isLinear + +Check if all elements in array are contiguous. + +\defgroup internal_func_owner isOwner + +Check if underlying data is owned by the current array. + +@} +*/ diff --git a/docs/details/lapack.dox b/docs/details/lapack.dox index c0d8aae5b9..522dbe544f 100644 --- a/docs/details/lapack.dox +++ b/docs/details/lapack.dox @@ -287,5 +287,13 @@ This function can return the norm using various metrics based on the type paramt =============================================================================== +\defgroup lapack_helper_func_available isLAPACKAvailable + +\ingroup lapack_helper + +\brief Returns true if ArrayFire is compiled with LAPACK support + +=============================================================================== + @} */ diff --git a/docs/details/vision.dox b/docs/details/vision.dox index 1d9d6b99ac..99582c3729 100644 --- a/docs/details/vision.dox +++ b/docs/details/vision.dox @@ -166,9 +166,12 @@ from the other and returns the result.
\brief Template Matching -Template matching is an image processing technique to find small patches of an image which -match a given template image. A more in depth discussion on the topic can be found -[here](http://en.wikipedia.org/wiki/Template_matching). +Template matching is an image processing technique to find small patches of an image which match a given template image. Currently, this function doesn't support the following three metrics yet. +- \ref AF_NCC +- \ref AF_ZNCC +- \ref AF_SHD + +A more in depth discussion about template matching can be found [here](http://en.wikipedia.org/wiki/Template_matching). ======================================================================= diff --git a/docs/pages/INSTALL.md b/docs/pages/INSTALL.md index 3565889571..d31affaefe 100644 --- a/docs/pages/INSTALL.md +++ b/docs/pages/INSTALL.md @@ -108,13 +108,14 @@ First install the prerequisite packages: # Prerequisite packages: sudo apt-get install libfreeimage-dev libatlas3gf-base libfftw3-dev cmake -Ubuntu 14.04 will not have the libglfw3-dev package in its repositories. You can either build the library from source (following the instructions listed) or install the library from a PPA as follows: - -``` -sudo apt-add repository ppa:keithw/glfw3 -sudo apt-get update -sudo apt-get install glfw3 -``` +Ubuntu 14.04 will not have the libglfw3-dev package in its repositories. You can either build the +library from source (following the +[instructions listed here](https://github.com/arrayfire/arrayfire/wiki/GLFW-for-ArrayFire)) or +install the library from a PPA as follows: + + sudo apt-add-repository ppa:keithw/glfw3 + sudo apt-get update + sudo apt-get install glfw3 After this point, the installation should proceed identically to Ubuntu 14.10 or newer. 
diff --git a/docs/pages/README.md b/docs/pages/README.md index 302690242e..8a395a70af 100644 --- a/docs/pages/README.md +++ b/docs/pages/README.md @@ -76,7 +76,7 @@ Each ArrayFire installation comes with: ArrayFire supports batched operations on N-dimensional arrays. Batch operations in ArrayFire are run in parallel ensuring an optimal usage of your CUDA or OpenCL device. -You can get the best performance out of ArrayFire using [vectorization techniques](). +You can get the best performance out of ArrayFire using [vectorization techniques](\ref vectorization). ArrayFire can also execute loop iterations in parallel with [the gfor function](\ref gfor). @@ -92,8 +92,8 @@ Read more about how [ArrayFire JIT](http://arrayfire.com/performance-of-arrayfir ## Simple Example -Here's a live example to let you see ArrayFire code. You create [arrays](\ref -construct_mat) which reside on CUDA or OpenCL devices. Then you can use +Here's a live example to let you see ArrayFire code. You create [arrays](\ref construct_mat) +which reside on CUDA or OpenCL devices. Then you can use [ArrayFire functions](modules.htm) on those [arrays](\ref construct_mat). ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp} diff --git a/docs/pages/configuring_arrayfire_environment.md b/docs/pages/configuring_arrayfire_environment.md index 054068e224..d554046f1e 100644 --- a/docs/pages/configuring_arrayfire_environment.md +++ b/docs/pages/configuring_arrayfire_environment.md @@ -18,6 +18,16 @@ This is the path with ArrayFire gets installed, ie. the includes and libs are present in this directory. You can use this variable to add include paths and libraries to your projects. +AF_PRINT_ERRORS {#af_print_errors} +------------------------------------------------------------------------------- + +When AF_PRINT_ERRORS is set to 1, the exceptions thrown are more verbose and +detailed. This helps in locating the exact failure. 
+ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +AF_PRINT_ERRORS=1 ./myprogram +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + AF_CUDA_DEFAULT_DEVICE {#af_cuda_default_device} ------------------------------------------------------------------------------- @@ -44,25 +54,116 @@ AF_OPENCL_DEFAULT_DEVICE=1 ./myprogram_opencl Note: af::setDevice call in the source code will take precedence over this variable. +AF_OPENCL_DEFAULT_DEVICE_TYPE {#af_opencl_default_device_type} +------------------------------------------------------------------------------- + +Use this variable to set the default OpenCL device type. Valid values for this +variable are: CPU, GPU, ACC (Accelerators). + +When set, the first device of the specified type is chosen as default device. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +AF_OPENCL_DEFAULT_DEVICE_TYPE=CPU ./myprogram_opencl +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Note: `AF_OPENCL_DEFAULT_DEVICE` and af::setDevice takes precedence over this variable. + +AF_OPENCL_DEVICE_TYPE {#af_opencl_device_type} +------------------------------------------------------------------------------- + +Use this variable to only choose OpenCL devices of specified type. Valid values for this +variable are: + +- ALL: All OpenCL devices. (Default behavior). +- CPU: CPU devices only. +- GPU: GPU devices only. +- ACC: Accelerator devices only. + +When set, the remaining OpenCL device types are ignored by the OpenCL backend. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +AF_OPENCL_DEVICE_TYPE=CPU ./myprogram_opencl +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +AF_OPENCL_CPU_OFFLOAD {#af_opencl_cpu_offload} +------------------------------------------------------------------------------- + +When this variable is set to 1, and the selected OpenCL device has unified +memory with the host (ie. 
`CL_DEVICE_HOST_UNIFIED_MEMORY` is true for device), +then certain functions are offloaded to run on the CPU using mapped buffers. + +This takes advantage of fast libraries such as MKL while spending no time +copying memory from device to host. The device memory is mapped to a host +pointer which can be used in the offloaded functions. + +AF_OPENCL_SHOW_BUILD_INFO {#af_opencl_show_build_info} +------------------------------------------------------------------------------- + +This variable is useful when debugging OpenCL kernel compilation failures. When +this variable is set to 1, and an error occurs during an OpenCL kernel +compilation, then the log and kernel are printed to the screen. + AF_DISABLE_GRAPHICS {#af_disable_graphics} ------------------------------------------------------------------------------- -Setting this variable will disable window creation when graphics functions are -being called. Simply setting this variable will disable functionality, any -value will suffice. Disabling window creation will disable all other graphics -calls at runtime as well. +Setting this variable to 1 will disable window creation when graphics +functions are being called. Disabling window creation will disable all other +graphics calls at runtime as well. This is a useful enviornment variable when running code on servers and systems without displays. When graphics calls are run on such machines, they will print warning about window creation failing. To suppress those calls, set this variable. -AF_PRINT_ERRORS {#af_print_errors} +AF_SYNCHRONOUS_CALLS {#af_synchronous_calls} ------------------------------------------------------------------------------- -When AF_PRINT_ERRORS is set to 1, the exceptions thrown are more verbose and -detailed. This helps in locating the exact failure. +When this environment variable is set to 1, ArrayFire will execute all +functions synchronously.
+ +AF_SHOW_LOAD_PATH {#af_show_load_path} +------------------------------------------------------------------------------- + +When using the Unified backend, if this variable is set to 1, it will show the +path where the ArrayFire backend libraries are loaded from. + +If the libraries are loaded from system paths, such as PATH or LD_LIBRARY_PATH +etc, then it will print "system path". If the libraries are loaded from other +paths, then those paths are shown in full. + +AF_MEM_DEBUG {#af_mem_debug} +------------------------------------------------------------------------------- + +When AF_MEM_DEBUG is set to 1 (or anything not equal to 0), the caching mechanism in the memory manager is disabled. +The device buffers are allocated using native functions as needed and freed when going out of scope. + +When the environment variable is not set, it is treated to be zero. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -AF_PRINT_ERRORS=1 ./myprogram_opencl +AF_MEM_DEBUG=1 ./myprogram ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +AF_MAX_BUFFERS {#af_max_buffers} +------------------------------------------------------------------------- + +When AF_MAX_BUFFERS is set, this environment variable specifies the maximum number of buffers allocated before garbage collection kicks in. + +Please note that the total number of buffers that can exist simultaneously can be higher than this number. This variable tells the garbage collector that it should free any available buffers immediately if the threshold is reached. + +When not set, the default value is 1000. + +AF_OPENCL_MAX_JIT_LEN {#af_opencl_max_jit_len} +------------------------------------------------------------------------------- + +When set, this environment variable specifies the maximum length of the OpenCL JIT tree after which evaluation is forced. The default value for this is 16 for AMD devices and 20 otherwise.
+ +AF_CUDA_MAX_JIT_LEN {#af_cuda_max_jit_len} +------------------------------------------------------------------------------- + +When set, this environment variable specifies the maximum length of the CUDA JIT tree after which evaluation is forced. The default value for this is 20. + +AF_CPU_MAX_JIT_LEN {#af_cpu_max_jit_len} +------------------------------------------------------------------------------- + +When set, this environment variable specifies the maximum length of the CPU JIT tree after which evaluation is forced. The default value for this is 20. diff --git a/docs/pages/release_notes.md b/docs/pages/release_notes.md index 4f13cc7434..738d2b0a4f 100644 --- a/docs/pages/release_notes.md +++ b/docs/pages/release_notes.md @@ -1,6 +1,128 @@ Release Notes {#releasenotes} ============== +v3.3.0 +============== + +Major Updates +------------- + +* CPU backend supports asynchronous execution. +* Performance improvements to OpenCL BLAS and FFT functions. +* Improved performance of memory manager. +* Improvements to visualization functions. +* Improved sorted order for OpenCL devices. +* Integration with external OpenCL projects. + +Features +---------- + +* \ref af::getActiveBackend(): Returns the current backend being used. +* [Scatter plot](https://github.com/arrayfire/arrayfire/pull/1116) added to graphics. +* \ref af::transform() now supports perspective transformation matrices. +* \ref af::infoString(): Returns `af::info()` as a string. +* \ref af::printMemInfo(): Print a table showing information about buffer from the memory manager + * The \ref AF_MEM_INFO macro prints numbers and total sizes of all buffers (requires including af/macros.h) +* \ref af::allocHost(): Allocates memory on host. +* \ref af::freeHost(): Frees host side memory allocated by ArrayFire. +* OpenCL functions can now use CPU implementation. + * Currently limited to Unified Memory devices (CPU and On-board Graphics).
+ * Functions: af::matmul() and all [LAPACK](\ref linalg_mat) functions. + * Takes advantage of optimized libraries such as MKL without doing memory copies. + * Use the environment variable `AF_OPENCL_CPU_OFFLOAD=1` to take advantage of this feature. +* Functions specific to OpenCL backend. + * \ref afcl::addDevice(): Adds an external device and context to ArrayFire's device manager. + * \ref afcl::deleteDevice(): Removes an external device and context from ArrayFire's device manager. + * \ref afcl::setDevice(): Sets an external device and context from ArrayFire's device manager. + * \ref afcl::getDeviceType(): Gets the device type of the current device. + * \ref afcl::getPlatform(): Gets the platform of the current device. +* \ref af::createStridedArray() allows [array creation user-defined strides](https://github.com/arrayfire/arrayfire/issues/1177) and device pointer. +* [Expose functions](https://github.com/arrayfire/arrayfire/issues/1131) that provide information + about memory layout of Arrays. + * \ref af::getStrides(): Gets the strides for each dimension of the array. + * \ref af::getOffset(): Gets the offsets for each dimension of the array. + * \ref af::getRawPtr(): Gets raw pointer to the location of the array on device. + * \ref af::isLinear(): Returns true if all elements in the array are contiguous. + * \ref af::isOwner(): Returns true if the array owns the raw pointer, false if it is a sub-array. + * \ref af::getStrides(): Gets the strides of the array. + * \ref af::getStrides(): Gets the strides of the array. +* \ref af::getDeviceId(): Gets the device id on which the array resides. 
+* \ref af::isImageIOAvailable(): Returns true if ArrayFire was compiled with Freeimage enabled +* \ref af::isLAPACKAvailable(): Returns true if ArrayFire was compiled with LAPACK functions enabled + +Bug Fixes +-------------- + +* Fixed [errors when using 3D / 4D arrays](https://github.com/arrayfire/arrayfire/pull/1251) in select and replace +* Fixed [JIT errors on AMD devices](https://github.com/arrayfire/arrayfire/pull/1238) for OpenCL backend. +* Fixed [imageio bugs](https://github.com/arrayfire/arrayfire/pull/1229) for 16 bit images. +* Fixed [bugs when loading and storing images](https://github.com/arrayfire/arrayfire/pull/1228) natively. +* Fixed [bug in FFT for NVIDIA GPUs](https://github.com/arrayfire/arrayfire/issues/615) when using OpenCL backend. +* Fixed [bug when using external context](https://github.com/arrayfire/arrayfire/pull/1241) with OpenCL backend. +* Fixed [memory leak](https://github.com/arrayfire/arrayfire/issues/1269) in \ref af_median_all(). +* Fixed [memory leaks and performance](https://github.com/arrayfire/arrayfire/pull/1274) in graphics functions. +* Fixed [bugs when indexing followed by moddims](https://github.com/arrayfire/arrayfire/issues/1275). +* \ref af_get_revision() now returns actual commit rather than AF_REVISION. +* Fixed [releasing arrays](https://github.com/arrayfire/arrayfire/issues/1282) when using different backends. +* OS X OpenCL: [LAPACK functions](\ref linalg_mat) on CPU devices use OpenCL offload (previously threw errors). +* [Add support for 32-bit integer image types](https://github.com/arrayfire/arrayfire/pull/1287) in Image IO. +* Fixed [set operations for row vectors](https://github.com/arrayfire/arrayfire/issues/1300) +* Fixed [bugs](https://github.com/arrayfire/arrayfire/issues/1243) in \ref af::meanShift() and af::orb(). + +Improvements +-------------- + +* Optionally [offload BLAS and LAPACK](https://github.com/arrayfire/arrayfire/pull/1221) functions to CPU implementations to improve performance. 
+* Performance improvements to the memory manager. +* Error messages are now more detailed. +* Improved sorted order for OpenCL devices. +* JIT heuristics can now be tweaked using environment variables. See + [Environment Variables](\ref configuring_environment) tutorial. +* Add `BUILD_` [options to examples and tests](https://github.com/arrayfire/arrayfire/issues/1286) + to toggle backends when compiling independently. + +Examples +---------- + +* New visualization [example simulating gravity](\ref graphics/gravity_sim.cpp). + +Build +---------- + +* Support for Intel `icc` compiler +* Support to compile with Intel MKL as a BLAS and LAPACK provider +* Tests are now available for building as standalone (like examples) +* Tests can now be built as a single file for each backend +* Better handling of NONFREE build options +* [Searching for GLEW in CMake default paths](https://github.com/arrayfire/arrayfire/pull/1292) +* Fixes for compiling with MKL on OSX. + +Installers +---------- +* Improvements to OSX Installer + * CMake config files are now installed with libraries + * Independent options for installing examples and documentation components + +Deprecations +----------- + +* `af_lock_device_arr` is now deprecated to be removed in v4.0.0. Use \ref af_lock_array() instead. +* `af_unlock_device_arr` is now deprecated to be removed in v4.0.0. use \ref af_unlock_array() instead. + +Documentation +-------------- + +* Fixes to documentation for \ref matchTemplate(). +* Improved documentation for deviceInfo. +* Fixes to documentation for \ref exp(). + +Known Issues +------------ + +* [Solve OpenCL fails on NVIDIA Maxwell devices](https://github.com/arrayfire/arrayfire/issues/1246) + for f32 and c32 when M > N and K % 4 is 1 or 2. 
+ + v3.2.2 ============== diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 4710d1b739..be0f6407be 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -61,13 +61,17 @@ ENDMACRO() # and TARGET_LINK_LIBRARIES(... ${ARRAYFIRE_LIBRARIES}) are needed MACRO(BUILD_ALL FILES BACKEND_NAME BACKEND_LIBRARIES OTHER_LIBRARIES) - FOREACH(FILE ${FILES}) - GET_FILENAME_COMPONENT(EXAMPLE ${FILE} NAME_WE) - GET_FILENAME_COMPONENT(FULL_DIR_NAME ${FILE} PATH) - GET_FILENAME_COMPONENT(DIR_NAME ${FULL_DIR_NAME} NAME) + STRING(TOUPPER ${BACKEND_NAME} BACKEND_NAME_UPPER) + MESSAGE(STATUS "EXAMPLES: ${BACKEND_NAME_UPPER} backend is ${BUILD_${BACKEND_NAME_UPPER}}.") + IF(${BUILD_${BACKEND_NAME_UPPER}}) + FOREACH(FILE ${FILES}) + GET_FILENAME_COMPONENT(EXAMPLE ${FILE} NAME_WE) + GET_FILENAME_COMPONENT(FULL_DIR_NAME ${FILE} PATH) + GET_FILENAME_COMPONENT(DIR_NAME ${FULL_DIR_NAME} NAME) - BUILD_EXAMPLE(${EXAMPLE} ${FILE} ${BACKEND_NAME} "${BACKEND_LIBRARIES}" "${OTHER_LIBRARIES}" ${DIR_NAME}) - ENDFOREACH() + BUILD_EXAMPLE(${EXAMPLE} ${FILE} ${BACKEND_NAME} "${BACKEND_LIBRARIES}" "${OTHER_LIBRARIES}" ${DIR_NAME}) + ENDFOREACH() + ENDIF() ENDMACRO() # Collect the source @@ -76,10 +80,9 @@ ADD_DEFINITIONS("-DASSETS_DIR=\"${ASSETS_DIR}\"") # Next we build each example using every backend. IF(${ArrayFire_CPU_FOUND}) # variable defined by FIND(ArrayFire ...) - MESSAGE(STATUS "EXAMPLES: CPU backend is ON.") + OPTION(BUILD_CPU "Build ArrayFire Examples for CPU backend" ON) BUILD_ALL("${FILES}" cpu ${ArrayFire_CPU_LIBRARIES} "") ELSEIF(TARGET afcpu) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "EXAMPLES: CPU backend is ON.") BUILD_ALL("${FILES}" cpu afcpu "") ELSE() MESSAGE(STATUS "EXAMPLES: CPU backend is OFF. afcpu was not found.") @@ -87,10 +90,9 @@ ENDIF() # Next we build each example using every backend. IF(${ArrayFire_Unified_FOUND}) # variable defined by FIND(ArrayFire ...) 
- MESSAGE(STATUS "EXAMPLES: UNIFIED backend is ON.") + OPTION(BUILD_UNIFIED "Build ArrayFire Examples for Unified backend" ON) BUILD_ALL("${FILES}" unified ${ArrayFire_Unified_LIBRARIES} "${CMAKE_DL_LIBS}") ELSEIF(TARGET af) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "EXAMPLES: UNIFIED backend is ON.") BUILD_ALL("${FILES}" unified af "${CMAKE_DL_LIBS}") ELSE() MESSAGE(STATUS "EXAMPLES: UNIFIED backend is OFF. af was not found.") @@ -104,10 +106,10 @@ IF (${CUDA_FOUND}) PATHS ${CUDA_TOOLKIT_ROOT_DIR} DOC "CUDA NVVM Library" ) - MESSAGE(STATUS "EXAMPLES: CUDA backend is ON.") + MARK_AS_ADVANCED(CUDA_NVVM_LIBRARY) + OPTION(BUILD_CUDA "Build ArrayFire Examples for CUDA backend" ON) BUILD_ALL("${FILES}" cuda ${ArrayFire_CUDA_LIBRARIES} "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") ELSEIF(TARGET afcuda) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "EXAMPLES: CUDA backend is ON.") BUILD_ALL("${FILES}" cuda afcuda "") ELSE() MESSAGE(STATUS "EXAMPLES: CUDA backend is OFF. afcuda was not found") @@ -118,10 +120,9 @@ ENDIF() IF (${OpenCL_FOUND}) IF(${ArrayFire_OpenCL_FOUND}) # variable defined by FIND(ArrayFire ...) - MESSAGE(STATUS "EXAMPLES: OpenCL backend is ON.") + OPTION(BUILD_OPENCL "Build ArrayFire Examples for OpenCL backend" ON) BUILD_ALL("${FILES}" opencl ${ArrayFire_OpenCL_LIBRARIES} "${OpenCL_LIBRARIES}") ELSEIF(TARGET afopencl) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "EXAMPLES: OpenCL backend is ON.") BUILD_ALL("${FILES}" opencl afopencl "${OpenCL_LIBRARIES}") ELSE() MESSAGE(STATUS "EXAMPLES: OpenCL backend is OFF. 
afopencl was not found") diff --git a/examples/graphics/fractal.cpp b/examples/graphics/fractal.cpp index 9ac5a86ea9..9781b61c90 100644 --- a/examples/graphics/fractal.cpp +++ b/examples/graphics/fractal.cpp @@ -10,13 +10,14 @@ #include #include #include -#include +#include #include #define WIDTH 400 // Width of image #define HEIGHT 400 // Width of image using namespace af; +using std::abs; array complex_grid(int width, int height, float zoom, float center[2]) { diff --git a/examples/graphics/gravity_sim.cpp b/examples/graphics/gravity_sim.cpp new file mode 100644 index 0000000000..3fc19d8c65 --- /dev/null +++ b/examples/graphics/gravity_sim.cpp @@ -0,0 +1,140 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include + +using namespace af; +using namespace std; + +static const int width = 512, height = 512; +static const int pixels_per_unit = 20; + +void simulate(af::array *pos, af::array *vels, af::array *forces, float dt){ + pos[0] += vels[0] * pixels_per_unit * dt; + pos[1] += vels[1] * pixels_per_unit * dt; + + //calculate distance to center + af::array diff_x = pos[0] - width/2; + af::array diff_y = pos[1] - height/2; + af::array dist = sqrt( diff_x*diff_x + diff_y*diff_y ); + + //calculate normalised force vectors + forces[0] = -1 * diff_x / dist; + forces[1] = -1 * diff_y / dist; + //update force scaled to time and magnitude constant + forces[0] *= pixels_per_unit * dt; + forces[1] *= pixels_per_unit * dt; + + //dampening + vels[0] *= 1 - (0.005*dt); + vels[1] *= 1 - (0.005*dt); + + //update velocities from forces + vels[0] += forces[0]; + vels[1] += forces[1]; + +} + +void collisions(af::array *pos, af::array *vels){ + //clamp particles inside 
screen border + af::array projected_px = min(width, max(0, pos[0])); + af::array projected_py = min(height - 1, max(0, pos[1])); + + //calculate distance to center + af::array diff_x = projected_px - width/2; + af::array diff_y = projected_py - height/2; + af::array dist = sqrt( diff_x*diff_x + diff_y*diff_y ); + + //collide with center sphere + const int radius = 50; + const float elastic_constant = 0.91f; + if(sum(dist 0) { + vels[0](dist using namespace af; +using std::abs; typedef enum { MEAN = 0, diff --git a/examples/image_processing/brain_segmentation.cpp b/examples/image_processing/brain_segmentation.cpp index 7349bf258b..253d37e5f1 100644 --- a/examples/image_processing/brain_segmentation.cpp +++ b/examples/image_processing/brain_segmentation.cpp @@ -23,10 +23,12 @@ const float h_sy_kernel[] = { -1, 0, 1, -2, 0, 2, -1, 0, 1 }; -const float h_lp_kernel[] = { -0.5f, -1.0f, -0.5f, - -1.0f, 6.0f, -1.0f, - -0.5f, -1.0f, -0.5f -}; + +// Unused +//const float h_lp_kernel[] = { -0.5f, -1.0f, -0.5f, +// -1.0f, 6.0f, -1.0f, +// -0.5f, -1.0f, -0.5f +//}; array edges_slice(array x) { diff --git a/examples/image_processing/filters.cpp b/examples/image_processing/filters.cpp index 8b75acf063..ae1d7c155c 100644 --- a/examples/image_processing/filters.cpp +++ b/examples/image_processing/filters.cpp @@ -151,7 +151,7 @@ array medianfilter(const array &in, int window_width, int window_height) return ret_val; } -array gaussianblur(const array &in, int window_width, int window_height, int sigma) +array gaussianblur(const array &in, int window_width, int window_height, double sigma) { array g = gaussianKernel(window_width, window_height, sigma, sigma); return convolve(in, g); diff --git a/include/af/array.h b/include/af/array.h index 03f3eeb23a..de746d9384 100644 --- a/include/af/array.h +++ b/include/af/array.h @@ -672,6 +672,8 @@ namespace af Get the device pointer from the array and lock the buffer in memory manager. 
@{ + The device memory returned by this function is not freed until unlock() is called. + \ingroup arrayfire_func \ingroup device_mat */ @@ -961,7 +963,7 @@ namespace af /// \brief Locks the device buffer in the memory manager. /// /// This method can be called to take control of the device pointer from the memory manager. - /// While a buffer is locked, the memory manager does not free the memory. + /// While a buffer is locked, the memory manager doesn't free the memory until unlock() is invoked. void lock() const; /// diff --git a/include/af/backend.h b/include/af/backend.h index 93d8d8de58..94c4951d45 100644 --- a/include/af/backend.h +++ b/include/af/backend.h @@ -55,6 +55,29 @@ AFAPI af_err af_get_available_backends(int* backends); AFAPI af_err af_get_backend_id(af_backend *backend, const af_array in); #endif +#if AF_API_VERSION >= 33 +/** + \param[out] backend takes one of the values of enum \ref af_backend + from the backend that is currently set to active + \returns \ref af_err error code + + \ingroup unified_func_getactivebackend + */ +AFAPI af_err af_get_active_backend(af_backend *backend); +#endif + +#if AF_API_VERSION >= 33 +/** + \param[out] device contains the device on which \p in was created. + \param[in] in is the array whose device is to be queried. + \returns \ref af_err error code + + \ingroup unified_func_getdeviceid + */ +AFAPI af_err af_get_device_id(int *device, const af_array in); +#endif + + #ifdef __cplusplus } #endif @@ -101,5 +124,26 @@ AFAPI int getAvailableBackends(); AFAPI af::Backend getBackendId(const array &in); #endif +#if AF_API_VERSION >= 33 +/** + \returns \ref af_backend which is the backend that is currently active + + \ingroup unified_func_getactivebackend + */ +AFAPI af::Backend getActiveBackend(); +#endif + +#if AF_API_VERSION >= 33 +/** + \param[in] in is the array whose device is to be queried. + \returns The id of the device on which this array was created. 
+ + \note Device ID can be the same for arrays belonging to different backends. + + \ingroup unified_func_getdeviceid + */ +AFAPI int getDeviceId(const array &in); +#endif + } #endif diff --git a/include/af/defines.h b/include/af/defines.h index a25d23996d..77508f2870 100644 --- a/include/af/defines.h +++ b/include/af/defines.h @@ -120,6 +120,13 @@ typedef enum { AF_ERR_BATCH = 207, +#if AF_API_VERSION >= 33 + /// + /// Input does not belong to the current device. + /// + AF_ERR_DEVICE = 208, +#endif + // 300-399 Errors for missing software features /// @@ -378,6 +385,19 @@ typedef enum { AF_ID = 0 } af_someenum_t; +#if AF_API_VERSION >=32 +typedef enum { + AF_MARKER_NONE = 0, + AF_MARKER_POINT = 1, + AF_MARKER_CIRCLE = 2, + AF_MARKER_SQUARE = 3, + AF_MARKER_TRIANGLE = 4, + AF_MARKER_CROSS = 5, + AF_MARKER_PLUS = 6, + AF_MARKER_STAR = 7 +} af_marker_type; +#endif + #ifdef __cplusplus namespace af { @@ -404,6 +424,9 @@ namespace af #if AF_API_VERSION >= 32 typedef af_backend Backend; #endif +#if AF_API_VERSION >= 32 + typedef af_marker_type markerType; +#endif } #endif diff --git a/include/af/device.h b/include/af/device.h index 826863e6d8..b08bd519b3 100644 --- a/include/af/device.h +++ b/include/af/device.h @@ -29,20 +29,33 @@ namespace af */ /** - \defgroup device_func_prop deviceInfo + \defgroup device_func_info_string infoString - Get device information + Get af::info() as a string @{ + \brief Returns the output of af::info() as a string + + \param[in] verbose flag to return verbose info + + \returns string containing output of af::info() + \ingroup arrayfire_func \ingroup device_mat */ - AFAPI void deviceInfo(char* d_name, char* d_platform, char *d_toolkit, char* d_compute); + AFAPI const char* infoString(const bool verbose = false); /** @} */ + /** + \copydoc device_func_prop + + \ingroup device_func_prop + */ + AFAPI void deviceInfo(char* d_name, char* d_platform, char *d_toolkit, char* d_compute); + /// \brief Gets the number of devices /// /// \copydoc 
device_func_count @@ -87,6 +100,8 @@ namespace af /// \param[in] type is the type of the elements to allocate /// \returns the pointer to the memory /// + /// \note The device memory returned by this function is only freed if af::free() is called explicitly + AFAPI void *alloc(const size_t elements, const dtype type); /// \brief Allocates memory using ArrayFire's memory manager @@ -97,10 +112,20 @@ namespace af /// /// \note the size of the memory allocated is the number of \p elements * /// sizeof(type) + /// + /// \note The device memory returned by this function is only freed if af::free() is called explicitly template T* alloc(const size_t elements); /// @} + /// \ingroup device_func_free + /// + /// \copydoc device_func_free + /// \param[in] ptr the memory to free + /// + /// This function will free a device pointer even if it has been previously locked. + AFAPI void free(const void *ptr); + /// \ingroup device_func_pinned /// @{ /// @@ -119,15 +144,51 @@ namespace af T* pinned(const size_t elements); /// @} - /// \ingroup device_func_free - /// @{ - /// \copydoc device_func_free + /// \ingroup device_func_free_pinned + /// + /// \copydoc device_func_free_pinned /// \param[in] ptr the memory to free - AFAPI void free(const void *ptr); - - /// \copydoc free() AFAPI void freePinned(const void *ptr); - ///@} + +#if AF_API_VERSION >= 33 + /// \brief Allocate memory on host + /// + /// \copydoc device_func_alloc_host + /// + /// \param[in] elements the number of elements to allocate + /// \param[in] type is the type of the elements to allocate + /// \returns the pointer to the memory + /// + /// \ingroup device_func_alloc_host + AFAPI void *allocHost(const size_t elements, const dtype type); +#endif + +#if AF_API_VERSION >= 33 + /// \brief Allocate memory on host + /// + /// \copydoc device_func_alloc_host + /// + /// \param[in] elements the number of elements to allocate + /// \returns the pointer to the memory + /// + /// \note the size of the memory allocated is 
the number of \p elements * + /// sizeof(type) + /// + /// \ingroup device_func_alloc_host + template + AFAPI T* allocHost(const size_t elements); +#endif + +#if AF_API_VERSION >= 33 + /// \brief Free memory allocated internally by ArrayFire + // + /// \copydoc device_func_free_host + /// + /// \param[in] ptr the memory to free + /// + /// \ingroup device_func_free_host + AFAPI void freeHost(const void *ptr); +#endif /// \ingroup device_func_mem /// @{ @@ -139,9 +200,25 @@ namespace af // manager /// \param[out] lock_bytes The number of bytes in use /// \param[out] lock_buffers The number of buffers in use + /// + /// \note This function performs a synchronization operation AFAPI void deviceMemInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); +#if AF_API_VERSION >= 33 + /// + /// Prints buffer details from the ArrayFire Device Manager + // + /// \param [in] msg A message to print before the table + /// \param [in] device_id print the memory info of the specified device. + /// -1 signifies active device. 
+ // + /// \ingroup device_func_mem + /// + /// \note This function performs a synchronization operation + AFAPI void printMemInfo(const char *msg = NULL, const int device_id = -1); +#endif + /// \brief Call the garbage collection function in the memory manager /// /// \ingroup device_func_mem @@ -169,10 +246,25 @@ extern "C" { */ AFAPI af_err af_info(); + /** + \ingroup device_func_info + */ AFAPI af_err af_init(); /** - \ingroup device_func_info + \brief Gets the output of af_info() as a string + + \param[out] str contains the string + \param[in] verbose flag to return verbose info + + \ingroup device_func_info_string + */ + AFAPI af_err af_info_string(char** str, const bool verbose); + + /** + \copydoc device_func_prop + + \ingroup device_func_prop */ AFAPI af_err af_device_info(char* d_name, char* d_platform, char *d_toolkit, char* d_compute); @@ -203,24 +295,42 @@ extern "C" { /** \ingroup device_func_alloc + + This device memory returned by this function can only be freed using af_free_device */ AFAPI af_err af_alloc_device(void **ptr, const dim_t bytes); /** - \ingroup device_func_pinned + \ingroup device_func_free + + This function will free a device pointer even if it has been previously locked. 
*/ - AFAPI af_err af_alloc_pinned(void **ptr, const dim_t bytes); + AFAPI af_err af_free_device(void *ptr); /** - \ingroup device_func_free + \ingroup device_func_pinned */ - AFAPI af_err af_free_device(void *ptr); + AFAPI af_err af_alloc_pinned(void **ptr, const dim_t bytes); /** \ingroup device_func_free_pinned */ AFAPI af_err af_free_pinned(void *ptr); +#if AF_API_VERSION >= 33 + /** + \ingroup device_func_alloc_host + */ + AFAPI af_err af_alloc_host(void **ptr, const dim_t bytes); +#endif + +#if AF_API_VERSION >= 33 + /** + \ingroup device_func_free_host + */ + AFAPI af_err af_free_host(void *ptr); +#endif + /** Create array from device memory \ingroup construct_mat @@ -234,6 +344,21 @@ extern "C" { AFAPI af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); +#if AF_API_VERSION >= 33 + /// + /// Prints buffer details from the ArrayFire Device Manager + // + /// \param [in] msg A message to print before the table + /// \param [in] device_id print the memory info of the specified device. + /// -1 signifies active device. + /// + /// return AF_SUCCESS if successful + /// + /// \ingroup device_func_mem + /// + AFAPI af_err af_print_mem_info(const char *msg, const int device_id); +#endif + /** Call the garbage collection routine \ingroup device_func_mem @@ -256,9 +381,12 @@ extern "C" { /** Lock the device buffer in the memory manager. - Locked buffers are not freed by memory manager until \ref af_unlock_device_ptr is called. + Locked buffers are not freed by memory manager until \ref af_unlock_array is called. \ingroup device_func_mem */ +#if AF_API_VERSION >= 33 + DEPRECATED("Use af_lock_array instead") +#endif AFAPI af_err af_lock_device_ptr(const af_array arr); #endif @@ -269,9 +397,32 @@ extern "C" { This function will give back the control over the device pointer to the memory manager. 
\ingroup device_func_mem */ +#if AF_API_VERSION >= 33 + DEPRECATED("Use af_unlock_array instead") +#endif AFAPI af_err af_unlock_device_ptr(const af_array arr); #endif +#if AF_API_VERSION >= 33 + /** + Lock the device buffer in the memory manager. + + Locked buffers are not freed by memory manager until \ref af_unlock_array is called. + \ingroup device_func_mem + */ + AFAPI af_err af_lock_array(const af_array arr); +#endif + +#if AF_API_VERSION >= 33 + /** + Unlock device buffer in the memory manager. + + This function will give back the control over the device pointer to the memory manager. + \ingroup device_func_mem + */ + AFAPI af_err af_unlock_array(const af_array arr); +#endif + /** Get the device pointer and lock the buffer in memory manager. diff --git a/include/af/exception.h b/include/af/exception.h index ee10c5db7b..a43d26dbaa 100644 --- a/include/af/exception.h +++ b/include/af/exception.h @@ -27,6 +27,9 @@ class AFAPI exception : public std::exception exception(const char *msg); exception(const char *file, unsigned line, af_err err); exception(const char *msg, const char *file, unsigned line, af_err err); +#if AF_API_VERSION >= 33 + exception(const char *msg, const char *func, const char *file, unsigned line, af_err err); +#endif virtual ~exception() throw() {} virtual const char *what() const throw() { return m_msg; } friend inline std::ostream& operator<<(std::ostream &s, const exception &e) diff --git a/include/af/graphics.h b/include/af/graphics.h index 17cb622383..b69a83854a 100644 --- a/include/af/graphics.h +++ b/include/af/graphics.h @@ -30,6 +30,8 @@ namespace af \brief Window object to render af::arrays + Windows are not CopyConstructible or CopyAssignable. 
+ \ingroup graphics_func */ class AFAPI Window { @@ -43,6 +45,9 @@ class AFAPI Window { void initWindow(const int width, const int height, const char* const title); + Window(const Window&); // Prevent copy-construction + Window& operator=(const Window&); // Prevent assignment + public: /** Creates a window object with default width @@ -84,6 +89,7 @@ class AFAPI Window { \ingroup gfx_func_window */ Window(const af_window wnd); + /** Destroys the window handle @@ -177,9 +183,39 @@ class AFAPI Window { \ingroup gfx_func_draw */ - void plot(const array& X, const array& Y, const char* const title=NULL); +#if AF_API_VERSION >= 33 + /** + Renders the input arrays as a 2D scatter-plot to the window + + \param[in] X is an \ref array with the x-axis data points + \param[in] Y is an \ref array with the y-axis data points + \param[in] marker is an \ref markerType enum specifying which marker to use in the scatter plot + \param[in] title parameter is used when this function is called in grid mode + + \note \p X and \p Y should be vectors. 
+ + \ingroup gfx_func_draw + */ + void scatter(const array& X, const array& Y, + const af::markerType marker = AF_MARKER_POINT, const char* const title = NULL); +#endif + +#if AF_API_VERSION >= 33 + /** + Renders the input arrays as a 3D scatter-plot to the window + + \param[in] P is an \ref af_array or matrix with the xyz-values of the points + \param[in] marker is an \ref markerType enum specifying which marker to use in the scatter plot + \param[in] title parameter is used when this function is called in grid mode + + \ingroup gfx_func_draw + */ + void scatter3(const array& P, const af::markerType marker = AF_MARKER_POINT, + const char* const title = NULL); +#endif + /** Renders the input array as a histogram to the window @@ -253,6 +289,17 @@ class AFAPI Window { */ bool close(); +#if AF_API_VERSION >= 33 + /** + Hide/Show the window + + \param[in] isVisible indicates if the window is to be hidden or brought into focus + + \ingroup gfx_func_window + */ + void setVisibility(const bool isVisible); +#endif + /** This function is used to keep track of which cell in the grid mode is being currently rendered. When a user does Window(0,0), we internally @@ -371,6 +418,47 @@ AFAPI af_err af_draw_image(const af_window wind, const af_array in, const af_cel */ AFAPI af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, const af_cell* const props); +#if AF_API_VERSION >= 33 +/** + C Interface wrapper for drawing an array as a plot + + \param[in] wind is the window handle + \param[in] X is an \ref af_array with the x-axis data points + \param[in] Y is an \ref af_array with the y-axis data points + \param[in] marker is an \ref af_marker_type enum specifying which marker to use in the scatter plot + \param[in] props is structure \ref af_cell that has the properties that are used + for the current rendering. + + \return \ref AF_SUCCESS if rendering is successful, otherwise an appropriate error code + is returned. 
+ + \note \p X and \p Y should be vectors. + + \ingroup gfx_func_draw +*/ +AFAPI af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, + const af_marker_type marker, const af_cell* const props); +#endif + +#if AF_API_VERSION >= 33 +/** + C Interface wrapper for drawing an array as a plot + + \param[in] wind is the window handle + \param[in] P is an \ref af_array or matrix with the xyz-values of the points + \param[in] marker is an \ref af_marker_type enum specifying which marker to use in the scatter plot + \param[in] props is structure \ref af_cell that has the properties that are used + for the current rendering. + + \return \ref AF_SUCCESS if rendering is successful, otherwise an appropriate error code + is returned. + + \ingroup gfx_func_draw +*/ +AFAPI af_err af_draw_scatter3(const af_window wind, const af_array P, + const af_marker_type marker, const af_cell* const props); +#endif + #if AF_API_VERSION >= 32 /** C Interface wrapper for drawing an array as a plot @@ -470,6 +558,18 @@ AFAPI af_err af_show(const af_window wind); */ AFAPI af_err af_is_window_closed(bool *out, const af_window wind); +#if AF_API_VERSION >= 33 +/** + Hide/Show a window + + \param[in] wind is the window whose visibility is to be changed + \param[in] is_visible indicates if the window is to be hidden or brought into focus + + \ingroup gfx_func_window + */ +AFAPI af_err af_set_visibility(const af_window wind, const bool is_visible); +#endif + /** C Interface wrapper for destroying a window handle diff --git a/include/af/image.h b/include/af/image.h index f38bb41694..0e0c0ba901 100644 --- a/include/af/image.h +++ b/include/af/image.h @@ -147,6 +147,16 @@ AFAPI array loadImageNative(const char* filename); AFAPI void saveImageNative(const char* filename, const array& in); #endif +#if AF_API_VERSION >= 33 +/** + Function to check if Image IO is available + + \returns true if ArrayFire was commpiled with ImageIO support, false otherwise. 
+ \ingroup imageio_func_available +*/ +AFAPI bool isImageIOAvailable(); +#endif + /** C++ Interface for resizing an image to specified dimensions @@ -213,6 +223,20 @@ AFAPI array rotate(const array& in, const float theta, const bool crop=true, con */ AFAPI array transform(const array& in, const array& transform, const dim_t odim0 = 0, const dim_t odim1 = 0, const interpType method=AF_INTERP_NEAREST, const bool inverse=true); +#if AF_API_VERSION >= 33 +/** + C++ Interface for transforming coordinates + + \param[in] tf is transformation matrix + \param[in] d0 is the first input dimension + \param[in] d1 is the second input dimension + \return the transformed coordinates + + \ingroup transform_func_coordinates +*/ +AFAPI array transformCoordinates(const array& tf, const float d0, const float d1); +#endif + /** C++ Interface for translating an image @@ -794,6 +818,20 @@ extern "C" { AFAPI af_err af_save_image_native(const char* filename, const af_array in); #endif +#if AF_API_VERSION >= 33 + /** + Function to check if Image IO is available + + \param[out] out is true if ArrayFire was compiled with ImageIO support, + false otherwise. 
+ + \return \ref AF_SUCCESS if successful + + \ingroup imageio_func_available + */ + AFAPI af_err af_is_image_io_available(bool *out); +#endif + /** C Interface for resizing an image to specified dimensions @@ -829,6 +867,21 @@ extern "C" { const dim_t odim0, const dim_t odim1, const af_interp_type method, const bool inverse); +#if AF_API_VERSION >= 33 + /** + C Interface for transforming an image + C++ Interface for transforming coordinates + + \param[out] out the transformed coordinates + \param[in] tf is transformation matrix + \param[in] d0 is the first input dimension + \param[in] d1 is the second input dimension + + \ingroup transform_func_coordinates + */ + AFAPI af_err af_transform_coordinates(af_array *out, const af_array tf, const float d0, const float d1); +#endif + /** C Interface for rotating an image diff --git a/include/af/internal.h b/include/af/internal.h new file mode 100644 index 0000000000..53002929c3 --- /dev/null +++ b/include/af/internal.h @@ -0,0 +1,181 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +#ifdef __cplusplus +namespace af +{ + class array; + +#if AF_API_VERSION >= 33 + /** + \param[in] data is the raw data pointer. + \param[in] offset specifies the number of elements to skip. + \param[in] dims specifies the dimensions for the region of interest. + \param[in] strides specifies the distance between each element of a given dimension. + \param[in] ty specifies the data type of \p data. + \param[in] location specifies if the data is on host or the device. + + \note: If \p location is `afHost`, a memory copy is performed. + + \returns an af::array() with specified offset, dimensions and strides. 
+ + \ingroup internal_func_create + */ + AFAPI array createStridedArray(const void *data, const dim_t offset, + const dim4 dims, const dim4 strides, + const af::dtype ty, + const af::source location); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] in An multi dimensional array. + \returns af::dim4() containing distance between consecutive elements in each dimension. + + \ingroup internal_func_strides + */ + AFAPI dim4 getStrides(const array &in); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] in An multi dimensional array. + \returns offset from the starting location of data pointer specified in number of elements. + + \ingroup internal_func_offset + */ + AFAPI dim_t getOffset(const array &in); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] in An multi dimensional array. + \returns Returns the raw pointer location to the array. + + \note This pointer may be shared with other arrays. Use this function with caution. + + \ingroup internal_func_rawptr + */ + AFAPI void *getRawPtr(const array &in); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] in An multi dimensional array. + \returns a boolean specifying if all elements in the array are contiguous. + + \ingroup internal_func_linear + */ + AFAPI bool isLinear(const array &in); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] in An multi dimensional array. + \returns a boolean specifying if the array owns the raw pointer. It is false if it is a sub array. + + \ingroup internal_func_owner + */ + AFAPI bool isOwner(const array &in); +#endif +} +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if AF_API_VERSION >= 33 + /** + \param[out] arr an af_array with specified offset, dimensions and strides. + \param[in] data is the raw data pointer. + \param[in] offset specifies the number of elements to skip. + \param[in] ndims specifies the number of array dimensions. + \param[in] dims specifies the dimensions for the region of interest. 
+ \param[in] strides specifies the distance between each element of a given dimension. + \param[in] ty specifies the data type of \p data. + \param[in] location specifies if the data is on host or the device. + + \note If \p location is `afHost`, a memory copy is performed. + + \ingroup internal_func_create + */ + AFAPI af_err af_create_strided_array(af_array *arr, + const void *data, + const dim_t offset, + const unsigned ndims, + const dim_t *const dims, + const dim_t *const strides, + const af_dtype ty, + const af_source location); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] arr An multi dimensional array. + \param[out] s0 distance between each consecutive element along first dimension. + \param[out] s1 distance between each consecutive element along second dimension. + \param[out] s2 distance between each consecutive element along third dimension. + \param[out] s3 distance between each consecutive element along fourth dimension. + + \ingroup internal_func_strides + */ + AFAPI af_err af_get_strides(dim_t *s0, dim_t *s1, dim_t *s2, dim_t *s3, const af_array arr); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] arr An multi dimensional array. + \param[out] offset: Offset from the starting location of data pointer specified in number of elements. distance between each consecutive element along first dimension. + + \ingroup internal_func_offset + */ + AFAPI af_err af_get_offset(dim_t *offset, const af_array arr); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] arr An multi dimensional array. + \param[out] ptr the raw pointer location to the array. + + \note This pointer may be shared with other arrays. Use this function with caution. + + \ingroup internal_func_rawptr + */ + AFAPI af_err af_get_raw_ptr(void **ptr, const af_array arr); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] arr An multi dimensional array. + \param[out] result: a boolean specifying if all elements in the array are contiguous. 
+ + \ingroup internal_func_linear + */ + AFAPI af_err af_is_linear(bool *result, const af_array arr); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] arr A multi-dimensional array. + \param[out] result: a boolean specifying if the array owns the raw pointer. It is false if it is a sub array. + + \ingroup internal_func_owner + */ + AFAPI af_err af_is_owner(bool *result, const af_array arr); +#endif + +#ifdef __cplusplus +} +#endif diff --git a/include/af/lapack.h b/include/af/lapack.h index f1cf87ad82..bb54069550 100644 --- a/include/af/lapack.h +++ b/include/af/lapack.h @@ -237,6 +237,18 @@ namespace af */ AFAPI double norm(const array &in, const normType type=AF_NORM_EUCLID, const double p=1, const double q=1); + +#if AF_API_VERSION >= 33 + /** + Returns true if ArrayFire is compiled with LAPACK support + + \returns true if LAPACK support is available, false otherwise + + \ingroup lapack_ops_func_norm + */ + AFAPI bool isLAPACKAvailable(); +#endif + } #endif @@ -425,6 +437,19 @@ extern "C" { */ AFAPI af_err af_norm(double *out, const af_array in, const af_norm_type type, const double p, const double q); +#if AF_API_VERSION >= 33 + /** + Returns true if ArrayFire is compiled with LAPACK support + + \param[out] out is true if LAPACK support is available, false otherwise + + \returns AF_SUCCESS if successful (does not depend on the value of out) + + \ingroup lapack_ops_func_norm + */ + AFAPI af_err af_is_lapack_available(bool *out); +#endif + #ifdef __cplusplus } diff --git a/include/af/opencl.h b/include/af/opencl.h index 271879fdc9..16b85d763f 100644 --- a/include/af/opencl.h +++ b/include/af/opencl.h @@ -7,6 +7,7 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once #if defined(__APPLE__) || defined(__MACOSX) #include #else @@ -19,6 +20,29 @@ extern "C" { #endif +#if AF_API_VERSION >= 33 +typedef enum +{ + AFCL_DEVICE_TYPE_CPU = CL_DEVICE_TYPE_CPU, + AFCL_DEVICE_TYPE_GPU = 
CL_DEVICE_TYPE_GPU, + AFCL_DEVICE_TYPE_ACC = CL_DEVICE_TYPE_ACCELERATOR, + AFCL_DEVICE_TYPE_UNKNOWN = -1 +} afcl_device_type; +#endif + +#if AF_API_VERSION >= 33 +typedef enum +{ + AFCL_PLATFORM_AMD = 0, + AFCL_PLATFORM_APPLE = 1, + AFCL_PLATFORM_INTEL = 2, + AFCL_PLATFORM_NVIDIA = 3, + AFCL_PLATFORM_BEIGNET = 4, + AFCL_PLATFORM_POCL = 5, + AFCL_PLATFORM_UNKNOWN = -1 +} afcl_platform; +#endif + /** \ingroup opencl_mat @{ @@ -63,6 +87,67 @@ AFAPI af_err afcl_get_device_id(cl_device_id *id); AFAPI af_err afcl_set_device_id(cl_device_id id); #endif +#if AF_API_VERSION >= 33 +/** + Push user provided device control constructs into the ArrayFire device manager pool + + This function should be used only when the user would like ArrayFire to use an + user generated OpenCL context and related objects for ArrayFire operations. + + \param[in] dev is the OpenCL device for which user provided context will be used by ArrayFire + \param[in] ctx is the user provided OpenCL cl_context to be used by ArrayFire + \param[in] que is the user provided OpenCL cl_command_queue to be used by ArrayFire. If this + parameter is NULL, then we create a command queue for the user using the OpenCL + context they provided us. + + \note The cl_* objects are passed onto c++ objects (cl::Device, cl::Context & cl::CommandQueue) + that are defined in the `cl.hpp` OpenCL c++ header provided by Khronos Group Inc. Therefore, please + be aware of the lifetime of the cl_* objects before passing them to ArrayFire. 
+*/ +AFAPI af_err afcl_add_device_context(cl_device_id dev, cl_context ctx, cl_command_queue que); +#endif + +#if AF_API_VERSION >= 33 +/** + Set active device using cl_context and cl_device_id + + \param[in] dev is the OpenCL device id that is to be set as Active device inside ArrayFire + \param[in] ctx is the OpenCL cl_context being used by ArrayFire +*/ +AFAPI af_err afcl_set_device_context(cl_device_id dev, cl_context ctx); +#endif + +#if AF_API_VERSION >= 33 +/** + Remove the user provided device control constructs from the ArrayFire device manager pool + + This function should be used only when the user would like ArrayFire to remove an already + pushed user generated OpenCL context and related objects. + + \param[in] dev is the OpenCL device id that has to be popped + \param[in] ctx is the cl_context object to be removed from ArrayFire pool + + \note Any reference counts incremented for cl_* objects by ArrayFire internally are decremented + by this func call and you won't be able to call `afcl_set_device_context` on these objects after + this function has been called. +*/ +AFAPI af_err afcl_delete_device_context(cl_device_id dev, cl_context ctx); +#endif + +#if AF_API_VERSION >= 33 +/** + Get the type of the current device +*/ +AFAPI af_err afcl_get_device_type(afcl_device_type *res); +#endif + +#if AF_API_VERSION >= 33 +/** + Get the platform of the current device +*/ +AFAPI af_err afcl_get_platform(afcl_platform *res); +#endif + /** @} */ @@ -147,6 +232,97 @@ namespace afcl } #endif +#if AF_API_VERSION >= 33 +/** + Push user provided device control constructs into the ArrayFire device manager pool + + This function should be used only when the user would like ArrayFire to use an + user generated OpenCL context and related objects for ArrayFire operations. 
+ + \param[in] dev is the OpenCL device for which user provided context will be used by ArrayFire + \param[in] ctx is the user provided OpenCL cl_context to be used by ArrayFire + \param[in] que is the user provided OpenCL cl_command_queue to be used by ArrayFire. If this + parameter is NULL, then we create a command queue for the user using the OpenCL + context they provided us. + + \note The cl_* objects are passed onto c++ objects (cl::Device, cl::Context & cl::CommandQueue) + that are defined in the `cl.hpp` OpenCL c++ header provided by Khronos Group Inc. Therefore, please + be aware of the lifetime of the cl_* objects before passing them to ArrayFire. +*/ +static inline void addDevice(cl_device_id dev, cl_context ctx, cl_command_queue que) +{ + af_err err = afcl_add_device_context(dev, ctx, que); + if (err!=AF_SUCCESS) throw af::exception("Failed to push user provided device/context to ArrayFire pool"); +} +#endif + +#if AF_API_VERSION >= 33 +/** + Set active device using cl_context and cl_device_id + + \param[in] dev is the OpenCL device id that is to be set as Active device inside ArrayFire + \param[in] ctx is the OpenCL cl_context being used by ArrayFire +*/ +static inline void setDevice(cl_device_id dev, cl_context ctx) +{ + af_err err = afcl_set_device_context(dev, ctx); + if (err!=AF_SUCCESS) throw af::exception("Failed to set device based on cl_device_id & cl_context"); +} +#endif + +#if AF_API_VERSION >= 33 +/** + Remove the user provided device control constructs from the ArrayFire device manager pool + + This function should be used only when the user would like ArrayFire to remove an already + pushed user generated OpenCL context and related objects. 
+ + \param[in] dev is the OpenCL device id that has to be popped + \param[in] ctx is the cl_context object to be removed from ArrayFire pool + + \note Any reference counts incremented for cl_* objects by ArrayFire internally are decremented + by this func call and you won't be able to call `afcl_set_device_context` on these objects after + this function has been called. +*/ +static inline void deleteDevice(cl_device_id dev, cl_context ctx) +{ + af_err err = afcl_delete_device_context(dev, ctx); + if (err!=AF_SUCCESS) throw af::exception("Failed to remove the requested device from ArrayFire device pool"); +} +#endif + + +#if AF_API_VERSION >= 33 + typedef afcl_device_type deviceType; + typedef afcl_platform platform; +#endif + +#if AF_API_VERSION >= 33 +/** + Get the type of the current device +*/ +static inline deviceType getDeviceType() +{ + afcl_device_type res = AFCL_DEVICE_TYPE_UNKNOWN; + af_err err = afcl_get_device_type(&res); + if (err!=AF_SUCCESS) throw af::exception("Failed to get OpenCL device type"); + return res; +} +#endif + +#if AF_API_VERSION >= 33 +/** + Get the platform of the current device +*/ +static inline platform getPlatform() +{ + afcl_platform res = AFCL_PLATFORM_UNKNOWN; + af_err err = afcl_get_platform(&res); + if (err!=AF_SUCCESS) throw af::exception("Failed to get OpenCL platform"); + return res; +} +#endif + /** Create an af::array object from an OpenCL cl_mem buffer @@ -263,15 +439,15 @@ namespace afcl return afcl::array(af::dim4(dim0, dim1, dim2, dim3), buf, type, retain); } - /** +/** @} - */ - +*/ } namespace af { +#if !defined(AF_OPENCL) template<> AFAPI cl_mem *array::device() const { cl_mem *mem = new cl_mem; @@ -279,6 +455,7 @@ template<> AFAPI cl_mem *array::device() const if (err != AF_SUCCESS) throw af::exception("Failed to get cl_mem from array object"); return mem; } +#endif } diff --git a/include/af/util.h b/include/af/util.h index c1fd96ab24..eef46f47c9 100644 --- a/include/af/util.h +++ b/include/af/util.h @@ -95,7 +95,8 @@ 
namespace af #if AF_API_VERSION >= 31 /** \param[out] output is the pointer to the c-string that will hold the data. The memory for - output is allocated by the function. The user is responsible for deleting the memory. + output is allocated by the function. The user is responsible for deleting the memory using + af::freeHost() or af_free_host(). \param[in] exp is an expression, generally the name of the array \param[in] arr is the input array \param[in] precision is the precision length for display @@ -108,6 +109,24 @@ namespace af const int precision = 4, const bool transpose = true); #endif +#if AF_API_VERSION >= 33 + /** + \param[in] exp is an expression, generally the name of the array + \param[in] arr is the input array + \param[in] precision is the precision length for display + \param[in] transpose determines whether or not to transpose the array before storing it in + the string + + \return output is the pointer to the c-string that will hold the data. The memory for + output is allocated by the function. The user is responsible for deleting the memory using + af::freeHost() or af_free_host(). + + \ingroup print_func_tostring + */ + AFAPI const char* toString(const char *exp, const array &arr, + const int precision = 4, const bool transpose = true); +#endif + // Purpose of Addition: "How to add Function" documentation AFAPI array exampleFunction(const array& in, const af_someenum_t param); } @@ -229,10 +248,20 @@ extern "C" { AFAPI af_err af_example_function(af_array* out, const af_array in, const af_someenum_t param); /// - ///Get the version information of the library + /// Get the version information of the library /// AFAPI af_err af_get_version(int *major, int *minor, int *patch); + +#if AF_API_VERSION >= 33 + /// + /// Get the revision (commit) information of the library. + /// This returns a constant string from compile time and should not be + /// freed by the user. 
+ /// + AFAPI const char *af_get_revision(); +#endif + #ifdef __cplusplus } #endif diff --git a/include/arrayfire.h b/include/arrayfire.h index 7d9e75a7b4..60df3176d1 100644 --- a/include/arrayfire.h +++ b/include/arrayfire.h @@ -113,6 +113,8 @@ @defgroup lapack_ops_mat Matrix operations inverse, det, rank, norm etc. + + @defgroup lapack_helper LAPACK Helper functions @} @defgroup image_mat Image Processing @@ -207,6 +209,15 @@ @} + @defgroup internal_func Functions to work with internal array layout + @{ + + Functions to work with arrayfire's internal data structure. + + Note: The behavior of these functions is not promised to be consistent across versions. + + @} + @defgroup external Interface Functions @{ diff --git a/src/api/c/array.cpp b/src/api/c/array.cpp new file mode 100644 index 0000000000..cefdde1d75 --- /dev/null +++ b/src/api/c/array.cpp @@ -0,0 +1,96 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ +#include +#include +#include + +const ArrayInfo& +getInfo(const af_array arr, bool check) +{ + const ArrayInfo *info = static_cast(reinterpret_cast(arr)); + + if (check && info->getDevId() != detail::getActiveDeviceId()) { + AF_ERROR("Input Array not created on current device", AF_ERR_DEVICE); + } + + return *info; +} + +af_err af_get_elements(dim_t *elems, const af_array arr) +{ + try { + // Do not check for device mismatch + *elems = getInfo(arr, false).elements(); + } CATCHALL + return AF_SUCCESS; +} + +af_err af_get_type(af_dtype *type, const af_array arr) +{ + try { + // Do not check for device mismatch + *type = getInfo(arr, false).getType(); + } CATCHALL + return AF_SUCCESS; +} + +af_err af_get_dims(dim_t *d0, dim_t *d1, dim_t *d2, dim_t *d3, + const af_array in) +{ + try { + // Do not check for device mismatch + ArrayInfo info = getInfo(in, false); + *d0 = info.dims()[0]; + *d1 = info.dims()[1]; + *d2 = info.dims()[2]; + *d3 = info.dims()[3]; + } + CATCHALL + return AF_SUCCESS; +} + +af_err af_get_numdims(unsigned *nd, const af_array in) +{ + try { + // Do not check for device mismatch + ArrayInfo info = getInfo(in, false); + *nd = info.ndims(); + } + CATCHALL + return AF_SUCCESS; +} + + +#undef INSTANTIATE +#define INSTANTIATE(fn1, fn2) \ + af_err fn1(bool *result, const af_array in) \ + { \ + try { \ + ArrayInfo info = getInfo(in, false); \ + *result = info.fn2(); \ + } \ + CATCHALL \ + return AF_SUCCESS; \ + } + +INSTANTIATE(af_is_empty , isEmpty ) +INSTANTIATE(af_is_scalar , isScalar ) +INSTANTIATE(af_is_row , isRow ) +INSTANTIATE(af_is_column , isColumn ) +INSTANTIATE(af_is_vector , isVector ) +INSTANTIATE(af_is_complex , isComplex ) +INSTANTIATE(af_is_real , isReal ) +INSTANTIATE(af_is_double , isDouble ) +INSTANTIATE(af_is_single , isSingle ) +INSTANTIATE(af_is_realfloating, isRealFloating) 
+INSTANTIATE(af_is_floating , isFloating ) +INSTANTIATE(af_is_integer , isInteger ) +INSTANTIATE(af_is_bool , isBool ) + +#undef INSTANTIATE diff --git a/src/api/c/assign.cpp b/src/api/c/assign.cpp index 13fa179da8..8ff37630e8 100644 --- a/src/api/c/assign.cpp +++ b/src/api/c/assign.cpp @@ -29,6 +29,7 @@ using std::swap; template Array modDims(const Array& in, const af::dim4 &newDims); + template static void assign(Array &out, const unsigned &ndims, const af_seq *index, const Array &in_) @@ -39,7 +40,7 @@ void assign(Array &out, const unsigned &ndims, const af_seq *index, const DIM_ASSERT(0, (outDs.ndims()>=iDims.ndims())); DIM_ASSERT(0, (outDs.ndims()>=(dim_t)ndims)); - evalArray(out); + out.eval(); vector index_(index, index+ndims); @@ -125,7 +126,7 @@ af_err af_assign_seq(af_array *out, ArrayInfo lInfo = getInfo(lhs); - if (ndims == 1 && ndims != (dim_t)lInfo.ndims()) { + if (ndims == 1 && ndims != lInfo.ndims()) { af_array tmp_in, tmp_out; AF_CHECK(af_flat(&tmp_in, lhs)); AF_CHECK(af_assign_seq(&tmp_out, tmp_in, ndims, index, rhs)); @@ -350,10 +351,10 @@ af_err af_assign_gen(af_array *out, throw; } if (is_vector) { AF_CHECK(af_release_array(rhs)); } + + std::swap(*out, output); } CATCHALL; - std::swap(*out, output); - return AF_SUCCESS; } diff --git a/src/api/c/data.cpp b/src/api/c/data.cpp index 4d77fb279e..522eb7dfcb 100644 --- a/src/api/c/data.cpp +++ b/src/api/c/data.cpp @@ -539,62 +539,10 @@ af_err af_iota(af_array *result, const unsigned ndims, const dim_t * const dims, return AF_SUCCESS; } -#undef INSTANTIATE -#define INSTANTIATE(fn1, fn2) \ - af_err fn1(bool *result, const af_array in) \ - { \ - try { \ - ArrayInfo info = getInfo(in); \ - *result = info.fn2(); \ - } \ - CATCHALL \ - return AF_SUCCESS; \ - } - -INSTANTIATE(af_is_empty , isEmpty ) -INSTANTIATE(af_is_scalar , isScalar ) -INSTANTIATE(af_is_row , isRow ) -INSTANTIATE(af_is_column , isColumn ) -INSTANTIATE(af_is_vector , isVector ) -INSTANTIATE(af_is_complex , isComplex ) 
-INSTANTIATE(af_is_real , isReal ) -INSTANTIATE(af_is_double , isDouble ) -INSTANTIATE(af_is_single , isSingle ) -INSTANTIATE(af_is_realfloating, isRealFloating) -INSTANTIATE(af_is_floating , isFloating ) -INSTANTIATE(af_is_integer , isInteger ) -INSTANTIATE(af_is_bool , isBool ) - -#undef INSTANTIATE - -af_err af_get_dims(dim_t *d0, dim_t *d1, dim_t *d2, dim_t *d3, - const af_array in) -{ - try { - ArrayInfo info = getInfo(in); - *d0 = info.dims()[0]; - *d1 = info.dims()[1]; - *d2 = info.dims()[2]; - *d3 = info.dims()[3]; - } - CATCHALL - return AF_SUCCESS; -} - -af_err af_get_numdims(unsigned *nd, const af_array in) -{ - try { - ArrayInfo info = getInfo(in); - *nd = info.ndims(); - } - CATCHALL - return AF_SUCCESS; -} - template static inline void eval(af_array arr) { - evalArray(getArray(arr)); + getArray(arr).eval(); return; } diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index 28b4cc2c49..abe0b01e32 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -15,8 +15,8 @@ #include #include #include -#include #include "err_common.hpp" +#include using namespace detail; @@ -38,7 +38,9 @@ af_err af_get_backend_count(unsigned* num_backends) af_err af_get_available_backends(int* result) { - *result = getBackend(); + try { + *result = getBackend(); + } CATCHALL; return AF_SUCCESS; } @@ -46,18 +48,34 @@ af_err af_get_backend_id(af_backend *result, const af_array in) { try { ARG_ASSERT(1, in != 0); - ArrayInfo info = getInfo(in); + ArrayInfo info = getInfo(in, false); *result = info.getBackendId(); } CATCHALL; return AF_SUCCESS; } +af_err af_get_device_id(int *device, const af_array in) +{ + try { + ARG_ASSERT(1, in != 0); + ArrayInfo info = getInfo(in, false); + *device = info.getDevId(); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_get_active_backend(af_backend *result) +{ + *result = (af_backend)getBackend(); + return AF_SUCCESS; +} + af_err af_init() { try { static bool first = true; if(first) { - getInfo(); + getDeviceInfo(); first = false; } 
} CATCHALL; @@ -66,15 +84,23 @@ af_err af_init() af_err af_info() { - printf("%s", getInfo().c_str()); + try { + printf("%s", getDeviceInfo().c_str()); + } CATCHALL; return AF_SUCCESS; } -af_err af_get_version(int *major, int *minor, int *patch) +af_err af_info_string(char **str, const bool verbose) { - *major = AF_VERSION_MAJOR; - *minor = AF_VERSION_MINOR; - *patch = AF_VERSION_PATCH; + try { + std::string infoStr = getDeviceInfo(); + af_alloc_host((void**)str, sizeof(char) * (infoStr.size() + 1)); + + // Need to do a deep copy + // str.c_str wont cut it + infoStr.copy(*str, infoStr.size()); + (*str)[infoStr.size()] = '\0'; + } CATCHALL; return AF_SUCCESS; } @@ -130,198 +156,3 @@ af_err af_sync(const int device) } CATCHALL; return AF_SUCCESS; } - -af_err af_device_array(af_array *arr, const void *data, - const unsigned ndims, - const dim_t * const dims, - const af_dtype type) -{ - try { - AF_CHECK(af_init()); - - af_array res; - - DIM_ASSERT(1, ndims >= 1); - dim4 d(1, 1, 1, 1); - for(unsigned i = 0; i < ndims; i++) { - d[i] = dims[i]; - DIM_ASSERT(3, dims[i] >= 1); - } - - switch (type) { - case f32: res = getHandle(createDeviceDataArray(d, data)); break; - case f64: res = getHandle(createDeviceDataArray(d, data)); break; - case c32: res = getHandle(createDeviceDataArray(d, data)); break; - case c64: res = getHandle(createDeviceDataArray(d, data)); break; - case s32: res = getHandle(createDeviceDataArray(d, data)); break; - case u32: res = getHandle(createDeviceDataArray(d, data)); break; - case s64: res = getHandle(createDeviceDataArray(d, data)); break; - case u64: res = getHandle(createDeviceDataArray(d, data)); break; - case s16: res = getHandle(createDeviceDataArray(d, data)); break; - case u16: res = getHandle(createDeviceDataArray(d, data)); break; - case u8 : res = getHandle(createDeviceDataArray(d, data)); break; - case b8 : res = getHandle(createDeviceDataArray(d, data)); break; - default: TYPE_ERROR(4, type); - } - - std::swap(*arr, res); - } 
CATCHALL; - - return AF_SUCCESS; -} - -af_err af_get_device_ptr(void **data, const af_array arr) -{ - try { - af_dtype type = getInfo(arr).getType(); - - switch (type) { - //FIXME: Perform copy if memory not continuous - case f32: *data = getDevicePtr(getArray(arr)); break; - case f64: *data = getDevicePtr(getArray(arr)); break; - case c32: *data = getDevicePtr(getArray(arr)); break; - case c64: *data = getDevicePtr(getArray(arr)); break; - case s32: *data = getDevicePtr(getArray(arr)); break; - case u32: *data = getDevicePtr(getArray(arr)); break; - case s64: *data = getDevicePtr(getArray(arr)); break; - case u64: *data = getDevicePtr(getArray(arr)); break; - case s16: *data = getDevicePtr(getArray(arr)); break; - case u16: *data = getDevicePtr(getArray(arr)); break; - case u8 : *data = getDevicePtr(getArray(arr)); break; - case b8 : *data = getDevicePtr(getArray(arr)); break; - - default: TYPE_ERROR(4, type); - } - - } CATCHALL; - - return AF_SUCCESS; -} - -template -inline void lockDevicePtr(const af_array arr) -{ - memPop((const T *)getArray(arr).get()); -} - -af_err af_lock_device_ptr(const af_array arr) -{ - try { - af_dtype type = getInfo(arr).getType(); - - switch (type) { - case f32: lockDevicePtr(arr); break; - case f64: lockDevicePtr(arr); break; - case c32: lockDevicePtr(arr); break; - case c64: lockDevicePtr(arr); break; - case s32: lockDevicePtr(arr); break; - case u32: lockDevicePtr(arr); break; - case s64: lockDevicePtr(arr); break; - case u64: lockDevicePtr(arr); break; - case s16: lockDevicePtr(arr); break; - case u16: lockDevicePtr(arr); break; - case u8 : lockDevicePtr(arr); break; - case b8 : lockDevicePtr(arr); break; - default: TYPE_ERROR(4, type); - } - - } CATCHALL; - - return AF_SUCCESS; -} - -template -inline void unlockDevicePtr(const af_array arr) -{ - memPush((const T *)getArray(arr).get()); -} - -af_err af_unlock_device_ptr(const af_array arr) -{ - try { - af_dtype type = getInfo(arr).getType(); - - switch (type) { - case f32: 
unlockDevicePtr(arr); break; - case f64: unlockDevicePtr(arr); break; - case c32: unlockDevicePtr(arr); break; - case c64: unlockDevicePtr(arr); break; - case s32: unlockDevicePtr(arr); break; - case u32: unlockDevicePtr(arr); break; - case s64: unlockDevicePtr(arr); break; - case u64: unlockDevicePtr(arr); break; - case s16: unlockDevicePtr(arr); break; - case u16: unlockDevicePtr(arr); break; - case u8 : unlockDevicePtr(arr); break; - case b8 : unlockDevicePtr(arr); break; - default: TYPE_ERROR(4, type); - } - - } CATCHALL; - - return AF_SUCCESS; -} - - -af_err af_alloc_device(void **ptr, const dim_t bytes) -{ - try { - AF_CHECK(af_init()); - *ptr = (void *)memAlloc(bytes); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_alloc_pinned(void **ptr, const dim_t bytes) -{ - try { - AF_CHECK(af_init()); - *ptr = (void *)pinnedAlloc(bytes); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_free_device(void *ptr) -{ - try { - memFree((char *)ptr); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_free_pinned(void *ptr) -{ - try { - pinnedFree((char *)ptr); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_device_gc() -{ - try { - garbageCollect(); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers, - size_t *lock_bytes, size_t *lock_buffers) -{ - try { - deviceMemoryInfo(alloc_bytes, alloc_buffers, lock_bytes, lock_buffers); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_set_mem_step_size(const size_t step_bytes) -{ - detail::setMemStepSize(step_bytes); - return AF_SUCCESS; -} - -af_err af_get_mem_step_size(size_t *step_bytes) -{ - *step_bytes = detail::getMemStepSize(); - return AF_SUCCESS; -} diff --git a/src/api/c/err_common.cpp b/src/api/c/err_common.cpp index 2b8a441bd3..e95ece6d4b 100644 --- a/src/api/c/err_common.cpp +++ b/src/api/c/err_common.cpp @@ -8,8 +8,10 @@ ********************************************************/ #include +#include #include #include +#include #include #include #include @@ 
-148,59 +150,15 @@ int DimensionError::getArgIndex() const return argIndex; } -static const int MAX_ERR_SIZE = 1024; -static std::string global_err_string; - void print_error(const string &msg) { - const char* perr = getenv("AF_PRINT_ERRORS"); - if(perr != nullptr) { - if(std::strncmp(perr, "0", 1) != 0) + std::string perr = getEnvVar("AF_PRINT_ERRORS"); + if(!perr.empty()) { + if(perr != "0") fprintf(stderr, "%s\n", msg.c_str()); } - global_err_string = msg; -} - -void af_get_last_error(char **str, dim_t *len) -{ - *len = std::min(MAX_ERR_SIZE, (int)global_err_string.size()); - - if (*len == 0) { - *str = NULL; - } - - *str = new char[*len + 1]; - memcpy(*str, global_err_string.c_str(), *len * sizeof(char)); - - (*str)[*len] = '\0'; - global_err_string = std::string(""); -} - -const char *af_err_to_string(const af_err err) -{ - switch (err) { - case AF_SUCCESS: return "Success"; - case AF_ERR_INTERNAL: return "Internal error"; - case AF_ERR_NO_MEM: return "Device out of memory"; - case AF_ERR_DRIVER: return "Driver not available or incompatible"; - case AF_ERR_RUNTIME: return "Runtime error "; - case AF_ERR_INVALID_ARRAY: return "Invalid array"; - case AF_ERR_ARG: return "Invalid input argument"; - case AF_ERR_SIZE: return "Invalid input size"; - case AF_ERR_DIFF_TYPE: return "Input types are not the same"; - case AF_ERR_NOT_SUPPORTED: return "Function not supported"; - case AF_ERR_NOT_CONFIGURED: return "Function not configured to build"; - case AF_ERR_TYPE: return "Function does not support this data type"; - case AF_ERR_NO_DBL: return "Double precision not supported for this device"; - case AF_ERR_LOAD_LIB: return "Failed to load dynamic library. 
See http://www.arrayfire.com/docs/unifiedbackend.htm for instructions to set up environment for Unified backend"; - case AF_ERR_LOAD_SYM: return "Failed to load symbol"; - case AF_ERR_ARR_BKND_MISMATCH : - return "There was a mismatch between an array and the current backend"; - case AF_ERR_UNKNOWN: - default: - return "Unknown error"; - } + get_global_error_string() = msg; } af_err processException() @@ -260,3 +218,9 @@ af_err processException() return err; } + +std::string& get_global_error_string() +{ + static std::string global_error_string = std::string(""); + return global_error_string; +} diff --git a/src/api/c/err_common.hpp b/src/api/c/err_common.hpp index c8eb90a7f6..60ef64276b 100644 --- a/src/api/c/err_common.hpp +++ b/src/api/c/err_common.hpp @@ -203,3 +203,7 @@ void print_error(const std::string &msg); __AF_FILENAME__, __LINE__, \ "\n", __err); \ } while(0) + + +static const int MAX_ERR_SIZE = 1024; +std::string& get_global_error_string(); diff --git a/src/api/c/error.cpp b/src/api/c/error.cpp new file mode 100644 index 0000000000..521ca9bef5 --- /dev/null +++ b/src/api/c/error.cpp @@ -0,0 +1,63 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include + +void af_get_last_error(char **str, dim_t *len) +{ + std::string &global_error_string = get_global_error_string(); + dim_t slen = std::min(MAX_ERR_SIZE, (int)global_error_string.size()); + + if (len && slen == 0) { + *len = 0; + *str = NULL; + return; + } + + af_alloc_host((void**)str, sizeof(char) * (slen + 1)); + global_error_string.copy(*str, slen); + + (*str)[slen] = '\0'; + global_error_string = std::string(""); + + if(len) *len = slen; +} + +const char *af_err_to_string(const af_err err) +{ + switch (err) { + case AF_SUCCESS: return "Success"; + case AF_ERR_NO_MEM: return "Device out of memory"; + case AF_ERR_DRIVER: return "Driver not available or incompatible"; + case AF_ERR_RUNTIME: return "Runtime error "; + case AF_ERR_INVALID_ARRAY: return "Invalid array"; + case AF_ERR_ARG: return "Invalid input argument"; + case AF_ERR_SIZE: return "Invalid input size"; + case AF_ERR_TYPE: return "Function does not support this data type"; + case AF_ERR_DIFF_TYPE: return "Input types are not the same"; + case AF_ERR_BATCH: return "Invalid batch configuration"; + case AF_ERR_NOT_SUPPORTED: return "Function not supported"; + case AF_ERR_NOT_CONFIGURED: return "Function not configured to build"; + case AF_ERR_NONFREE: return "Function unavailable. " + "ArrayFire compiled without Non-Free algorithms support"; + case AF_ERR_NO_DBL: return "Double precision not supported for this device"; + case AF_ERR_NO_GFX: return "Graphics functionality unavailable. " + "ArrayFire compiled without Graphics support"; + case AF_ERR_LOAD_LIB: return "Failed to load dynamic library. 
"; + case AF_ERR_LOAD_SYM: return "Failed to load symbol"; + case AF_ERR_ARR_BKND_MISMATCH: return "There was a mismatch between an array and the current backend"; + case AF_ERR_INTERNAL: return "Internal error"; + case AF_ERR_UNKNOWN: + default: return "Unknown error"; + } +} diff --git a/src/api/c/flip.cpp b/src/api/c/flip.cpp index 3d5bf53da8..09cbaf75e4 100644 --- a/src/api/c/flip.cpp +++ b/src/api/c/flip.cpp @@ -74,9 +74,9 @@ af_err af_flip(af_array *result, const af_array in, const unsigned dim) case u8: out = flipArray (in, dim); break; default: TYPE_ERROR(1, in_type); } + swap(*result, out); } CATCHALL - swap(*result, out); return AF_SUCCESS; } diff --git a/src/api/c/graphics_common.cpp b/src/api/c/graphics_common.cpp index 4b50bc046e..dc5a46b5e1 100644 --- a/src/api/c/graphics_common.cpp +++ b/src/api/c/graphics_common.cpp @@ -13,12 +13,29 @@ #include #include #include +#include using namespace std; template GLenum getGLType() { return GL_FLOAT; } +fg::MarkerType getFGMarker(const af_marker_type af_marker) { + fg::MarkerType fg_marker; + switch (af_marker) { + case AF_MARKER_NONE: fg_marker = fg::FG_NONE; break; + case AF_MARKER_POINT: fg_marker = fg::FG_POINT; break; + case AF_MARKER_CIRCLE: fg_marker = fg::FG_CIRCLE; break; + case AF_MARKER_SQUARE: fg_marker = fg::FG_SQUARE; break; + case AF_MARKER_TRIANGLE: fg_marker = fg::FG_TRIANGLE; break; + case AF_MARKER_CROSS: fg_marker = fg::FG_CROSS; break; + case AF_MARKER_PLUS: fg_marker = fg::FG_PLUS; break; + case AF_MARKER_STAR: fg_marker = fg::FG_STAR; break; + default: fg_marker = fg::FG_NONE; break; + } + return fg_marker; +} + #define INSTANTIATE_GET_FG_TYPE(T, ForgeEnum)\ template<> fg::dtype getGLType() { return ForgeEnum; } @@ -129,8 +146,8 @@ fg::Window* ForgeManager::getMainWindow(const bool dontCreate) static fg::Window* wnd = NULL; // Define AF_DISABLE_GRAPHICS with any value to disable initialization - const char* noGraphicsENV = getenv("AF_DISABLE_GRAPHICS"); - if(!noGraphicsENV) { // If 
AF_DISABLE_GRAPHICS is not defined + std::string noGraphicsENV = getEnvVar("AF_DISABLE_GRAPHICS"); + if(noGraphicsENV.empty()) { // If AF_DISABLE_GRAPHICS is not defined if (flag && !dontCreate) { wnd = new fg::Window(WIDTH, HEIGHT, "ArrayFire", NULL, true); CheckGL("End ForgeManager::getMainWindow"); @@ -161,7 +178,7 @@ fg::Image* ForgeManager::getImage(int w, int h, fg::ChannelFormat mode, fg::dtyp return mImgMap[key]; } -fg::Plot* ForgeManager::getPlot(int nPoints, fg::dtype type) +fg::Plot* ForgeManager::getPlot(int nPoints, fg::dtype dtype, fg::PlotType ptype, fg::MarkerType mtype) { /* nPoints needs to fall in the range of [0, 2^48] * for the ForgeManager to correctly retrieve @@ -169,18 +186,19 @@ fg::Plot* ForgeManager::getPlot(int nPoints, fg::dtype type) * is a limitation on how big of an plot graph can be rendered * using arrayfire graphics funtionality */ assert(nPoints <= 2ll<<48); - long long key = ((nPoints & _48BIT) << 48) | (type & _16BIT); + long long key = ((nPoints & _48BIT) << 48); + key |= (((((dtype & 0x000F) << 12) | (ptype & 0x000F)) << 8) | (mtype & 0x000F)); PltMapIter iter = mPltMap.find(key); if (iter==mPltMap.end()) { - fg::Plot* temp = new fg::Plot(nPoints, type); + fg::Plot* temp = new fg::Plot(nPoints, dtype, ptype, mtype); mPltMap[key] = temp; } return mPltMap[key]; } -fg::Plot3* ForgeManager::getPlot3(int nPoints, fg::dtype type) +fg::Plot3* ForgeManager::getPlot3(int nPoints, fg::dtype dtype, fg::PlotType ptype, fg::MarkerType mtype) { /* nPoints needs to fall in the range of [0, 2^48] * for the ForgeManager to correctly retrieve @@ -188,11 +206,12 @@ fg::Plot3* ForgeManager::getPlot3(int nPoints, fg::dtype type) * is a limitation on how big of an plot graph can be rendered * using arrayfire graphics funtionality */ assert(nPoints <= 2ll<<48); - long long key = ((nPoints & _48BIT) << 48) | (type & _16BIT); + long long key = ((nPoints & _48BIT) << 48); + key |= (((((dtype & 0x000F) << 12) | (ptype & 0x000F)) << 8) | (mtype & 
0x000F)); Plt3MapIter iter = mPlt3Map.find(key); if (iter==mPlt3Map.end()) { - fg::Plot3* temp = new fg::Plot3(nPoints, type); + fg::Plot3* temp = new fg::Plot3(nPoints, dtype, ptype, mtype); mPlt3Map[key] = temp; } diff --git a/src/api/c/graphics_common.hpp b/src/api/c/graphics_common.hpp index 082c0c7ba8..53d4629f68 100644 --- a/src/api/c/graphics_common.hpp +++ b/src/api/c/graphics_common.hpp @@ -30,6 +30,7 @@ GLenum glForceErrorCheck(const char *msg, const char* file, int line); #define ForceCheckGL(msg) glForceErrorCheck(msg, __AF_FILENAME__, __LINE__) #define CheckGLSkip(msg) glErrorSkip (msg, __AF_FILENAME__, __LINE__) +fg::MarkerType getFGMarker(const af_marker_type af_marker); namespace graphics { @@ -82,8 +83,8 @@ class ForgeManager fg::Font* getFont(const bool dontCreate=false); fg::Window* getMainWindow(const bool dontCreate=false); fg::Image* getImage(int w, int h, fg::ChannelFormat mode, fg::dtype type); - fg::Plot* getPlot(int nPoints, fg::dtype type); - fg::Plot3* getPlot3(int nPoints, fg::dtype type); + fg::Plot* getPlot(int nPoints, fg::dtype dtype, fg::PlotType ptype, fg::MarkerType mtype); + fg::Plot3* getPlot3(int nPoints, fg::dtype dtype,fg::PlotType ptype, fg::MarkerType mtype); fg::Histogram* getHistogram(int nBins, fg::dtype type); fg::Surface* getSurface(int nX, int nY, fg::dtype type); diff --git a/src/api/c/handle.hpp b/src/api/c/handle.hpp index 70f17eb18e..ac7b74a193 100644 --- a/src/api/c/handle.hpp +++ b/src/api/c/handle.hpp @@ -16,6 +16,8 @@ #include #include +const ArrayInfo& getInfo(const af_array arr, bool check = true); + template static const detail::Array & getArray(const af_array &arr) diff --git a/src/api/c/image.cpp b/src/api/c/image.cpp index 1d3e0970ba..2c523d0947 100644 --- a/src/api/c/image.cpp +++ b/src/api/c/image.cpp @@ -141,9 +141,9 @@ af_err af_create_window(af_window *out, const int width, const int height, const wnd = new fg::Window(width, height, title, mainWnd); wnd->setFont(fgMngr.getFont()); + *out = 
reinterpret_cast(wnd); } CATCHALL; - *out = reinterpret_cast(wnd); return AF_SUCCESS; #else AF_RETURN_ERROR("ArrayFire compiled without graphics support", AF_ERR_NO_GFX); @@ -264,6 +264,28 @@ af_err af_is_window_closed(bool *out, const af_window wind) #endif } +af_err af_set_visibility(const af_window wind, const bool is_visible) +{ +#if defined(WITH_GRAPHICS) + if(wind==0) { + std::cerr<<"Not a valid window"<(wind); + if (is_visible) + wnd->show(); + else + wnd->hide(); + } + CATCHALL; + return AF_SUCCESS; +#else + AF_RETURN_ERROR("ArrayFire compiled without graphics support", AF_ERR_NO_GFX); +#endif +} + af_err af_destroy_window(const af_window wind) { #if defined(WITH_GRAPHICS) diff --git a/src/api/c/imageio.cpp b/src/api/c/imageio.cpp index 746ee69142..5e3f7a59cb 100644 --- a/src/api/c/imageio.cpp +++ b/src/api/c/imageio.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -60,14 +61,15 @@ static af_err readImage(af_array *rImage, const uchar* pSrcLine, const int nSrcP pDst0[indx] = (float) *(src + (x * step + FI_RGBA_RED)); pDst1[indx] = (float) *(src + (x * step + FI_RGBA_GREEN)); pDst2[indx] = (float) *(src + (x * step + FI_RGBA_BLUE)); + if (fo_color == 4) pDst3[indx] = (float) *(src + (x * step + FI_RGBA_ALPHA)); } else { // Non 8-bit types do not use ordering // See Pixel Access Functions Chapter in FreeImage Doc pDst0[indx] = (float) *(src + (x * step + 0)); pDst1[indx] = (float) *(src + (x * step + 1)); pDst2[indx] = (float) *(src + (x * step + 2)); + if (fo_color == 4) pDst3[indx] = (float) *(src + (x * step + 3)); } - if (fo_color == 4) pDst3[indx] = (float) *(src + (x * step + FI_RGBA_ALPHA)); } indx++; } @@ -186,6 +188,9 @@ af_err af_load_image(af_array *out, const char* filename, const bool isColor) AF_ERROR("FreeImage Error: Bits per channel not supported", AF_ERR_NOT_SUPPORTED); } + // data type + FREE_IMAGE_TYPE image_type = FreeImage_GetImageType(pBitmap); + // sizes uint fi_w = FreeImage_GetWidth(pBitmap); uint fi_h 
= FreeImage_GetHeight(pBitmap); @@ -203,21 +208,36 @@ af_err af_load_image(af_array *out, const char* filename, const bool isColor) else if(fi_bpc == 16) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } else if (fi_color == 1) { if(fi_bpc == 8) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 16) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } else { //3 channel image if(fi_bpc == 8) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 16) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image 
type", AF_ERR_NOT_SUPPORTED); break; + } } } else { //output gray irrespective if(fi_color == 1) { //4 channel image @@ -226,14 +246,24 @@ af_err af_load_image(af_array *out, const char* filename, const bool isColor) else if(fi_bpc == 16) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } else if (fi_color == 3 || fi_color == 4) { if(fi_bpc == 8) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 16) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } } @@ -298,6 +328,14 @@ af_err af_save_image(const char* filename, const af_array in_) AF_CHECK(af_mul(&in, in_, c255, false)); AF_CHECK(af_release_array(c255)); free_in = true; + } else if(max_real < 256) { + in = in_; + } else if (max_real < 65536) { + af_array c255 = 0; + AF_CHECK(af_constant(&c255, 257.0, info.ndims(), info.dims().get(), f32)); + AF_CHECK(af_div(&in, in_, c255, false)); + AF_CHECK(af_release_array(c255)); + free_in = true; } else { in = in_; } @@ -333,10 +371,10 @@ af_err af_save_image(const char* 
filename, const af_array in_) // Copy the array into FreeImage buffer for (uint y = 0; y < fi_h; ++y) { for (uint x = 0; x < fi_w; ++x) { - *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r - *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g - *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b - *(pDstLine + x * step + 3) = (uchar) pSrc3[indx]; // a + *(pDstLine + x * step + FI_RGBA_RED ) = (uchar) pSrc0[indx]; // r + *(pDstLine + x * step + FI_RGBA_GREEN) = (uchar) pSrc1[indx]; // g + *(pDstLine + x * step + FI_RGBA_BLUE ) = (uchar) pSrc2[indx]; // b + *(pDstLine + x * step + FI_RGBA_ALPHA) = (uchar) pSrc3[indx]; // a ++indx; } pDstLine -= nDstPitch; @@ -362,9 +400,9 @@ af_err af_save_image(const char* filename, const af_array in_) // Copy the array into FreeImage buffer for (uint y = 0; y < fi_h; ++y) { for (uint x = 0; x < fi_w; ++x) { - *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r - *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g - *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b + *(pDstLine + x * step + FI_RGBA_RED ) = (uchar) pSrc0[indx]; // r + *(pDstLine + x * step + FI_RGBA_GREEN) = (uchar) pSrc1[indx]; // g + *(pDstLine + x * step + FI_RGBA_BLUE ) = (uchar) pSrc2[indx]; // b ++indx; } pDstLine -= nDstPitch; @@ -602,10 +640,10 @@ af_err af_save_image_memory(void **ptr, const af_array in_, const af_image_forma // Copy the array into FreeImage buffer for (uint y = 0; y < fi_h; ++y) { for (uint x = 0; x < fi_w; ++x) { - *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b - *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g - *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r - *(pDstLine + x * step + 3) = (uchar) pSrc3[indx]; // a + *(pDstLine + x * step + FI_RGBA_RED ) = (uchar) pSrc0[indx]; // r + *(pDstLine + x * step + FI_RGBA_GREEN) = (uchar) pSrc1[indx]; // g + *(pDstLine + x * step + FI_RGBA_BLUE ) = (uchar) pSrc2[indx]; // b + *(pDstLine + x * step + FI_RGBA_ALPHA) = (uchar) pSrc3[indx]; // a ++indx; } 
pDstLine -= nDstPitch; @@ -631,9 +669,9 @@ af_err af_save_image_memory(void **ptr, const af_array in_, const af_image_forma // Copy the array into FreeImage buffer for (uint y = 0; y < fi_h; ++y) { for (uint x = 0; x < fi_w; ++x) { - *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b - *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g - *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r + *(pDstLine + x * step + FI_RGBA_RED ) = (uchar) pSrc0[indx]; // r + *(pDstLine + x * step + FI_RGBA_GREEN) = (uchar) pSrc1[indx]; // g + *(pDstLine + x * step + FI_RGBA_BLUE ) = (uchar) pSrc2[indx]; // b ++indx; } pDstLine -= nDstPitch; diff --git a/src/api/c/imageio2.cpp b/src/api/c/imageio2.cpp index d50afefb92..76c53f4ab4 100644 --- a/src/api/c/imageio2.cpp +++ b/src/api/c/imageio2.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -58,14 +59,15 @@ static af_err readImage_t(af_array *rImage, const uchar* pSrcLine, const int nSr pDst0[indx] = (T) *(src + (x * step + FI_RGBA_RED)); pDst1[indx] = (T) *(src + (x * step + FI_RGBA_GREEN)); pDst2[indx] = (T) *(src + (x * step + FI_RGBA_BLUE)); + if (fi_color == 4) pDst3[indx] = (T) *(src + (x * step + FI_RGBA_ALPHA)); } else { // Non 8-bit types do not use ordering // See Pixel Access Functions Chapter in FreeImage Doc pDst0[indx] = (T) *(src + (x * step + 0)); pDst1[indx] = (T) *(src + (x * step + 1)); pDst2[indx] = (T) *(src + (x * step + 2)); + if (fi_color == 4) pDst3[indx] = (T) *(src + (x * step + 3)); } - if (fi_color == 4) pDst3[indx] = (T) *(src + (x * step + FI_RGBA_ALPHA)); } indx++; } @@ -162,6 +164,9 @@ af_err af_load_image_native(af_array *out, const char* filename) AF_ERROR("FreeImage Error: Bits per channel not supported", AF_ERR_NOT_SUPPORTED); } + // data type + FREE_IMAGE_TYPE image_type = FreeImage_GetImageType(pBitmap); + // sizes uint fi_w = FreeImage_GetWidth(pBitmap); uint fi_h = FreeImage_GetHeight(pBitmap); @@ -178,21 +183,36 @@ af_err af_load_image_native(af_array 
*out, const char* filename) else if(fi_bpc == 16) AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } else if (fi_color == 1) { if(fi_bpc == 8) AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 16) AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } else { //3 channel imag if(fi_bpc == 8) AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 16) AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } std::swap(*out,rImage); @@ 
-236,21 +256,22 @@ static void save_t(T* pDstLine, const af_array in, const dim4 dims, uint nDstPit for (uint y = 0; y < fi_h; ++y) { for (uint x = 0; x < fi_w; ++x) { if(channels == 1) { - *(pDstLine + x * step + FI_RGBA_RED) = (T) pSrc0[indx]; // r -> 0 + *(pDstLine + x * step) = (T) pSrc0[indx]; // r -> 0 } else if(channels >=3) { if((af_dtype) af::dtype_traits::af_type == u8) { - *(pDstLine + x * step + FI_RGBA_BLUE) = (T) pSrc2[indx]; // b -> 0 + *(pDstLine + x * step + FI_RGBA_RED ) = (T) pSrc0[indx]; // r -> 0 *(pDstLine + x * step + FI_RGBA_GREEN) = (T) pSrc1[indx]; // g -> 1 - *(pDstLine + x * step + FI_RGBA_RED) = (T) pSrc0[indx]; // r -> 2 + *(pDstLine + x * step + FI_RGBA_BLUE ) = (T) pSrc2[indx]; // b -> 2 + if(channels >= 4) *(pDstLine + x * step + FI_RGBA_ALPHA) = (T) pSrc3[indx]; // a } else { // Non 8-bit types do not use ordering // See Pixel Access Functions Chapter in FreeImage Doc *(pDstLine + x * step + 0) = (T) pSrc0[indx]; // r -> 0 *(pDstLine + x * step + 1) = (T) pSrc1[indx]; // g -> 1 *(pDstLine + x * step + 2) = (T) pSrc2[indx]; // b -> 2 + if(channels >= 4) *(pDstLine + x * step + 3) = (T) pSrc3[indx]; // a } } - if(channels >= 4) *(pDstLine + x * step + FI_RGBA_ALPHA) = (T) pSrc3[indx]; // a ++indx; } pDstLine = (T*)(((uchar*)pDstLine) - nDstPitch); @@ -373,6 +394,12 @@ af_err af_save_image_native(const char* filename, const af_array in) return AF_SUCCESS; } +af_err af_is_image_io_available(bool *out) +{ + *out = true; + return AF_SUCCESS; +} + #else // WITH_FREEIMAGE #include #include @@ -386,4 +413,10 @@ af_err af_save_image_native(const char* filename, const af_array in) { AF_RETURN_ERROR("ArrayFire compiled without Image IO (FreeImage) support", AF_ERR_NOT_CONFIGURED); } + +af_err af_is_image_io_available(bool *out) +{ + *out = false; + return AF_SUCCESS; +} #endif // WITH_FREEIMAGE diff --git a/src/api/c/index.cpp b/src/api/c/index.cpp index b6eb8ab4cd..4a20ca2b34 100644 --- a/src/api/c/index.cpp +++ b/src/api/c/index.cpp @@ -42,7 
+42,7 @@ af_err af_index(af_array *result, const af_array in, const unsigned ndims, const try { ArrayInfo iInfo = getInfo(in); - if (ndims == 1 && ndims != (dim_t)iInfo.ndims()) { + if (ndims == 1 && ndims != iInfo.ndims()) { af_array tmp_in; AF_CHECK(af_flat(&tmp_in, in)); AF_CHECK(af_index(result, tmp_in, ndims, index)); @@ -67,10 +67,10 @@ af_err af_index(af_array *result, const af_array in, const unsigned ndims, const case u8: indexArray (out, in, ndims, index); break; default: TYPE_ERROR(1, in_type); } + swap(*result, out); } CATCHALL - swap(*result, out); return AF_SUCCESS; } @@ -127,11 +127,9 @@ af_err af_lookup(af_array *out, const af_array in, const af_array indices, const case u8: output = lookup(in, indices, dim); break; default : TYPE_ERROR(1, idxType); } + std::swap(*out, output); } CATCHALL; - - std::swap(*out, output); - return AF_SUCCESS; } @@ -232,3 +230,71 @@ af_err af_index_gen(af_array *out, const af_array in, const dim_t ndims, const a return AF_SUCCESS; } + +af_seq af_make_seq(double begin, double end, double step) +{ + af_seq seq = {begin, end, step}; + return seq; +} + +af_err af_create_indexers(af_index_t** indexers) +{ + try { + af_index_t* out = new af_index_t[4]; + std::swap(*indexers, out); + } + CATCHALL; + return AF_SUCCESS; +} + +af_err af_set_array_indexer(af_index_t* indexer, const af_array idx, const dim_t dim) +{ + try { + ARG_ASSERT(0, (indexer!=NULL)); + ARG_ASSERT(1, (idx!=NULL)); + ARG_ASSERT(2, (dim>=0 && dim<=3)); + indexer[dim].idx.arr = idx; + indexer[dim].isBatch = false; + indexer[dim].isSeq = false; + } + CATCHALL + return AF_SUCCESS; +} + +af_err af_set_seq_indexer(af_index_t* indexer, const af_seq* idx, const dim_t dim, const bool is_batch) +{ + try { + ARG_ASSERT(0, (indexer!=NULL)); + ARG_ASSERT(1, (idx!=NULL)); + ARG_ASSERT(2, (dim>=0 && dim<=3)); + indexer[dim].idx.seq = *idx; + indexer[dim].isBatch = is_batch; + indexer[dim].isSeq = true; + } + CATCHALL + return AF_SUCCESS; +} + +af_err 
af_set_seq_param_indexer(af_index_t* indexer, + const double begin, const double end, const double step, + const dim_t dim, const bool is_batch) +{ + try { + ARG_ASSERT(0, (indexer!=NULL)); + ARG_ASSERT(4, (dim>=0 && dim<=3)); + indexer[dim].idx.seq = af_make_seq(begin, end, step); + indexer[dim].isBatch = is_batch; + indexer[dim].isSeq = true; + } + CATCHALL + return AF_SUCCESS; +} + +af_err af_release_indexers(af_index_t* indexers) +{ + try { + delete[] indexers; + } + CATCHALL; + return AF_SUCCESS; +} diff --git a/src/api/c/internal.cpp b/src/api/c/internal.cpp new file mode 100644 index 0000000000..47c62c6478 --- /dev/null +++ b/src/api/c/internal.cpp @@ -0,0 +1,170 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "err_common.hpp" +#include + +using namespace detail; + +af_err af_create_strided_array(af_array *arr, + const void *data, + const dim_t offset, + const unsigned ndims, + const dim_t *const dims_, + const dim_t *const strides_, + const af_dtype ty, + const af_source location) +{ + try { + + ARG_ASSERT(2, offset >= 0); + ARG_ASSERT(3, ndims >=1 && ndims <= 4); + ARG_ASSERT(4, dims_ != NULL); + ARG_ASSERT(5, strides_ != NULL); + ARG_ASSERT(5, strides_[0] == 1); + + for (int i = 1; i < (int)ndims; i++) { + ARG_ASSERT(5, strides_[i] > 0); + } + + dim4 dims(ndims, dims_); + dim4 strides(ndims, strides_); + + for (int i = ndims; i < 4; i++) { + strides[i] = strides[i - 1] * dims[i - 1]; + } + + bool isdev = location == afDevice; + + af_array res; + AF_CHECK(af_init()); + + switch (ty) { + case f32: res = getHandle(Array(dims, strides, offset, (float *)data, isdev)); break; + 
case f64: res = getHandle(Array(dims, strides, offset, (double *)data, isdev)); break; + case c32: res = getHandle(Array(dims, strides, offset, (cfloat *)data, isdev)); break; + case c64: res = getHandle(Array(dims, strides, offset, (cdouble *)data, isdev)); break; + case u32: res = getHandle(Array(dims, strides, offset, (uint *)data, isdev)); break; + case s32: res = getHandle(Array(dims, strides, offset, (int *)data, isdev)); break; + case u64: res = getHandle(Array(dims, strides, offset, (uintl *)data, isdev)); break; + case s64: res = getHandle(Array(dims, strides, offset, (intl *)data, isdev)); break; + case u16: res = getHandle(Array(dims, strides, offset, (ushort *)data, isdev)); break; + case s16: res = getHandle(Array(dims, strides, offset, (short *)data, isdev)); break; + case b8 : res = getHandle(Array(dims, strides, offset, (char *)data, isdev)); break; + case u8 : res = getHandle(Array(dims, strides, offset, (uchar *)data, isdev)); break; + default: TYPE_ERROR(6, ty); + } + + std::swap(*arr, res); + } + CATCHALL; + return AF_SUCCESS; +} + +af_err af_get_strides(dim_t *s0, dim_t *s1, dim_t *s2, dim_t *s3, const af_array in) +{ + try { + ArrayInfo info = getInfo(in); + *s0 = info.strides()[0]; + *s1 = info.strides()[1]; + *s2 = info.strides()[2]; + *s3 = info.strides()[3]; + } + CATCHALL + return AF_SUCCESS; +} + +af_err af_get_offset(dim_t *offset, const af_array arr) +{ + try { + + dim_t res = getInfo(arr).getOffset(); + std::swap(*offset, res); + } + CATCHALL; + return AF_SUCCESS; + +} + +af_err af_get_raw_ptr(void **ptr, const af_array arr) +{ + try { + + void *res = NULL; + + af_dtype ty = getInfo(arr).getType(); + + switch (ty) { + case f32: res = (void *)getRawPtr(getArray(arr)); break; + case f64: res = (void *)getRawPtr(getArray(arr)); break; + case c32: res = (void *)getRawPtr(getArray(arr)); break; + case c64: res = (void *)getRawPtr(getArray(arr)); break; + case u32: res = (void *)getRawPtr(getArray(arr)); break; + case s32: res = (void 
*)getRawPtr(getArray(arr)); break; + case u64: res = (void *)getRawPtr(getArray(arr)); break; + case s64: res = (void *)getRawPtr(getArray(arr)); break; + case u16: res = (void *)getRawPtr(getArray(arr)); break; + case s16: res = (void *)getRawPtr(getArray(arr)); break; + case b8 : res = (void *)getRawPtr(getArray(arr)); break; + case u8 : res = (void *)getRawPtr(getArray(arr)); break; + default: TYPE_ERROR(6, ty); + } + + std::swap(*ptr, res); + } + CATCHALL; + return AF_SUCCESS; +} + +af_err af_is_linear(bool *result, const af_array arr) +{ + try { + *result = getInfo(arr).isLinear(); + } + CATCHALL + return AF_SUCCESS; +} + +af_err af_is_owner(bool *result, const af_array arr) +{ + try { + + bool res = false; + + af_dtype ty = getInfo(arr).getType(); + + switch (ty) { + case f32: res = (void *)getArray(arr).isOwner(); break; + case f64: res = (void *)getArray(arr).isOwner(); break; + case c32: res = (void *)getArray(arr).isOwner(); break; + case c64: res = (void *)getArray(arr).isOwner(); break; + case u32: res = (void *)getArray(arr).isOwner(); break; + case s32: res = (void *)getArray(arr).isOwner(); break; + case u64: res = (void *)getArray(arr).isOwner(); break; + case s64: res = (void *)getArray(arr).isOwner(); break; + case u16: res = (void *)getArray(arr).isOwner(); break; + case s16: res = (void *)getArray(arr).isOwner(); break; + case b8 : res = (void *)getArray(arr).isOwner(); break; + case u8 : res = (void *)getArray(arr).isOwner(); break; + default: TYPE_ERROR(6, ty); + } + + std::swap(*result, res); + } + CATCHALL; + return AF_SUCCESS; +} diff --git a/src/api/c/lu.cpp b/src/api/c/lu.cpp index c6004bc6cf..1d98e02490 100644 --- a/src/api/c/lu.cpp +++ b/src/api/c/lu.cpp @@ -95,3 +95,13 @@ af_err af_lu_inplace(af_array *pivot, af_array in, const bool is_lapack_piv) return AF_SUCCESS; } + +af_err af_is_lapack_available(bool *out) +{ + try { + *out = isLAPACKAvailable(); + } + CATCHALL; + + return AF_SUCCESS; +} diff --git a/src/api/c/median.cpp 
b/src/api/c/median.cpp index 716df78028..50bcad25ee 100644 --- a/src/api/c/median.cpp +++ b/src/api/c/median.cpp @@ -37,12 +37,18 @@ static double median(const af_array& in) Array sortedArr = sort(input, 0); + af_array sarrHandle = getHandle(sortedArr); + double result; T resPtr[2]; af_array res = 0; - AF_CHECK(af_index(&res, getHandle(sortedArr), 1, mdSpan)); + AF_CHECK(af_index(&res, sarrHandle, 1, mdSpan)); AF_CHECK(af_get_data_ptr((void*)&resPtr, res)); + AF_CHECK(af_release_array(res)); + AF_CHECK(af_release_array(sarrHandle)); + AF_CHECK(af_release_array(temp)); + if (nElems % 2 == 1) { result = resPtr[0]; } else { @@ -53,9 +59,6 @@ static double median(const af_array& in) } } - AF_CHECK(af_release_array(res)); - AF_CHECK(af_release_array(temp)); - return result; } diff --git a/src/api/c/memory.cpp b/src/api/c/memory.cpp new file mode 100644 index 0000000000..098665ba03 --- /dev/null +++ b/src/api/c/memory.cpp @@ -0,0 +1,263 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "err_common.hpp" +#include + +using namespace detail; + +af_err af_device_array(af_array *arr, const void *data, + const unsigned ndims, + const dim_t * const dims, + const af_dtype type) +{ + try { + AF_CHECK(af_init()); + + af_array res; + + DIM_ASSERT(1, ndims >= 1); + dim4 d(1, 1, 1, 1); + for(unsigned i = 0; i < ndims; i++) { + d[i] = dims[i]; + DIM_ASSERT(3, dims[i] >= 1); + } + + switch (type) { + case f32: res = getHandle(createDeviceDataArray(d, data)); break; + case f64: res = getHandle(createDeviceDataArray(d, data)); break; + case c32: res = getHandle(createDeviceDataArray(d, data)); break; + case c64: res = getHandle(createDeviceDataArray(d, data)); break; + case s32: res = getHandle(createDeviceDataArray(d, data)); break; + case u32: res = getHandle(createDeviceDataArray(d, data)); break; + case s64: res = getHandle(createDeviceDataArray(d, data)); break; + case u64: res = getHandle(createDeviceDataArray(d, data)); break; + case s16: res = getHandle(createDeviceDataArray(d, data)); break; + case u16: res = getHandle(createDeviceDataArray(d, data)); break; + case u8 : res = getHandle(createDeviceDataArray(d, data)); break; + case b8 : res = getHandle(createDeviceDataArray(d, data)); break; + default: TYPE_ERROR(4, type); + } + + std::swap(*arr, res); + } CATCHALL; + + return AF_SUCCESS; +} + +af_err af_get_device_ptr(void **data, const af_array arr) +{ + try { + af_dtype type = getInfo(arr).getType(); + + switch (type) { + //FIXME: Perform copy if memory not continuous + case f32: *data = getDevicePtr(getArray(arr)); break; + case f64: *data = getDevicePtr(getArray(arr)); break; + case c32: *data = getDevicePtr(getArray(arr)); break; + case c64: *data = getDevicePtr(getArray(arr)); break; + 
case s32: *data = getDevicePtr(getArray(arr)); break; + case u32: *data = getDevicePtr(getArray(arr)); break; + case s64: *data = getDevicePtr(getArray(arr)); break; + case u64: *data = getDevicePtr(getArray(arr)); break; + case s16: *data = getDevicePtr(getArray(arr)); break; + case u16: *data = getDevicePtr(getArray(arr)); break; + case u8 : *data = getDevicePtr(getArray(arr)); break; + case b8 : *data = getDevicePtr(getArray(arr)); break; + + default: TYPE_ERROR(4, type); + } + + } CATCHALL; + + return AF_SUCCESS; +} + +template +inline void lockArray(const af_array arr) +{ + memLock((void *)getArray(arr).get()); +} + +af_err af_lock_device_ptr(const af_array arr) +{ + return af_lock_array(arr); +} + +af_err af_lock_array(const af_array arr) +{ + try { + af_dtype type = getInfo(arr).getType(); + + switch (type) { + case f32: lockArray(arr); break; + case f64: lockArray(arr); break; + case c32: lockArray(arr); break; + case c64: lockArray(arr); break; + case s32: lockArray(arr); break; + case u32: lockArray(arr); break; + case s64: lockArray(arr); break; + case u64: lockArray(arr); break; + case s16: lockArray(arr); break; + case u16: lockArray(arr); break; + case u8 : lockArray(arr); break; + case b8 : lockArray(arr); break; + default: TYPE_ERROR(4, type); + } + + } CATCHALL; + + return AF_SUCCESS; +} + +template +inline void unlockArray(const af_array arr) +{ + memUnlock((void *)getArray(arr).get()); +} + +af_err af_unlock_device_ptr(const af_array arr) +{ + return af_unlock_array(arr); +} + +af_err af_unlock_array(const af_array arr) +{ + try { + af_dtype type = getInfo(arr).getType(); + + switch (type) { + case f32: unlockArray(arr); break; + case f64: unlockArray(arr); break; + case c32: unlockArray(arr); break; + case c64: unlockArray(arr); break; + case s32: unlockArray(arr); break; + case u32: unlockArray(arr); break; + case s64: unlockArray(arr); break; + case u64: unlockArray(arr); break; + case s16: unlockArray(arr); break; + case u16: 
unlockArray(arr); break; + case u8 : unlockArray(arr); break; + case b8 : unlockArray(arr); break; + default: TYPE_ERROR(4, type); + } + + } CATCHALL; + + return AF_SUCCESS; +} + + +af_err af_alloc_device(void **ptr, const dim_t bytes) +{ + try { + AF_CHECK(af_init()); + *ptr = memAllocUser(bytes); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_alloc_pinned(void **ptr, const dim_t bytes) +{ + try { + AF_CHECK(af_init()); + *ptr = (void *)pinnedAlloc(bytes); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_free_device(void *ptr) +{ + try { + memFreeUser(ptr); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_free_pinned(void *ptr) +{ + try { + pinnedFree((char *)ptr); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_alloc_host(void **ptr, const dim_t bytes) +{ + try { + *ptr = malloc(bytes); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_free_host(void *ptr) +{ + try { + free(ptr); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_print_mem_info(const char *msg, const int device_id) +{ + try { + int device = device_id; + if(device == -1) { + device = getActiveDeviceId(); + } + + if(msg != NULL) ARG_ASSERT(0, strlen(msg) < 256); // 256 character limit on msg + ARG_ASSERT(1, device >= 0 && device < getDeviceCount()); + + printMemInfo(msg ? 
msg : "", device); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_device_gc() +{ + try { + garbageCollect(); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers, + size_t *lock_bytes, size_t *lock_buffers) +{ + try { + deviceMemoryInfo(alloc_bytes, alloc_buffers, lock_bytes, lock_buffers); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_set_mem_step_size(const size_t step_bytes) +{ + try{ + detail::setMemStepSize(step_bytes); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_get_mem_step_size(size_t *step_bytes) +{ + try { + *step_bytes = detail::getMemStepSize(); + } CATCHALL; + return AF_SUCCESS; +} diff --git a/src/api/c/moddims.cpp b/src/api/c/moddims.cpp index bb156ffc2c..b8f1fafa6c 100644 --- a/src/api/c/moddims.cpp +++ b/src/api/c/moddims.cpp @@ -23,7 +23,7 @@ template Array modDims(const Array& in, const af::dim4 &newDims) { //FIXME: Figure out a better way - evalArray(in); + in.eval(); Array Out = in; @@ -32,10 +32,24 @@ Array modDims(const Array& in, const af::dim4 &newDims) } Out.modDims(newDims); + Out.setDataDims(newDims); return Out; } +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); + af_err af_moddims(af_array *out, const af_array in, const 
unsigned ndims, const dim_t * const dims) { diff --git a/src/api/c/plot.cpp b/src/api/c/plot.cpp index 26b58a8b08..a812947228 100644 --- a/src/api/c/plot.cpp +++ b/src/api/c/plot.cpp @@ -27,7 +27,7 @@ using namespace detail; using namespace graphics; template -fg::Plot* setup_plot(const af_array X, const af_array Y) +fg::Plot* setup_plot(const af_array X, const af_array Y, fg::PlotType type, fg::MarkerType marker) { Array xIn = getArray(X); Array yIn = getArray(Y); @@ -39,14 +39,19 @@ fg::Plot* setup_plot(const af_array X, const af_array Y) dim4 rdims(1, 0, 2, 3); - Array Z = join(1, xIn, yIn); - Array P = reorder(Z, rdims); + dim_t elements = xIn.elements(); + dim4 rowDims = dim4(1, elements, 1, 1); - ArrayInfo Xinfo = getInfo(X); - af::dim4 X_dims = Xinfo.dims(); + // Force the vectors to be row vectors + // This ensures we can use join(0,..) and skip reorder + xIn.modDims(rowDims); + yIn.modDims(rowDims); + + // join along first dimension, skip reorder + Array P = join(0, xIn, yIn); ForgeManager& fgMngr = ForgeManager::getInstance(); - fg::Plot* plot = fgMngr.getPlot(X_dims.elements(), getGLType()); + fg::Plot* plot = fgMngr.getPlot(elements, getGLType(), type, marker); plot->setColor(1.0, 0.0, 0.0); plot->setAxesLimits(xmax, xmin, ymax, ymin); plot->setAxesTitles("X Axis", "Y Axis"); @@ -55,11 +60,9 @@ fg::Plot* setup_plot(const af_array X, const af_array Y) return plot; } -#endif -af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, const af_cell* const props) +af_err plotWrapper(const af_window wind, const af_array X, const af_array Y, const af_cell* const props, fg::PlotType type=fg::FG_LINE, fg::MarkerType marker=fg::FG_NONE) { -#if defined(WITH_GRAPHICS) if(wind==0) { std::cerr<<"Not a valid window"<(X, Y); break; - case s32: plot = setup_plot(X, Y); break; - case u32: plot = setup_plot(X, Y); break; - case s16: plot = setup_plot(X, Y); break; - case u16: plot = setup_plot(X, Y); break; - case u8 : plot = setup_plot(X, Y); break; + 
case f32: plot = setup_plot(X, Y, type, marker); break; + case s32: plot = setup_plot(X, Y, type, marker); break; + case u32: plot = setup_plot(X, Y, type, marker); break; + case s16: plot = setup_plot(X, Y, type, marker); break; + case u16: plot = setup_plot(X, Y, type, marker); break; + case u8 : plot = setup_plot(X, Y, type, marker); break; default: TYPE_ERROR(1, Xtype); } @@ -101,6 +104,24 @@ af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, co } CATCHALL; return AF_SUCCESS; +} + +#endif // WITH_GRAPHICS + +af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, const af_cell* const props) +{ +#if defined(WITH_GRAPHICS) + return plotWrapper(wind, X, Y, props); +#else + return AF_ERR_NO_GFX; +#endif +} + +af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_marker_type af_marker, const af_cell* const props) +{ +#if defined(WITH_GRAPHICS) + fg::MarkerType fg_marker = getFGMarker(af_marker); + return plotWrapper(wind, X, Y, props, fg::FG_SCATTER, fg_marker); #else AF_RETURN_ERROR("ArrayFire compiled without graphics support", AF_ERR_NO_GFX); #endif diff --git a/src/api/c/plot3.cpp b/src/api/c/plot3.cpp index 1ef30e657e..2e18251b45 100644 --- a/src/api/c/plot3.cpp +++ b/src/api/c/plot3.cpp @@ -30,7 +30,7 @@ using namespace detail; using namespace graphics; template -fg::Plot3* setup_plot3(const af_array P) +fg::Plot3* setup_plot3(const af_array P, fg::PlotType ptype, fg::MarkerType mtype) { Array pIn = getArray(P); ArrayInfo Pinfo = getInfo(P); @@ -46,37 +46,27 @@ fg::Plot3* setup_plot3(const af_array P) P_dims = pIn.dims(); } - T max[3], min[3]; - if(P_dims[0] == 3) { - af_get_data_ptr(max, getHandle(reduce(pIn, 1))); - af_get_data_ptr(min, getHandle(reduce(pIn, 1))); + if(P_dims[1] == 3){ + pIn = transpose(pIn, false); } - if(P_dims[1] == 3) { - af_get_data_ptr(max, getHandle(reduce(pIn, 0))); - af_get_data_ptr(min, getHandle(reduce(pIn, 0))); - } + T max[3], min[3]; + 
copyData(max, reduce(pIn, 1)); + copyData(min, reduce(pIn, 1)); ForgeManager& fgMngr = ForgeManager::getInstance(); - fg::Plot3* plot3 = fgMngr.getPlot3(P_dims.elements()/3, getGLType()); + fg::Plot3* plot3 = fgMngr.getPlot3(P_dims.elements()/3, getGLType(), ptype, mtype); plot3->setColor(1.0, 0.0, 0.0); plot3->setAxesLimits(max[0], min[0], max[1], min[1], max[2], min[2]); plot3->setAxesTitles("X Axis", "Y Axis", "Z Axis"); - - if(P_dims[1] == 3){ - pIn = transpose(pIn, false); - } copy_plot3(pIn, plot3); - return plot3; } -#endif -af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* const props) +af_err plot3Wrapper(const af_window wind, const af_array P, const af_cell* const props, const fg::PlotType type=fg::FG_LINE, const fg::MarkerType marker=fg::FG_NONE) { -#if defined(WITH_GRAPHICS) if(wind==0) { std::cerr<<"Not a valid window"<(P); break; - case s32: plot3 = setup_plot3(P); break; - case u32: plot3 = setup_plot3(P); break; - case s16: plot3 = setup_plot3(P); break; - case u16: plot3 = setup_plot3(P); break; - case u8 : plot3 = setup_plot3(P); break; + case f32: plot3 = setup_plot3(P, type, marker); break; + case s32: plot3 = setup_plot3(P, type, marker); break; + case u32: plot3 = setup_plot3(P, type, marker); break; + case s16: plot3 = setup_plot3(P, type, marker); break; + case u16: plot3 = setup_plot3(P, type, marker); break; + case u8 : plot3 = setup_plot3(P, type, marker); break; default: TYPE_ERROR(1, Ptype); } @@ -107,6 +97,24 @@ af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* cons } CATCHALL; return AF_SUCCESS; +} + +#endif // WITH_GRAPHICS + +af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* const props) +{ +#if defined(WITH_GRAPHICS) + return plot3Wrapper(wind, P, props); +#else + return AF_ERR_NO_GFX; +#endif +} + +af_err af_draw_scatter3(const af_window wind, const af_array P, const af_marker_type af_marker, const af_cell* const props) +{ +#if defined(WITH_GRAPHICS) + 
fg::MarkerType fg_marker = getFGMarker(af_marker); + return plot3Wrapper(wind, P, props, fg::FG_SCATTER, fg_marker); #else AF_RETURN_ERROR("ArrayFire compiled without graphics support", AF_ERR_NO_GFX); #endif diff --git a/src/api/c/print.cpp b/src/api/c/print.cpp index d2d9921654..66133503ef 100644 --- a/src/api/c/print.cpp +++ b/src/api/c/print.cpp @@ -14,12 +14,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include @@ -69,7 +71,7 @@ static void print(const char *exp, af_array arr, const int precision, std::ostre os << "[" << info.dims() << "]\n"; #ifndef NDEBUG - os <<" Offsets: [" << info.offsets() << "]" << std::endl; + os <<" Offset: " << info.getOffset() << std::endl; os <<" Strides: [" << info.strides() << "]" << std::endl; #endif @@ -180,8 +182,8 @@ af_err af_array_to_string(char **output, const char *exp, const af_array arr, default: TYPE_ERROR(1, type); } std::string str = ss.str(); - *output = new char[str.size() + 1]; - std::copy(str.begin(), str.end(), *output); + af_alloc_host((void**)output, sizeof(char) * (str.size() + 1)); + str.copy(*output, str.size()); (*output)[str.size()] = '\0'; // don't forget the terminating 0 } CATCHALL; diff --git a/src/api/c/set.cpp b/src/api/c/set.cpp index 1643fad95b..db9b5782e5 100644 --- a/src/api/c/set.cpp +++ b/src/api/c/set.cpp @@ -28,7 +28,9 @@ af_err af_set_unique(af_array *out, const af_array in, const bool is_sorted) { try { - af_dtype type = getInfo(in).getType(); + ArrayInfo in_info = getInfo(in); + ARG_ASSERT(1, in_info.isVector()); + af_dtype type = in_info.getType(); af_array res; switch(type) { @@ -62,8 +64,14 @@ af_err af_set_union(af_array *out, const af_array first, const af_array second, { try { - af_dtype first_type = getInfo(first).getType(); - af_dtype second_type = getInfo(second).getType(); + ArrayInfo first_info = getInfo(first); + ArrayInfo second_info = getInfo(second); + + ARG_ASSERT(1, first_info.isVector()); + ARG_ASSERT(1, 
second_info.isVector()); + + af_dtype first_type = first_info.getType(); + af_dtype second_type = second_info.getType(); ARG_ASSERT(1, first_type == second_type); @@ -98,8 +106,14 @@ af_err af_set_intersect(af_array *out, const af_array first, const af_array seco { try { - af_dtype first_type = getInfo(first).getType(); - af_dtype second_type = getInfo(second).getType(); + ArrayInfo first_info = getInfo(first); + ArrayInfo second_info = getInfo(second); + + ARG_ASSERT(1, first_info.isVector()); + ARG_ASSERT(1, second_info.isVector()); + + af_dtype first_type = first_info.getType(); + af_dtype second_type = second_info.getType(); ARG_ASSERT(1, first_type == second_type); diff --git a/src/api/c/sift.cpp b/src/api/c/sift.cpp index c7a38582aa..a14badc88d 100644 --- a/src/api/c/sift.cpp +++ b/src/api/c/sift.cpp @@ -54,7 +54,7 @@ af_err af_sift(af_features* feat, af_array* desc, const af_array in, const unsig const bool double_input, const float img_scale, const float feature_ratio) { try { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT ArrayInfo info = getInfo(in); af::dim4 dims = info.dims(); @@ -95,7 +95,7 @@ af_err af_gloh(af_features* feat, af_array* desc, const af_array in, const unsig const bool double_input, const float img_scale, const float feature_ratio) { try { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT ArrayInfo info = getInfo(in); af::dim4 dims = info.dims(); diff --git a/src/api/c/stream.cpp b/src/api/c/stream.cpp index a7b5771ee0..17cc945520 100644 --- a/src/api/c/stream.cpp +++ b/src/api/c/stream.cpp @@ -249,12 +249,17 @@ static af_array checkVersionAndRead(const char *filename, const unsigned index) { char version = 0; - std::fstream fs(filename, std::fstream::in | std::fstream::binary); + std::string filenameStr = std::string(filename); + std::fstream fs(filenameStr, std::fstream::in | std::fstream::binary); // Throw exception if file is not open - if(!fs.is_open()) AF_ERROR("File failed to open", AF_ERR_ARG); + if(!fs.is_open()) { + 
std::string errStr = "Failed to open: " + filenameStr; + AF_ERROR(errStr.c_str(), AF_ERR_ARG); + } if(fs.peek() == std::fstream::traits_type::eof()) { - AF_ERROR("File is empty", AF_ERR_ARG); + std::string errStr = filenameStr + " is empty"; + AF_ERROR(errStr.c_str(), AF_ERR_ARG); } else { fs.read(&version, sizeof(char)); } @@ -270,13 +275,18 @@ int checkVersionAndFindIndex(const char *filename, const char *k) { char version = 0; std::string key(k); + std::string filenameStr(filename); + std::ifstream fs(filenameStr, std::ifstream::in | std::ifstream::binary); - std::ifstream fs(filename, std::ifstream::in | std::ifstream::binary); // Throw exception if file is not open - if(!fs.is_open()) AF_ERROR("File failed to open", AF_ERR_ARG); + if(!fs.is_open()) { + std::string errStr = "Failed to open: " + filenameStr; + AF_ERROR(errStr.c_str(), AF_ERR_ARG); + } if(fs.peek() == std::ifstream::traits_type::eof()) { - AF_ERROR("File is empty", AF_ERR_ARG); + std::string errStr = filenameStr + " is empty"; + AF_ERROR(errStr.c_str(), AF_ERR_ARG); } else { fs.read(&version, sizeof(char)); } diff --git a/src/api/c/surface.cpp b/src/api/c/surface.cpp index 7db8441163..2394f5f96c 100644 --- a/src/api/c/surface.cpp +++ b/src/api/c/surface.cpp @@ -49,21 +49,29 @@ fg::Surface* setup_surface(const af_array xVals, const af_array yVals, const af_ af::dim4 Y_dims = Yinfo.dims(); af::dim4 Z_dims = Zinfo.dims(); - dim4 rdims(1, 0, 2, 3); - dim4 x_tdims(1, Y_dims[0], 1, 1); - dim4 y_tdims(1, X_dims[0], 1, 1); if(Xinfo.isVector()){ + // Convert xIn is a column vector + xIn.modDims(xIn.elements()); + // Now tile along second dimension + dim4 x_tdims(1, Y_dims[0], 1, 1); xIn = tile(xIn, x_tdims); + + // Convert yIn to a row vector + yIn.modDims(af::dim4(1, yIn.elements())); + // Now tile along first dimension + dim4 y_tdims(X_dims[0], 1, 1, 1); yIn = tile(yIn, y_tdims); - yIn = reorder(yIn, rdims); } - xIn.modDims(xIn.elements()); - yIn.modDims(yIn.elements()); - zIn.modDims(zIn.elements()); - 
Array Z = join(1, join(1, xIn, yIn), zIn); - Z = reorder(Z, rdims); - Z.modDims(Z.elements()); + // Flatten xIn, yIn and zIn into row vectors + dim4 rowDims = dim4(1, zIn.elements()); + xIn.modDims(rowDims); + yIn.modDims(rowDims); + zIn.modDims(rowDims); + + // Now join along first dimension, skip reorder + std::vector > inputs{xIn, yIn, zIn}; + Array Z = join(0, inputs); ForgeManager& fgMngr = ForgeManager::getInstance(); fg::Surface* surface = fgMngr.getSurface(Z_dims[0], Z_dims[1], getGLType()); diff --git a/src/api/c/transform.cpp b/src/api/c/transform.cpp index bacb008c78..785a05438e 100644 --- a/src/api/c/transform.cpp +++ b/src/api/c/transform.cpp @@ -20,9 +20,9 @@ using namespace detail; template static inline af_array transform(const af_array in, const af_array tf, const af::dim4 &odims, - const af_interp_type method, const bool inverse) + const af_interp_type method, const bool inverse, const bool perspective) { - return getHandle(transform(getArray(in), getArray(tf), odims, method, inverse)); + return getHandle(transform(getArray(in), getArray(tf), odims, method, inverse, perspective)); } af_err af_transform(af_array *out, const af_array in, const af_array tf, @@ -41,10 +41,12 @@ af_err af_transform(af_array *out, const af_array in, const af_array tf, ARG_ASSERT(5, method == AF_INTERP_NEAREST || method == AF_INTERP_BILINEAR || method == AF_INTERP_LOWER); - DIM_ASSERT(2, (tdims[0] == 3 && tdims[1] == 2)); + DIM_ASSERT(2, (tdims[0] == 3 && (tdims[1] == 2 || tdims[1] == 3))); DIM_ASSERT(1, idims.elements() > 0); DIM_ASSERT(1, (idims.ndims() == 2 || idims.ndims() == 3)); + const bool perspective = (tdims[1] == 3); + dim_t o0 = odim0, o1 = odim1; dim_t o2 = idims[2] * tdims[2]; if (odim0 * odim1 == 0) { @@ -55,18 +57,18 @@ af_err af_transform(af_array *out, const af_array in, const af_array tf, af_array output = 0; switch(itype) { - case f32: output = transform(in, tf, odims, method, inverse); break; - case f64: output = transform(in, tf, odims, method, 
inverse); break; - case c32: output = transform(in, tf, odims, method, inverse); break; - case c64: output = transform(in, tf, odims, method, inverse); break; - case s32: output = transform(in, tf, odims, method, inverse); break; - case u32: output = transform(in, tf, odims, method, inverse); break; - case s64: output = transform(in, tf, odims, method, inverse); break; - case u64: output = transform(in, tf, odims, method, inverse); break; - case s16: output = transform(in, tf, odims, method, inverse); break; - case u16: output = transform(in, tf, odims, method, inverse); break; - case u8: output = transform(in, tf, odims, method, inverse); break; - case b8: output = transform(in, tf, odims, method, inverse); break; + case f32: output = transform(in, tf, odims, method, inverse, perspective); break; + case f64: output = transform(in, tf, odims, method, inverse, perspective); break; + case c32: output = transform(in, tf, odims, method, inverse, perspective); break; + case c64: output = transform(in, tf, odims, method, inverse, perspective); break; + case s32: output = transform(in, tf, odims, method, inverse, perspective); break; + case u32: output = transform(in, tf, odims, method, inverse, perspective); break; + case s64: output = transform(in, tf, odims, method, inverse, perspective); break; + case u64: output = transform(in, tf, odims, method, inverse, perspective); break; + case s16: output = transform(in, tf, odims, method, inverse, perspective); break; + case u16: output = transform(in, tf, odims, method, inverse, perspective); break; + case u8: output = transform(in, tf, odims, method, inverse, perspective); break; + case b8: output = transform(in, tf, odims, method, inverse, perspective); break; default: TYPE_ERROR(1, itype); } std::swap(*out,output); diff --git a/src/api/c/transform_coordinates.cpp b/src/api/c/transform_coordinates.cpp new file mode 100644 index 0000000000..79b448db5d --- /dev/null +++ b/src/api/c/transform_coordinates.cpp @@ -0,0 +1,96 @@ 
+/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using af::dim4; +using namespace detail; + +template +static af_array transform_coordinates(const af_array& tf, const float d0, const float d1) +{ + dim_t in_dims[2] = { 4, 3 }; + T h_in[4*3] = { (T)0, (T)0, (T)d1, (T)d1, + (T)0, (T)d0, (T)d0, (T)0, + (T)1, (T)1, (T)1, (T)1 }; + + af_array in = 0; + af_array w = 0; + af_array tmp = 0; + af_array xt = 0; + af_array yt = 0; + af_array t = 0; + + AF_CHECK(af_create_array(&in, h_in, 2, in_dims, (af_dtype) af::dtype_traits::af_type)); + + af_array tfIdx = 0; + af_index_t tfIndexs[2]; + tfIndexs[0].isSeq = true; + tfIndexs[1].isSeq = true; + tfIndexs[0].idx.seq = af_make_seq(0, 2, 1); + tfIndexs[1].idx.seq = af_make_seq(2, 2, 1); + AF_CHECK(af_index_gen(&tfIdx, tf, 2, tfIndexs)); + + AF_CHECK(af_matmul(&tmp, in, tfIdx, AF_MAT_NONE, AF_MAT_NONE)); + T h_w[4] = { 1, 1, 1, 1 }; + dim_t w_dims = 4; + AF_CHECK(af_create_array(&w, h_w, 1, &w_dims, (af_dtype) af::dtype_traits::af_type)); + AF_CHECK(af_div(&w, w, tmp, false)); + + tfIndexs[1].idx.seq = af_make_seq(0, 0, 1); + AF_CHECK(af_index_gen(&tfIdx, tf, 2, tfIndexs)); + AF_CHECK(af_matmul(&tmp, in, tfIdx, AF_MAT_NONE, AF_MAT_NONE)); + AF_CHECK(af_mul(&xt, tmp, w, false)); + + tfIndexs[1].idx.seq = af_make_seq(1, 1, 1); + AF_CHECK(af_index_gen(&tfIdx, tf, 2, tfIndexs)); + AF_CHECK(af_matmul(&tmp, in, tfIdx, AF_MAT_NONE, AF_MAT_NONE)); + AF_CHECK(af_mul(&yt, tmp, w, false)); + + AF_CHECK(af_join(&t, 1, xt, yt)); + + AF_CHECK(af_release_array(w)); + AF_CHECK(af_release_array(tmp)); + AF_CHECK(af_release_array(xt)); + 
AF_CHECK(af_release_array(yt)); + + return t; +} + +af_err af_transform_coordinates(af_array *out, const af_array tf, const float d0, const float d1) +{ + try { + ArrayInfo tfInfo = getInfo(tf); + dim4 tfDims = tfInfo.dims(); + ARG_ASSERT(1, (tfDims[0]==3 && tfDims[1]==3 && tfDims.ndims()==2)); + + af_array output; + af_dtype type = tfInfo.getType(); + switch(type) { + case f32: output = transform_coordinates(tf, d0, d1); break; + case f64: output = transform_coordinates(tf, d0, d1); break; + default : TYPE_ERROR(1, type); + } + std::swap(*out, output); + } + CATCHALL; + + return AF_SUCCESS; +} diff --git a/src/api/c/util.cpp b/src/api/c/util.cpp deleted file mode 100644 index cc9a07ac4f..0000000000 --- a/src/api/c/util.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include -// The following should be included using double quotes -// to enable it's use in unified wrapper -#include "err_common.hpp" - -af_seq af_make_seq(double begin, double end, double step) -{ - af_seq seq = {begin, end, step}; - return seq; -} - -af_err af_create_indexers(af_index_t** indexers) -{ - try { - af_index_t* out = new af_index_t[4]; - std::swap(*indexers, out); - } - CATCHALL; - return AF_SUCCESS; -} - -af_err af_set_array_indexer(af_index_t* indexer, const af_array idx, const dim_t dim) -{ - ARG_ASSERT(0, (indexer!=NULL)); - ARG_ASSERT(1, (idx!=NULL)); - ARG_ASSERT(2, (dim>=0 && dim<=3)); - try { - indexer[dim].idx.arr = idx; - indexer[dim].isBatch = false; - indexer[dim].isSeq = false; - } - CATCHALL - return AF_SUCCESS; -} - -af_err af_set_seq_indexer(af_index_t* indexer, const af_seq* idx, const dim_t dim, const bool is_batch) -{ - ARG_ASSERT(0, 
(indexer!=NULL)); - ARG_ASSERT(1, (idx!=NULL)); - ARG_ASSERT(2, (dim>=0 && dim<=3)); - try { - indexer[dim].idx.seq = *idx; - indexer[dim].isBatch = is_batch; - indexer[dim].isSeq = true; - } - CATCHALL - return AF_SUCCESS; -} - -af_err af_set_seq_param_indexer(af_index_t* indexer, - const double begin, const double end, const double step, - const dim_t dim, const bool is_batch) -{ - ARG_ASSERT(0, (indexer!=NULL)); - ARG_ASSERT(4, (dim>=0 && dim<=3)); - try { - indexer[dim].idx.seq = af_make_seq(begin, end, step); - indexer[dim].isBatch = is_batch; - indexer[dim].isSeq = true; - } - CATCHALL - return AF_SUCCESS; -} - -af_err af_release_indexers(af_index_t* indexers) -{ - try { - delete[] indexers; - } - CATCHALL; - return AF_SUCCESS; -} diff --git a/src/api/c/version.cpp b/src/api/c/version.cpp new file mode 100644 index 0000000000..91d24cb823 --- /dev/null +++ b/src/api/c/version.cpp @@ -0,0 +1,25 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include + +af_err af_get_version(int *major, int *minor, int *patch) +{ + *major = AF_VERSION_MAJOR; + *minor = AF_VERSION_MINOR; + *patch = AF_VERSION_PATCH; + + return AF_SUCCESS; +} + +const char *af_get_revision() +{ + return AF_REVISION; +} diff --git a/src/api/cpp/array.cpp b/src/api/cpp/array.cpp index f7931cfa9f..b993e2f7e8 100644 --- a/src/api/cpp/array.cpp +++ b/src/api/cpp/array.cpp @@ -1057,11 +1057,11 @@ af::dtype implicit_dtype(af::dtype scalar_type, af::dtype array_type) void array::lock() const { - AF_THROW(af_lock_device_ptr(get())); + AF_THROW(af_lock_array(get())); } void array::unlock() const { - AF_THROW(af_unlock_device_ptr(get())); + AF_THROW(af_unlock_array(get())); } } diff --git a/src/api/cpp/device.cpp b/src/api/cpp/device.cpp index bec0a60d59..faf0b0e7dd 100644 --- a/src/api/cpp/device.cpp +++ b/src/api/cpp/device.cpp @@ -42,11 +42,32 @@ namespace af return result; } + int getDeviceId(const array &in) + { + int device = getDevice();; + AF_THROW(af_get_device_id(&device, in.get())); + return device; + } + + af::Backend getActiveBackend() + { + af::Backend result = (af::Backend)0; + AF_THROW(af_get_active_backend(&result)); + return result; + } + void info() { AF_THROW(af_info()); } + const char* infoString(const bool verbose) + { + char *str = NULL; + AF_THROW(af_info_string(&str, verbose)); + return (const char *)str; + } + void deviceprop(char* d_name, char* d_platform, char *d_toolkit, char* d_compute) { deviceInfo(d_name, d_platform, d_toolkit, d_compute); @@ -140,6 +161,23 @@ namespace af AF_THROW(af_free_pinned((void *)ptr)); } + void *allocHost(const size_t elements, const af::dtype type) + { + void *ptr; + AF_THROW(af_alloc_host(&ptr, elements * size_of(type))); + return ptr; + } + + void freeHost(const void *ptr) + { + AF_THROW(af_free_host((void *)ptr)); + 
} + + void printMemInfo(const char *msg, const int device_id) + { + AF_THROW(af_print_mem_info(msg, device_id)); + } + void deviceGC() { AF_THROW(af_device_gc()); @@ -164,16 +202,21 @@ namespace af return size_bytes; } -#define INSTANTIATE(T) \ - template<> AFAPI \ - T* alloc(const size_t elements) \ - { \ - return (T*)alloc(elements, (af::dtype)dtype_traits::af_type); \ - } \ - template<> AFAPI \ - T* pinned(const size_t elements) \ - { \ - return (T*)pinned(elements, (af::dtype)dtype_traits::af_type); \ +#define INSTANTIATE(T) \ + template<> AFAPI \ + T* alloc(const size_t elements) \ + { \ + return (T*)alloc(elements, (af::dtype)dtype_traits::af_type); \ + } \ + template<> AFAPI \ + T* pinned(const size_t elements) \ + { \ + return (T*)pinned(elements, (af::dtype)dtype_traits::af_type); \ + } \ + template<> AFAPI \ + T* allocHost(const size_t elements) \ + { \ + return (T*)allocHost(elements, (af::dtype)dtype_traits::af_type);\ } INSTANTIATE(float) diff --git a/src/api/cpp/error.hpp b/src/api/cpp/error.hpp index 157f8193ab..c888db8646 100644 --- a/src/api/cpp/error.hpp +++ b/src/api/cpp/error.hpp @@ -8,14 +8,20 @@ ********************************************************/ #include +#include #include #define AF_THROW(fn) do { \ af_err __err = fn; \ if (__err == AF_SUCCESS) break; \ - throw af::exception(__AF_FILENAME__, __LINE__, __err); \ + char *msg = NULL; af_get_last_error(&msg, NULL);\ + af::exception ex(msg, __PRETTY_FUNCTION__, \ + __AF_FILENAME__, __LINE__, __err); \ + af_free_host(msg); \ + throw ex; \ } while(0) -#define AF_THROW_ERR(__msg, __err) do { \ - throw af::exception(__msg, __AF_FILENAME__, __LINE__, __err); \ +#define AF_THROW_ERR(__msg, __err) do { \ + throw af::exception(__msg, __PRETTY_FUNCTION__, \ + __AF_FILENAME__, __LINE__, __err); \ } while(0) diff --git a/src/api/cpp/exception.cpp b/src/api/cpp/exception.cpp index 373ae29c55..f88f98b0f2 100644 --- a/src/api/cpp/exception.cpp +++ b/src/api/cpp/exception.cpp @@ -32,8 +32,8 @@ 
exception::exception(const char *msg): m_err(AF_ERR_UNKNOWN) exception::exception(const char *file, unsigned line, af_err err): m_err(err) { snprintf(m_msg, sizeof(m_msg) - 1, - "ArrayFire Exception(%d): %s\nIn %s:%u", - (int)err, af_err_to_string(err), file, line); + "ArrayFire Exception (%s:%d):\nIn %s:%u", + af_err_to_string(err), (int)err, file, line); m_msg[sizeof(m_msg)-1] = '\0'; } @@ -41,11 +41,19 @@ exception::exception(const char *file, unsigned line, af_err err): m_err(err) exception::exception(const char *msg, const char *file, unsigned line, af_err err): m_err(err) { snprintf(m_msg, sizeof(m_msg) - 1, - "ArrayFire Exception(%d): %s\nIn %s:%u", - (int)(err), msg, file, line); + "ArrayFire Exception (%s:%d):\n%s\nIn %s:%u", + af_err_to_string(err), (int)(err), msg, file, line); m_msg[sizeof(m_msg)-1] = '\0'; } +exception::exception(const char *msg, const char *func, const char *file, unsigned line, af_err err): m_err(err) +{ + snprintf(m_msg, sizeof(m_msg) - 1, + "ArrayFire Exception (%s:%d):\n%s\nIn function %s\nIn file %s:%u", + af_err_to_string(err), (int)(err), msg, func, file, line); + + m_msg[sizeof(m_msg)-1] = '\0'; +} } diff --git a/src/api/cpp/graphics.cpp b/src/api/cpp/graphics.cpp index b7480195dc..8b53825c25 100644 --- a/src/api/cpp/graphics.cpp +++ b/src/api/cpp/graphics.cpp @@ -79,6 +79,18 @@ void Window::plot(const array& X, const array& Y, const char* const title) AF_THROW(af_draw_plot(get(), X.get(), Y.get(), &temp)); } +void Window::scatter(const array& X, const array& Y, af::markerType marker, const char* const title) +{ + af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT}; + AF_THROW(af_draw_scatter(get(), X.get(), Y.get(), marker, &temp)); +} + +void Window::scatter3(const array& P, af::markerType marker, const char* const title) +{ + af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT}; + AF_THROW(af_draw_scatter3(get(), P.get(), marker, &temp)); +} + void Window::plot3(const array& P, const char* const title) { af_cell temp{_r, _c, 
title, AF_COLORMAP_DEFAULT}; @@ -93,7 +105,6 @@ void Window::hist(const array& X, const double minval, const double maxval, cons } void Window::surface(const array& S, const char* const title){ - //TODO: fix offset on forge? af::array xVals = seq(0, S.dims(0)-1); af::array yVals = seq(0, S.dims(1)-1); af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT}; @@ -125,4 +136,9 @@ bool Window::close() return temp; } +void Window::setVisibility(const bool isVisible) +{ + AF_THROW(af_set_visibility(get(), isVisible)); +} + } diff --git a/src/api/cpp/imageio.cpp b/src/api/cpp/imageio.cpp index e70b26d1d2..75ef5fe9c4 100644 --- a/src/api/cpp/imageio.cpp +++ b/src/api/cpp/imageio.cpp @@ -68,4 +68,11 @@ void saveImageNative(const char* filename, const array& in) AF_THROW(af_save_image_native(filename, in.get())); } +bool isImageIOAvailable() +{ + bool out = false; + AF_THROW(af_is_image_io_available(&out)); + return out; +} + } diff --git a/src/api/cpp/internal.cpp b/src/api/cpp/internal.cpp new file mode 100644 index 0000000000..bdce6e155c --- /dev/null +++ b/src/api/cpp/internal.cpp @@ -0,0 +1,63 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include "error.hpp" + +namespace af +{ + array createStridedArray(const void *data, const dim_t offset, + const dim4 dims, const dim4 strides, + const af::dtype ty, + const af::source location) + { + af_array res; + AF_THROW(af_create_strided_array(&res, data, offset, + dims.ndims(), dims.get(), strides.get(), + ty, location)); + return array(res); + } + + dim4 getStrides(const array &in) + { + dim_t s0, s1, s2, s3; + AF_THROW(af_get_strides(&s0, &s1, &s2, &s3, in.get())); + return dim4(s0, s1, s2, s3); + } + + dim_t getOffset(const array &in) + { + dim_t offset; + AF_THROW(af_get_offset(&offset, in.get())); + return offset; + } + + void *getRawPtr(const array &in) + { + void *ptr = NULL; + AF_THROW(af_get_raw_ptr(&ptr, in.get())); + return ptr; + } + + bool isLinear(const array &in) + { + bool is_linear = false; + AF_THROW(af_is_linear(&is_linear, in.get())); + return is_linear; + } + + bool isOwner(const array &in) + { + bool is_owner = false; + AF_THROW(af_is_owner(&is_owner, in.get())); + return is_owner; + } + +} diff --git a/src/api/cpp/lapack.cpp b/src/api/cpp/lapack.cpp index cf9b3ecfd2..091c807612 100644 --- a/src/api/cpp/lapack.cpp +++ b/src/api/cpp/lapack.cpp @@ -153,4 +153,11 @@ namespace af AF_THROW(af_norm(&out, in.get(), type, p, q)); return out; } + + bool isLAPACKAvailable() + { + bool out = false; + AF_THROW(af_is_lapack_available(&out)); + return out; + } } diff --git a/src/api/cpp/transform_coordinates.cpp b/src/api/cpp/transform_coordinates.cpp new file mode 100644 index 0000000000..4d896e7194 --- /dev/null +++ b/src/api/cpp/transform_coordinates.cpp @@ -0,0 +1,24 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include "error.hpp" + +namespace af +{ + +array transformCoordinates(const array& tf, const float d0, const float d1) +{ + af_array out = 0; + AF_THROW(af_transform_coordinates(&out, tf.get(), d0, d1)); + return array(out); +} + +} diff --git a/src/api/cpp/util.cpp b/src/api/cpp/util.cpp index a99b8567e0..895d347d92 100644 --- a/src/api/cpp/util.cpp +++ b/src/api/cpp/util.cpp @@ -62,4 +62,10 @@ namespace af return; } + const char* toString(const char *exp, const array &arr, const int precision, const bool transpose) + { + char *output = NULL; + AF_THROW(af_array_to_string(&output, exp, arr.get(), precision, transpose)); + return output; + } } diff --git a/src/api/unified/CMakeLists.txt b/src/api/unified/CMakeLists.txt index 917c6dce42..c44e43b5fc 100644 --- a/src/api/unified/CMakeLists.txt +++ b/src/api/unified/CMakeLists.txt @@ -15,11 +15,12 @@ FILE(GLOB cpp_sources SOURCE_GROUP(api\\cpp\\Sources FILES ${cpp_sources}) FILE(GLOB common_sources - "../c/util.cpp" - "../c/err_common.cpp" - "../c/type_util.cpp" - "../../backend/dim4.cpp" - ) + "../c/version.cpp" + "../c/err_common.cpp" + "../c/type_util.cpp" + "../../backend/dim4.cpp" + "../../backend/util.cpp" + ) SOURCE_GROUP(common FILES ${common_sources}) @@ -30,10 +31,6 @@ ENDIF() # OS Definitions IF(UNIX) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -pthread -Wno-comment") -ELSE(${UNIX}) #Windows - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") - SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /bigobj") ENDIF() ADD_LIBRARY(af SHARED diff --git a/src/api/unified/array.cpp b/src/api/unified/array.cpp index 59158ca195..809c9d4e6b 100644 --- a/src/api/unified/array.cpp +++ b/src/api/unified/array.cpp @@ -8,6 +8,7 @@ 
********************************************************/ #include +#include #include "symbol_manager.hpp" af_err af_create_array(af_array *arr, const void * const data, const unsigned ndims, const dim_t * const dims, const af_dtype type) @@ -40,8 +41,16 @@ af_err af_get_data_ptr(void *data, const af_array arr) af_err af_release_array(af_array arr) { - CHECK_ARRAYS(arr); - return CALL(arr); + af_backend curr = unified::AFSymbolManager::getInstance().getActiveBackend(); + af_backend other = curr; + + af_err err = af_get_backend_id(&other, arr); + if (err != AF_SUCCESS) return err; + + unified::AFSymbolManager::getInstance().setBackend(other); + err = CALL(arr); + unified::AFSymbolManager::getInstance().setBackend(curr); + return err; } af_err af_retain_array(af_array *out, const af_array in) diff --git a/src/api/unified/device.cpp b/src/api/unified/device.cpp index 43559a077a..ed8e6a37f6 100644 --- a/src/api/unified/device.cpp +++ b/src/api/unified/device.cpp @@ -35,6 +35,18 @@ af_err af_get_backend_id(af_backend *result, const af_array in) return CALL(result, in); } +af_err af_get_device_id(int *device, const af_array in) +{ + CHECK_ARRAYS(in); + return CALL(device, in); +} + +af_err af_get_active_backend(af_backend *result) +{ + *result = unified::AFSymbolManager::getInstance().getActiveBackend(); + return AF_SUCCESS; +} + af_err af_info() { return CALL_NO_PARAMS(); @@ -45,6 +57,11 @@ af_err af_init() return CALL_NO_PARAMS(); } +af_err af_info_string(char **str, const bool verbose) +{ + return CALL(str, verbose); +} + af_err af_device_info(char* d_name, char* d_platform, char *d_toolkit, char* d_compute) { return CALL(d_name, d_platform, d_toolkit, d_compute); @@ -95,6 +112,16 @@ af_err af_free_pinned(void *ptr) return CALL(ptr); } +af_err af_alloc_host(void **ptr, const dim_t bytes) +{ + return CALL(ptr, bytes); +} + +af_err af_free_host(void *ptr) +{ + return CALL(ptr); +} + af_err af_device_array(af_array *arr, const void *data, const unsigned ndims, const 
dim_t * const dims, const af_dtype type) { return CALL(arr, data, ndims, dims, type); @@ -106,6 +133,11 @@ af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers, return CALL(alloc_bytes, alloc_buffers, lock_bytes, lock_buffers); } +af_err af_print_mem_info(const char *msg, const int device_id) +{ + return CALL(msg, device_id); +} + af_err af_device_gc() { return CALL_NO_PARAMS(); @@ -133,6 +165,18 @@ af_err af_unlock_device_ptr(const af_array arr) return CALL(arr); } +af_err af_lock_array(const af_array arr) +{ + CHECK_ARRAYS(arr); + return CALL(arr); +} + +af_err af_unlock_array(const af_array arr) +{ + CHECK_ARRAYS(arr); + return CALL(arr); +} + af_err af_get_device_ptr(void **ptr, const af_array arr) { CHECK_ARRAYS(arr); diff --git a/src/api/unified/error.cpp b/src/api/unified/error.cpp new file mode 100644 index 0000000000..0224876ec3 --- /dev/null +++ b/src/api/unified/error.cpp @@ -0,0 +1,51 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include "symbol_manager.hpp" + +void af_get_last_error(char **str, dim_t *len) +{ + // Set error message from unified backend + std::string &global_error_string = get_global_error_string(); + dim_t slen = std::min(MAX_ERR_SIZE, (int)global_error_string.size()); + + // If this is true, the error is coming from the unified backend. + if (slen != 0) { + + if (len && slen == 0) { + *len = 0; + *str = NULL; + return; + } + + af_alloc_host((void**)str, sizeof(char) * (slen + 1)); + global_error_string.copy(*str, slen); + + (*str)[slen] = '\0'; + global_error_string = std::string(""); + + if (len) *len = slen; + } else { + // If false, the error is coming from active backend. 
+ typedef void(*af_func)(char **, dim_t *); + af_func func = (af_func)LOAD_SYMBOL(); + func(str, len); + } +} + +const char *af_err_to_string(const af_err err) +{ + typedef char *(*af_func)(af_err); + af_func func = (af_func)LOAD_SYMBOL(); + return func(err); +} diff --git a/src/api/unified/graphics.cpp b/src/api/unified/graphics.cpp index 81076f233c..9e3f1c8b38 100644 --- a/src/api/unified/graphics.cpp +++ b/src/api/unified/graphics.cpp @@ -44,6 +44,18 @@ af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, co return CALL(wind, X, Y, props); } +af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_marker_type marker, const af_cell* const props) +{ + CHECK_ARRAYS(X, Y); + return CALL(wind, X, Y, marker, props); +} + +af_err af_draw_scatter3(const af_window wind, const af_array P, const af_marker_type marker, const af_cell* const props) +{ + CHECK_ARRAYS(P); + return CALL(wind, P, marker, props); +} + af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* const props) { CHECK_ARRAYS(P); @@ -77,6 +89,11 @@ af_err af_is_window_closed(bool *out, const af_window wind) return CALL(out, wind); } +af_err af_set_visibility(const af_window wind, const bool is_visible) +{ + return CALL(wind, is_visible); +} + af_err af_destroy_window(const af_window wind) { return CALL(wind); diff --git a/src/api/unified/image.cpp b/src/api/unified/image.cpp index d0f9aa6200..0ee211d585 100644 --- a/src/api/unified/image.cpp +++ b/src/api/unified/image.cpp @@ -55,6 +55,11 @@ af_err af_save_image_native(const char* filename, const af_array in) return CALL(filename, in); } +af_err af_is_image_io_available(bool *out) +{ + return CALL(out); +} + af_err af_resize(af_array *out, const af_array in, const dim_t odim0, const dim_t odim1, const af_interp_type method) { CHECK_ARRAYS(in); @@ -69,6 +74,13 @@ af_err af_transform(af_array *out, const af_array in, const af_array transform, return CALL(out, in, transform, odim0, 
odim1, method, inverse); } +af_err af_transform_coordinates(af_array *out, const af_array tf, + const float d0, const float d1) +{ + CHECK_ARRAYS(tf); + return CALL(out, tf, d0, d1); +} + af_err af_rotate(af_array *out, const af_array in, const float theta, const bool crop, const af_interp_type method) { diff --git a/src/api/unified/index.cpp b/src/api/unified/index.cpp index 0927dd8b71..4df5926d62 100644 --- a/src/api/unified/index.cpp +++ b/src/api/unified/index.cpp @@ -52,3 +52,37 @@ af_err af_assign_gen( af_array *out, CHECK_ARRAYS(lhs, rhs); return CALL(out, lhs, ndims, indices, rhs); } + +af_seq af_make_seq(double begin, double end, double step) +{ + af_seq seq = {begin, end, step}; + return seq; +} + +af_err af_create_indexers(af_index_t** indexers) +{ + return CALL(indexers); +} + +af_err af_set_array_indexer(af_index_t* indexer, const af_array idx, const dim_t dim) +{ + CHECK_ARRAYS(idx); + return CALL(indexer, idx, dim); +} + +af_err af_set_seq_indexer(af_index_t* indexer, const af_seq* idx, const dim_t dim, const bool is_batch) +{ + return CALL(indexer, idx, dim, is_batch); +} + +af_err af_set_seq_param_indexer(af_index_t* indexer, + const double begin, const double end, const double step, + const dim_t dim, const bool is_batch) +{ + return CALL(indexer, begin, end, step, dim, is_batch); +} + +af_err af_release_indexers(af_index_t* indexers) +{ + return CALL(indexers); +} diff --git a/src/api/unified/internal.cpp b/src/api/unified/internal.cpp new file mode 100644 index 0000000000..b9ac0ac277 --- /dev/null +++ b/src/api/unified/internal.cpp @@ -0,0 +1,54 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include "symbol_manager.hpp" + + +af_err af_create_strided_array(af_array *arr, + const void *data, + const dim_t offset, + const unsigned ndims, + const dim_t *const dims_, + const dim_t *const strides_, + const af_dtype ty, + const af_source location) +{ + return CALL(arr, data, offset, ndims, dims_, strides_, ty, location); +} + +af_err af_get_strides(dim_t *s0, dim_t *s1, dim_t *s2, dim_t *s3, const af_array in) +{ + CHECK_ARRAYS(in); + return CALL(s0, s1, s2, s3, in); +} + +af_err af_get_offset(dim_t *offset, const af_array arr) +{ + CHECK_ARRAYS(arr); + return CALL(offset, arr); +} + +af_err af_get_raw_ptr(void **ptr, const af_array arr) +{ + CHECK_ARRAYS(arr); + return CALL(ptr, arr); +} + +af_err af_is_linear(bool *result, const af_array arr) +{ + CHECK_ARRAYS(arr); + return CALL(result, arr); +} + +af_err af_is_owner(bool *result, const af_array arr) +{ + CHECK_ARRAYS(arr); + return CALL(result, arr); +} diff --git a/src/api/unified/lapack.cpp b/src/api/unified/lapack.cpp index b2364ac858..8a367017cf 100644 --- a/src/api/unified/lapack.cpp +++ b/src/api/unified/lapack.cpp @@ -96,3 +96,8 @@ af_err af_norm(double *out, const af_array in, const af_norm_type type, const do CHECK_ARRAYS(in); return CALL(out, in, type, p, q); } + +af_err af_is_lapack_available(bool *out) +{ + return CALL(out); +} diff --git a/src/api/unified/symbol_manager.cpp b/src/api/unified/symbol_manager.cpp index f721e30c5f..96cec0b6ac 100644 --- a/src/api/unified/symbol_manager.cpp +++ b/src/api/unified/symbol_manager.cpp @@ -43,25 +43,6 @@ inline string getBkndLibName(const int backend_index) return LIB_AF_BKND_PREFIX + LIB_AF_BKND_NAME[i] + LIB_AF_BKND_SUFFIX; } -inline std::string getEnvVar(const std::string &key) -{ -#if defined(OS_WIN) - DWORD bufSize = 32767; // limit according to GetEnvironment Variable 
documentation - string retVal; - retVal.resize(bufSize); - bufSize = GetEnvironmentVariable(key.c_str(), &retVal[0], bufSize); - if (!bufSize) { - return string(""); - } else { - retVal.resize(bufSize); - return retVal; - } -#else - char * str = getenv(key.c_str()); - return str==NULL ? string("") : string(str); -#endif -} - /*flag parameter is not used on windows platform */ LibHandle openDynLibrary(const int bknd_idx, int flag=RTLD_LAZY) { @@ -222,8 +203,9 @@ af_err AFSymbolManager::setBackend(af::Backend bknd) activeHandle = defaultHandle; activeBackend = defaultBackend; return AF_SUCCESS; - } else - return AF_ERR_LOAD_LIB; + } else { + UNIFIED_ERROR_LOAD_LIB(); + } } int idx = bknd >> 1; // Convert 1, 2, 4 -> 0, 1, 2 if(bkndHandles[idx]) { @@ -231,7 +213,7 @@ af_err AFSymbolManager::setBackend(af::Backend bknd) activeBackend = bknd; return AF_SUCCESS; } else { - return AF_ERR_LOAD_LIB; + UNIFIED_ERROR_LOAD_LIB(); } } diff --git a/src/api/unified/symbol_manager.hpp b/src/api/unified/symbol_manager.hpp index f26e708728..658ac74b64 100644 --- a/src/api/unified/symbol_manager.hpp +++ b/src/api/unified/symbol_manager.hpp @@ -11,6 +11,9 @@ #include #include #include +#include +#include + #if defined(OS_WIN) #include typedef HMODULE LibHandle; @@ -25,6 +28,13 @@ namespace unified const int NUM_BACKENDS = 3; const int NUM_ENV_VARS = 2; +#define UNIFIED_ERROR_LOAD_LIB() \ + AF_RETURN_ERROR("Failed to load dynamic library. " \ + "See http://www.arrayfire.com/docs/unifiedbackend.htm " \ + "for instructions to set up environment for Unified backend.", \ + AF_ERR_LOAD_LIB) + + class AFSymbolManager { public: static AFSymbolManager& getInstance(); @@ -41,8 +51,9 @@ class AFSymbolManager { template af_err call(const char* symbolName, CalleeArgs... 
args) { - if (!activeHandle) - return AF_ERR_LOAD_LIB; + if (!activeHandle) { + UNIFIED_ERROR_LOAD_LIB(); + } typedef af_err(*af_func)(CalleeArgs...); af_func funcHandle; #if defined(OS_WIN) @@ -51,12 +62,17 @@ class AFSymbolManager { funcHandle = (af_func)dlsym(activeHandle, symbolName); #endif if (!funcHandle) { - return AF_ERR_LOAD_SYM; + std::string str = "Failed to load symbol: "; + str += symbolName; + AF_RETURN_ERROR(str.c_str(), + AF_ERR_LOAD_SYM); } return funcHandle(args...); } + LibHandle getHandle() { return activeHandle; } + protected: AFSymbolManager(); @@ -93,11 +109,12 @@ bool checkArrays(af_backend activeBackend, T a, Args... arg) // Macro to check af_array as inputs. The arguments to this macro should be // only input af_arrays. Not outputs or other types. -#define CHECK_ARRAYS(...) do { \ - af_backend backendId = unified::AFSymbolManager::getInstance().getActiveBackend(); \ - if(!unified::checkArrays(backendId, __VA_ARGS__)) \ - return AF_ERR_ARR_BKND_MISMATCH; \ -} while(0) +#define CHECK_ARRAYS(...) do { \ + af_backend backendId = unified::AFSymbolManager::getInstance().getActiveBackend(); \ + if(!unified::checkArrays(backendId, __VA_ARGS__)) \ + AF_RETURN_ERROR("Input array does not belong to current backend", \ + AF_ERR_ARR_BKND_MISMATCH); \ + } while(0) #if defined(OS_WIN) #define CALL(...) unified::AFSymbolManager::getInstance().call(__FUNCTION__, __VA_ARGS__) @@ -106,3 +123,9 @@ bool checkArrays(af_backend activeBackend, T a, Args... arg) #define CALL(...) 
unified::AFSymbolManager::getInstance().call(__func__, __VA_ARGS__) #define CALL_NO_PARAMS() unified::AFSymbolManager::getInstance().call(__func__) #endif + +#if defined(OS_WIN) +#define LOAD_SYMBOL() GetProcAddress(unified::AFSymbolManager::getInstance().getHandle(), __FUNCTION__) +#else +#define LOAD_SYMBOL() dlsym(unified::AFSymbolManager::getInstance().getHandle(), __func__) +#endif diff --git a/src/api/unified/util.cpp b/src/api/unified/util.cpp index 155c4f81b9..178ac87ad8 100644 --- a/src/api/unified/util.cpp +++ b/src/api/unified/util.cpp @@ -56,8 +56,3 @@ af_err af_example_function(af_array* out, const af_array in, const af_someenum_t CHECK_ARRAYS(in); return CALL(out, in, param); } - -af_err af_get_version(int *major, int *minor, int *patch) -{ - return CALL(major, minor, patch); -} diff --git a/src/backend/ArrayInfo.cpp b/src/backend/ArrayInfo.cpp index 219bc1991c..0937641afc 100644 --- a/src/backend/ArrayInfo.cpp +++ b/src/backend/ArrayInfo.cpp @@ -18,35 +18,6 @@ using af::dim4; -dim_t -calcOffset(const af::dim4 &strides, const af::dim4 &offsets) -{ - dim_t offset = 0; - for (int i = 0; i < 4; i++) offset += offsets[i] * strides[i]; - return offset; -} - - -const ArrayInfo& -getInfo(af_array arr) -{ - const ArrayInfo *info = static_cast(reinterpret_cast(arr)); - return *info; -} - -af_err -af_get_elements(dim_t *elems, const af_array arr) -{ - *elems = getInfo(arr).elements(); - return AF_SUCCESS; //FIXME: Catch exceptions correctly -} - -af_err af_get_type(af_dtype *type, const af_array arr) -{ - *type = getInfo(arr).getType(); - return AF_SUCCESS; //FIXME: Catch exceptions correctly -} - dim4 calcStrides(const dim4 &parentDim) { dim4 out(1, 1, 1, 1); @@ -64,33 +35,33 @@ int ArrayInfo::getDevId() const { // The actual device ID is only stored in the first 4 bits of devId // See ArrayInfo.hpp for more - return devId & 0xf; + return devId & 0xff; } void ArrayInfo::setId(int id) const { - // 1 << (backendId + 3) sets the 4th, 5th or 6th bit of devId to 1 
+ // 1 << (backendId + 8) sets the 9th, 10th or 11th bit of devId to 1 // for CPU, CUDA and OpenCL respectively // See ArrayInfo.hpp for more int backendId = detail::getBackend() >> 1; // Convert enums 1, 2, 4 to ints 0, 1, 2 - const_cast(this)->setId(id | 1 << (backendId + 3)); + const_cast(this)->setId(id | 1 << (backendId + 8)); } void ArrayInfo::setId(int id) { - // 1 << (backendId + 3) sets the 4th, 5th or 6th bit of devId to 1 + // 1 << (backendId + 8) sets the 9th, 10th or 11th bit of devId to 1 // for CPU, CUDA and OpenCL respectively // See ArrayInfo.hpp for more int backendId = detail::getBackend() >> 1; // Convert enums 1, 2, 4 to ints 0, 1, 2 - devId = id | 1 << (backendId + 3); + devId = id | 1 << (backendId + 8); } af_backend ArrayInfo::getBackendId() const { - // devId >> 3 converts the backend info to 1, 2, 4 which are enums + // devId >> 8 converts the backend info to 1, 2, 4 which are enums // for CPU, CUDA and OpenCL respectively // See ArrayInfo.hpp for more - int backendId = devId >> 3; + int backendId = devId >> 8; return (af_backend)backendId; } diff --git a/src/backend/ArrayInfo.hpp b/src/backend/ArrayInfo.hpp index ca6fcd394c..88ba26b6aa 100644 --- a/src/backend/ArrayInfo.hpp +++ b/src/backend/ArrayInfo.hpp @@ -16,9 +16,6 @@ #include #include -dim_t -calcOffset(const af::dim4 &strides, const af::dim4 &offsets); - af::dim4 calcStrides(const af::dim4 &parentDim); @@ -48,14 +45,15 @@ class ArrayInfo int devId; af_dtype type; af::dim4 dim_size; - af::dim4 dim_offsets, dim_strides; + dim_t offset; + af::dim4 dim_strides; public: - ArrayInfo(int id, af::dim4 size, af::dim4 offset, af::dim4 stride, af_dtype af_type): + ArrayInfo(int id, af::dim4 size, dim_t offset_, af::dim4 stride, af_dtype af_type): devId(id), type(af_type), dim_size(size), - dim_offsets(offset), + offset(offset_), dim_strides(stride) { af_init(); @@ -77,13 +75,14 @@ class ArrayInfo const af_dtype& getType() const { return type; } - const af::dim4& offsets() const { return 
dim_offsets; } + dim_t getOffset() const { return offset; } const af::dim4& strides() const { return dim_strides; } size_t elements() const { return dim_size.elements(); } size_t ndims() const { return dim_size.ndims(); } const af::dim4& dims() const { return dim_size; } + size_t total() const { return offset + dim_strides[3] * dim_size[3]; } int getDevId() const; @@ -97,7 +96,7 @@ class ArrayInfo { dim_size = dims; dim_strides = calcStrides(dims); - dim_offsets = af::dim4(0,0,0,0); + offset = 0; } void resetDims(const af::dim4& dims) @@ -141,12 +140,6 @@ class ArrayInfo static_assert(std::is_standard_layout::value, "ArrayInfo must be a standard layout type"); #endif -// Returns size and time info for an array object. -// Note this doesn't require template parameters. -const ArrayInfo& -getInfo(const af_array arr); - - af::dim4 toDims(const std::vector& seqs, const af::dim4 &parentDims); af::dim4 toOffset(const std::vector& seqs, const af::dim4 &parentDims); diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp new file mode 100644 index 0000000000..83f2de1d8d --- /dev/null +++ b/src/backend/MemoryManager.cpp @@ -0,0 +1,319 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include "MemoryManager.hpp" +#include "dispatch.hpp" +#include "err_common.hpp" +#include "util.hpp" + +namespace common +{ + +MemoryManager::MemoryManager(int num_devices, unsigned MAX_BUFFERS, bool debug): + mem_step_size(1024), + max_buffers(MAX_BUFFERS), + memory(num_devices), + debug_mode(debug) +{ + lock_guard_t lock(this->memory_mutex); + + for (int n = 0; n < num_devices; n++) { + // Calling getMaxMemorySize() here calls the virtual function that returns 0 + // Call it from outside the constructor. + memory[n].max_bytes = ONE_GB; + memory[n].total_bytes = 0; + memory[n].total_buffers = 0; + memory[n].lock_bytes = 0; + memory[n].lock_buffers = 0; + } + + // Check for environment variables + + std::string env_var; + + // Debug mode + env_var = getEnvVar("AF_MEM_DEBUG"); + if (!env_var.empty()) { + this->debug_mode = env_var[0] != '0'; + } + if (this->debug_mode) mem_step_size = 1; + + // Max Buffer count + env_var = getEnvVar("AF_MAX_BUFFERS"); + if (!env_var.empty()) { + this->max_buffers = std::max(1, std::stoi(env_var)); + } +} + +void MemoryManager::setMaxMemorySize() +{ + for (unsigned n = 0; n < memory.size(); n++) { + // Calls garbage collection when: + // total_bytes > memsize * 0.75 when memsize < 4GB + // total_bytes > memsize - 1 GB when memsize >= 4GB + // If memsize returned 0, then use 1GB + size_t memsize = this->getMaxMemorySize(n); + memory[n].max_bytes = memsize == 0 ? 
ONE_GB : std::max(memsize * 0.75, (double)memsize - (double)ONE_GB); + } +} + +void MemoryManager::garbageCollect() +{ + if (this->debug_mode) return; + + lock_guard_t lock(this->memory_mutex); + memory_info& current = this->getCurrentMemoryInfo(); + + // Return if all buffers are locked + if (current.total_buffers == current.lock_buffers) return; + + for (auto &kv : current.free_map) { + size_t num_ptrs = kv.second.size(); + //Free memory by popping the last element + for (int n = num_ptrs-1; n >= 0; n--) { + this->nativeFree(kv.second[n]); + current.total_bytes -= kv.first; + current.total_buffers--; + kv.second.pop_back(); + } + } + current.free_map.clear(); +} + +void MemoryManager::unlock(void *ptr, bool user_unlock) +{ + // Shortcut for empty arrays + if (!ptr) return; + + lock_guard_t lock(this->memory_mutex); + memory_info& current = this->getCurrentMemoryInfo(); + + locked_iter iter = current.locked_map.find((void *)ptr); + + // Pointer not found in locked map + if (iter == current.locked_map.end()) { + // Probably came from user, just free it + this->nativeFree(ptr); + return; + } + + if (user_unlock) { + (iter->second).user_lock = false; + } else { + (iter->second).manager_lock = false; + } + + // Return early if either one is locked + if ((iter->second).user_lock || (iter->second).manager_lock) return; + + size_t bytes = iter->second.bytes; + current.lock_bytes -= iter->second.bytes; + current.lock_buffers--; + // erase() invalidates iter: only the saved ptr and bytes may be used below + current.locked_map.erase(iter); + + if (this->debug_mode) { + // Just free memory in debug mode + if (bytes > 0) { + this->nativeFree(ptr); + current.total_buffers--; + current.total_bytes -= bytes; + } + } else { + // In regular mode, move buffer to free map + free_iter fiter = current.free_map.find(bytes); + if (fiter != current.free_map.end()) { + // If found, push back + fiter->second.push_back(ptr); + } else { + // If not found, create new vector for this size + std::vector ptrs; + ptrs.push_back(ptr); + 
current.free_map[bytes] = ptrs; + } + } +} + +void *MemoryManager::alloc(const size_t bytes, bool user_lock) +{ + lock_guard_t lock(this->memory_mutex); + + void *ptr = NULL; + size_t alloc_bytes = this->debug_mode ? bytes : (divup(bytes, mem_step_size) * mem_step_size); + + if (bytes > 0) { + memory_info& current = this->getCurrentMemoryInfo(); + + // There is no memory cache in debug mode + if (!this->debug_mode) { + + // FIXME: Add better checks for garbage collection + // Perhaps look at total memory available as a metric + if (this->checkMemoryLimit()) { + this->garbageCollect(); + } + + free_iter iter = current.free_map.find(alloc_bytes); + + if (iter != current.free_map.end() && !iter->second.empty()) { + ptr = iter->second.back(); + iter->second.pop_back(); + } + + } + + // Only comes here if buffer size not found or in debug mode + if (ptr == NULL) { + // Perform garbage collection if memory can not be allocated + try { + ptr = this->nativeAlloc(alloc_bytes); + } catch (AfError &ex) { + // If out of memory, run garbage collect and try again + if (ex.getError() != AF_ERR_NO_MEM) throw; + this->garbageCollect(); + ptr = this->nativeAlloc(alloc_bytes); + } + // Increment these two only when it succeeds to come here. 
+ current.total_bytes += alloc_bytes; + current.total_buffers += 1; + } + + + locked_info info = {true, user_lock, alloc_bytes}; + current.locked_map[ptr] = info; + current.lock_bytes += alloc_bytes; + current.lock_buffers++; + } + return ptr; +} + +void MemoryManager::userLock(const void *ptr) +{ + memory_info& current = this->getCurrentMemoryInfo(); + + lock_guard_t lock(this->memory_mutex); + + locked_iter iter = current.locked_map.find(const_cast(ptr)); + + if (iter != current.locked_map.end()) { + iter->second.user_lock = true; + } else { + locked_info info = {false, + true, + 100}; //This number is not relevant + + current.locked_map[(void *)ptr] = info; + } +} + +void MemoryManager::userUnlock(const void *ptr) +{ + this->unlock(const_cast(ptr), true); +} + +size_t MemoryManager::getMemStepSize() +{ + lock_guard_t lock(this->memory_mutex); + return this->mem_step_size; +} + +void MemoryManager::setMemStepSize(size_t new_step_size) +{ + lock_guard_t lock(this->memory_mutex); + this->mem_step_size = new_step_size; +} + +size_t MemoryManager::getMaxBytes() +{ + lock_guard_t lock(this->memory_mutex); + return this->getCurrentMemoryInfo().max_bytes; +} + +void MemoryManager::printInfo(const char *msg, const int device) +{ + lock_guard_t lock(this->memory_mutex); + memory_info& current = this->getCurrentMemoryInfo(); + + std::cout << msg << std::endl; + + static const std::string head("| POINTER | SIZE | AF LOCK | USER LOCK |"); + static const std::string line(head.size(), '-'); + std::cout << line << std::endl << head << std::endl << line << std::endl; + + for(auto& kv : current.locked_map) { + std::string status_mngr("Yes"); + std::string status_user("Unknown"); + if(kv.second.user_lock) status_user = "Yes"; + else status_user = " No"; + + std::string unit = "KB"; + double size = (double)(kv.second.bytes) / 1024; + if(size >= 1024) { + size = size / 1024; + unit = "MB"; + } + + std::cout << "| " << std::right << std::setw(14) << kv.first << " " + << " | " << 
std::setw(7) << std::setprecision(4) << size << " " << unit + << " | " << std::setw(9) << status_mngr + << " | " << std::setw(9) << status_user + << " |" << std::endl; + } + + for(auto &kv : current.free_map) { + + std::string status_mngr("No"); + std::string status_user("No"); + + std::string unit = "KB"; + double size = (double)(kv.first) / 1024; + if(size >= 1024) { + size = size / 1024; + unit = "MB"; + } + + for (auto &ptr : kv.second) { + std::cout << "| " << std::right << std::setw(14) << ptr << " " + << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit + << " | " << std::setw(9) << status_mngr + << " | " << std::setw(9) << status_user + << " |" << std::endl; + } + } + + std::cout << line << std::endl; +} + +void MemoryManager::bufferInfo(size_t *alloc_bytes, size_t *alloc_buffers, + size_t *lock_bytes, size_t *lock_buffers) +{ + lock_guard_t lock(this->memory_mutex); + const memory_info& current = this->getCurrentMemoryInfo(); // bind by reference: a by-value copy would duplicate both pointer maps on every query + if (alloc_bytes ) *alloc_bytes = current.total_bytes; + if (alloc_buffers ) *alloc_buffers = current.total_buffers; + if (lock_bytes ) *lock_bytes = current.lock_bytes; + if (lock_buffers ) *lock_buffers = current.lock_buffers; +} + +unsigned MemoryManager::getMaxBuffers() +{ + return this->max_buffers; +} + +bool MemoryManager::checkMemoryLimit() +{ + memory_info& current = this->getCurrentMemoryInfo(); + return current.lock_bytes >= current.max_bytes || current.total_buffers >= this->max_buffers; +} + +} diff --git a/src/backend/MemoryManager.hpp b/src/backend/MemoryManager.hpp new file mode 100644 index 0000000000..0db70b572d --- /dev/null +++ b/src/backend/MemoryManager.hpp @@ -0,0 +1,121 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once + +#include +#include +#include + +namespace common +{ + +typedef std::recursive_mutex mutex_t; +typedef std::lock_guard lock_guard_t; + +const unsigned MAX_BUFFERS = 1000; +const size_t ONE_GB = 1 << 30; + +class MemoryManager +{ + typedef struct + { + bool manager_lock; + bool user_lock; + size_t bytes; + } locked_info; + + typedef std::map locked_t; + typedef locked_t::iterator locked_iter; + + typedef std::map >free_t; + typedef free_t::iterator free_iter; + + typedef struct + { + locked_t locked_map; + free_t free_map; + + size_t lock_bytes; + size_t lock_buffers; + size_t total_bytes; + size_t total_buffers; + size_t max_bytes; + } memory_info; + + size_t mem_step_size; + unsigned max_buffers; + std::vector memory; + bool debug_mode; + + memory_info& getCurrentMemoryInfo() + { + return memory[this->getActiveDeviceId()]; + } + + virtual int getActiveDeviceId() + { + return 0; + } + + virtual size_t getMaxMemorySize(int id) + { + return 0; + } + +public: + MemoryManager(int num_devices, unsigned MAX_BUFFERS, bool debug); + + void setMaxMemorySize(); + + void *alloc(const size_t bytes, bool user_lock); + + void unlock(void *ptr, bool user_unlock); + + void garbageCollect(); + + void printInfo(const char *msg, const int device); + + void bufferInfo(size_t *alloc_bytes, size_t *alloc_buffers, + size_t *lock_bytes, size_t *lock_buffers); + + void userLock(const void *ptr); + + void userUnlock(const void *ptr); + + size_t getMemStepSize(); + + size_t getMaxBytes(); + + unsigned getMaxBuffers(); + + void setMemStepSize(size_t new_step_size); + + virtual void *nativeAlloc(const size_t bytes) + { + return malloc(bytes); + } + + virtual void nativeFree(void *ptr) + { + free((void *)ptr); + } + + virtual ~MemoryManager() + { + } + + bool checkMemoryLimit(); + +protected: + mutex_t memory_mutex; + 
+}; + +} diff --git a/src/backend/cblas.cpp b/src/backend/cblas.cpp index 4d99d457c2..1be15e47c9 100644 --- a/src/backend/cblas.cpp +++ b/src/backend/cblas.cpp @@ -12,11 +12,11 @@ #ifdef AF_CPU #include #else - #ifdef __APPLE__ - #include + #ifdef USE_MKL + #include #else - #ifdef USE_MKL - #include + #ifdef __APPLE__ + #include #else extern "C" { #include diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 8cf1b752f8..2c296d02d3 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -8,262 +8,241 @@ ********************************************************/ #include +#include #include #include +#include #include #include #include #include +#include #include #include +#include namespace cpu { - const int MAX_TNJ_LEN = 20; - using TNJ::BufferNode; - using TNJ::Node; - using TNJ::Node_ptr; - - using af::dim4; - - template - Array::Array(dim4 dims): - info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), - data(memAlloc(dims.elements()), memFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) - { } - - template - Array::Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device): - info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), - data((is_device & !copy_device) ? 
(T*)in_data : memAlloc(dims.elements()), memFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) - { - static_assert(std::is_standard_layout>::value, "Array must be a standard layout type"); - static_assert(offsetof(Array, info) == 0, "Array::info must be the first member variable of Array"); - if (!is_device || copy_device) { - std::copy(in_data, in_data + dims.elements(), data.get()); - } - } - - template - Array::Array(af::dim4 dims, TNJ::Node_ptr n) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), - data(), data_dims(dims), - node(n), offset(0), ready(false), owner(true) - { +const int MAX_TNJ_LEN = 20; +using TNJ::BufferNode; +using TNJ::Node; +using TNJ::Node_ptr; + +using af::dim4; + +template +Array::Array(dim4 dims): + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), + data(memAlloc(dims.elements()), memFree), data_dims(dims), + node(), ready(true), owner(true) +{ } + +template +Array::Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device): + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), + data((is_device & !copy_device) ? 
(T*)in_data : memAlloc(dims.elements()), memFree), data_dims(dims), + node(), ready(true), owner(true) +{ + static_assert(std::is_standard_layout>::value, "Array must be a standard layout type"); + static_assert(offsetof(Array, info) == 0, "Array::info must be the first member variable of Array"); + if (!is_device || copy_device) { + std::copy(in_data, in_data + dims.elements(), data.get()); } +} - template - Array::Array(const Array& parent, const dim4 &dims, const dim4 &offsets, const dim4 &strides) : - info(parent.getDevId(), dims, offsets, strides, (af_dtype)dtype_traits::af_type), - data(parent.getData()), data_dims(parent.getDataDims()), - node(), - offset(parent.getOffset() + calcOffset(parent.strides(), offsets)), - ready(true), owner(false) - { } - - template - std::shared_ptr evalNodes(const int &num, - const dim4 &odims, - const dim4 &ostrs, - TNJ::Node_ptr &node) - { - - std::shared_ptr data(memAlloc(num), memFree); - T *ptr = data.get(); - - bool is_linear = node->isLinear(odims.get()); - - if (is_linear) { - for (int i = 0; i < num; i++) { - ptr[i] = *(T *)node->calc(i); - } - } else { - for (int w = 0; w < (int)odims[3]; w++) { - dim_t offw = w * ostrs[3]; - - for (int z = 0; z < (int)odims[2]; z++) { - dim_t offz = z * ostrs[2] + offw; - - for (int y = 0; y < (int)odims[1]; y++) { - dim_t offy = y * ostrs[1] + offz; - - for (int x = 0; x < (int)odims[0]; x++) { - dim_t id = x + offy; - - ptr[id] = *(T *)node->calc(x, y, z, w); - } - } - } - } - } - - return data; - } +template +Array::Array(af::dim4 dims, TNJ::Node_ptr n) : + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), + data(), data_dims(dims), + node(n), ready(false), owner(true) +{ +} - template - void Array::eval() - { - if (isReady()) return; +template +Array::Array(const Array& parent, const dim4 &dims, const dim_t &offset_, const dim4 &strides) : + info(parent.getDevId(), dims, offset_, strides, (af_dtype)dtype_traits::af_type), + 
data(parent.getData()), data_dims(parent.getDataDims()), + node(), + ready(true), owner(false) +{ } + +template +Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset_, + const T * const in_data, bool is_device) : + info(getActiveDeviceId(), dims, offset_, strides, (af_dtype)dtype_traits::af_type), + data(is_device ? (T*)in_data : memAlloc(info.total()), memFree), + data_dims(dims), + node(), + ready(true), + owner(true) +{ + if (!is_device) { + std::copy(in_data, in_data + info.total(), data.get()); + } +} - this->setId(getActiveDeviceId()); +template +void Array::eval() +{ + if (isReady()) return; + if (getQueue().is_worker()) AF_ERROR("Array not evaluated", AF_ERR_INTERNAL); - data = evalNodes(elements(), dims(), strides(), node); + this->setId(getActiveDeviceId()); - ready = true; - Node_ptr prev = node; - prev->reset(); - // FIXME: Replace the current node in any JIT possible trees with the new BufferNode - node.reset(); - } + data = std::shared_ptr(memAlloc(elements()), memFree); - template - void Array::eval() const - { - if (isReady()) return; - const_cast *>(this)->eval(); - } + getQueue().enqueue(kernel::evalArray, *this); - template - Node_ptr Array::getNode() const - { - if (!node) { + ready = true; + Node_ptr prev = node; + prev->reset(); + // FIXME: Replace the current node in any JIT possible trees with the new BufferNode + node.reset(); +} - unsigned bytes = this->getDataDims().elements() * sizeof(T); +template +void Array::eval() const +{ + if (isReady()) return; + const_cast *>(this)->eval(); +} - BufferNode *buf_node = new BufferNode(data, - bytes, - offset, - dims().get(), - strides().get(), - isLinear()); +template +Node_ptr Array::getNode() const +{ + if (!node) { - const_cast *>(this)->node = Node_ptr(reinterpret_cast(buf_node)); - } + unsigned bytes = this->getDataDims().elements() * sizeof(T); - return node; - } + BufferNode *buf_node = new BufferNode(data, + bytes, + getOffset(), + dims().get(), + strides().get(), + isLinear()); - 
template - Array - createHostDataArray(const dim4 &size, const T * const data) - { - return Array(size, data, false); + const_cast *>(this)->node = Node_ptr(reinterpret_cast(buf_node)); } - template - Array - createDeviceDataArray(const dim4 &size, const void *data) - { - return Array(size, (const T * const) data, true); - } + return node; +} - template - Array - createValueArray(const dim4 &size, const T& value) - { - TNJ::ScalarNode *node = new TNJ::ScalarNode(value); - return createNodeArray(size, TNJ::Node_ptr( - reinterpret_cast(node))); - } +template +Array +createHostDataArray(const dim4 &size, const T * const data) +{ + return Array(size, data, false); +} - template - Array - createEmptyArray(const dim4 &size) - { - return Array(size); - } +template +Array +createDeviceDataArray(const dim4 &size, const void *data) +{ + return Array(size, (const T * const) data, true); +} - template - Array *initArray() { return new Array(dim4(0, 0, 0, 0)); } +template +Array +createValueArray(const dim4 &size, const T& value) +{ + TNJ::ScalarNode *node = new TNJ::ScalarNode(value); + return createNodeArray(size, TNJ::Node_ptr( + reinterpret_cast(node))); +} +template +Array +createEmptyArray(const dim4 &size) +{ + return Array(size); +} - template - Array - createNodeArray(const dim4 &dims, Node_ptr node) - { - Array out = Array(dims, node); +template +Array *initArray() { return new Array(dim4(0, 0, 0, 0)); } - unsigned length =0, buf_count = 0, bytes = 0; +template +Array +createNodeArray(const dim4 &dims, Node_ptr node) +{ + Array out = Array(dims, node); - Node *n = node.get(); - n->getInfo(length, buf_count, bytes); - n->reset(); + unsigned length =0, buf_count = 0, bytes = 0; - if (length > MAX_TNJ_LEN || - buf_count >= MAX_BUFFERS || - bytes >= MAX_BYTES) { - out.eval(); - } + Node *n = node.get(); + n->getInfo(length, buf_count, bytes); + n->reset(); - return out; + if (length > getMaxJitSize() || + buf_count >= getMaxBuffers() || + bytes >= getMaxBytes()) { + 
out.eval(); } + return out; +} - template - Array createSubArray(const Array& parent, - const std::vector &index, - bool copy) - { - parent.eval(); - - dim4 dDims = parent.getDataDims(); - dim4 pDims = parent.dims(); +template +Array createSubArray(const Array& parent, + const std::vector &index, + bool copy) +{ + parent.eval(); - dim4 dims = toDims (index, pDims); - dim4 offset = toOffset(index, dDims); - dim4 stride = toStride (index, dDims); + dim4 dDims = parent.getDataDims(); + dim4 pDims = parent.dims(); - Array out = Array(parent, dims, offset, stride); + dim4 dims = toDims (index, pDims); + dim4 strides = toStride (index, dDims); - if (!copy) return out; + // Find total offsets after indexing + dim4 offsets = toOffset(index, pDims); + dim4 parent_strides = parent.strides(); + dim_t offset = parent.getOffset(); + for (int i = 0; i < 4; i++) offset += offsets[i] * parent_strides[i]; - if (stride[0] != 1 || - stride[1] < 0 || - stride[2] < 0 || - stride[3] < 0) { + Array out = Array(parent, dims, offset, strides); - out = copyArray(out); - } + if (!copy) return out; - return out; - } + if (strides[0] != 1 || + strides[1] < 0 || + strides[2] < 0 || + strides[3] < 0) { - template - void - destroyArray(Array *A) - { - delete A; + out = copyArray(out); } + return out; +} - template - void evalArray(const Array &A) - { - A.eval(); - } +template +void +destroyArray(Array *A) +{ + delete A; +} - template - void - writeHostDataArray(Array &arr, const T * const data, const size_t bytes) - { - if(!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); - } - memcpy(arr.get() + arr.getOffset(), data, bytes); +template +void +writeHostDataArray(Array &arr, const T * const data, const size_t bytes) +{ + if(!arr.isOwner()) { + arr = copyArray(arr); } + arr.eval(); + memcpy(arr.get(), data, bytes); +} - template - void - writeDeviceDataArray(Array &arr, const void * const data, const size_t bytes) - { - if(!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); - } - 
memcpy(arr.get() + arr.getOffset(), (const T * const)data, bytes); +template +void +writeDeviceDataArray(Array &arr, const void * const data, const size_t bytes) +{ + if(!arr.isOwner()) { + arr = copyArray(arr); } + memcpy(arr.get(), (const T * const)data, bytes); +} #define INSTANTIATE(T) \ template Array createHostDataArray (const dim4 &size, const T * const data); \ @@ -275,26 +254,29 @@ namespace cpu const std::vector &index, \ bool copy); \ template void destroyArray (Array *A); \ - template void evalArray (const Array &A); \ template Array createNodeArray (const dim4 &size, TNJ::Node_ptr node); \ template void Array::eval(); \ template void Array::eval() const; \ template Array::Array(af::dim4 dims, const T * const in_data, \ bool is_device, bool copy_device); \ + template Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset, \ + const T * const in_data, \ + bool is_device); \ template TNJ::Node_ptr Array::getNode() const; \ template void writeHostDataArray (Array &arr, const T * const data, const size_t bytes); \ template void writeDeviceDataArray (Array &arr, const void * const data, const size_t bytes); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(short) - INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(short) +INSTANTIATE(ushort) + } diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp index 471a6741ea..2a3afcf617 100644 --- a/src/backend/cpu/Array.hpp +++ b/src/backend/cpu/Array.hpp @@ -20,6 +20,19 @@ #include #include #include +#include +#include + +// cpu::Array class forward declaration +namespace cpu +{ +template class Array; +// kernel::evalArray fn forward declaration 
+namespace kernel +{ +template void evalArray(cpu::Array in); +} +} namespace cpu { @@ -63,9 +76,6 @@ namespace cpu const std::vector &index, bool copy=true); - template - void evalArray(const Array &A); - // Creates a new Array object on the heap and returns a reference to it. template void destroyArray(Array *A); @@ -74,10 +84,16 @@ namespace cpu void *getDevicePtr(const Array& arr) { T *ptr = arr.device(); - memPop(ptr); + memLock(ptr); return (void *)ptr; } + template + void *getRawPtr(const Array& arr) + { + return (void *)(arr.get(false)); + } + // Array Array Implementation template class Array @@ -90,18 +106,22 @@ namespace cpu af::dim4 data_dims; TNJ::Node_ptr node; - dim_t offset; bool ready; bool owner; Array() = default; Array(dim4 dims); + explicit Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device=false); - Array(const Array& parnt, const dim4 &dims, const dim4 &offset, const dim4 &stride); + Array(const Array& parnt, const dim4 &dims, const dim_t &offset, const dim4 &stride); explicit Array(af::dim4 dims, TNJ::Node_ptr n); public: + + Array(af::dim4 dims, af::dim4 strides, dim_t offset, + const T * const in_data, bool is_device = false); + void resetInfo(const af::dim4& dims) { info.resetInfo(dims); } void resetDims(const af::dim4& dims) { info.resetDims(dims); } void modDims(const af::dim4 &newDims) { info.modDims(newDims); } @@ -112,7 +132,6 @@ namespace cpu RET_TYPE NAME() const { return info.NAME(); } INFO_FUNC(const af_dtype& ,getType) - INFO_FUNC(const af::dim4& ,offsets) INFO_FUNC(const af::dim4& ,strides) INFO_FUNC(size_t ,elements) INFO_FUNC(size_t ,ndims) @@ -150,7 +169,7 @@ namespace cpu void eval(); void eval() const; - dim_t getOffset() const { return offset; } + dim_t getOffset() const { return info.getOffset(); } shared_ptr getData() const {return data; } dim4 getDataDims() const @@ -160,8 +179,14 @@ namespace cpu return isOwner() ? 
info.dims() : data_dims; } + void setDataDims(const dim4 &new_dims) + { + data_dims = new_dims; + } + T* device() { + getQueue().sync(); if (!isOwner() || data.use_count() > 1) { *this = Array(dims(), get(), true, true); } @@ -181,7 +206,7 @@ namespace cpu const T* get(bool withOffset = true) const { if (!isReady()) eval(); - return data.get() + (withOffset ? offset : 0); + return data.get() + (withOffset ? getOffset() : 0); } int useCount() const @@ -204,9 +229,11 @@ namespace cpu const std::vector &index, bool copy); + friend void kernel::evalArray(Array in); + friend void destroyArray(Array *arr); - friend void evalArray(const Array &arr); friend void *getDevicePtr(const Array& arr); + friend void *getRawPtr(const Array& arr); }; } diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt index 6ab66245a0..9387323592 100644 --- a/src/backend/cpu/CMakeLists.txt +++ b/src/backend/cpu/CMakeLists.txt @@ -3,14 +3,25 @@ ADD_DEFINITIONS(-DAF_CPU) FIND_PACKAGE(CBLAS REQUIRED) +OPTION(BUILD_CPU_ASYNC "Build CPU backend with ASYNC support" ON) + +IF (NOT ${BUILD_CPU_ASYNC}) + ADD_DEFINITIONS(-DAF_DISABLE_CPU_ASYNC) +ENDIF() + IF(USE_CPU_F77_BLAS) MESSAGE("Using F77 BLAS") ADD_DEFINITIONS(-DUSE_F77_BLAS) ENDIF() -IF(USE_CPU_MKL) - MESSAGE("Using MKL") +IF(USE_CPU_MKL) # Manual MKL Setup + MESSAGE("CPU Backend Using MKL") ADD_DEFINITIONS(-DUSE_MKL) +ELSE(USE_CPU_MKL) + IF(${MKL_FOUND}) # Automatic MKL Setup from BLAS + MESSAGE("CPU Backend Using MKL RT") + ADD_DEFINITIONS(-DUSE_MKL) + ENDIF() ENDIF() IF (NOT CBLAS_LIBRARIES) @@ -23,16 +34,20 @@ IF(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU" AND "${APPLE}") ADD_DEFINITIONS(-flax-vector-conversions) ENDIF() -IF(${MKL_FOUND}) - ADD_DEFINITIONS(-DUSE_MKL) -ENDIF() - FIND_PACKAGE(FFTW REQUIRED) MESSAGE(STATUS "FFTW Found ? 
${FFTW_FOUND}") MESSAGE(STATUS "FFTW Library: ${FFTW_LIBRARIES}") IF(APPLE) - FIND_PACKAGE(LAPACK) + FIND_PACKAGE(LAPACKE QUIET) # For finding MKL + IF(NOT LAPACK_FOUND) + # UNSET THE VARIABLES FROM LAPACKE + UNSET(LAPACKE_LIB CACHE) + UNSET(LAPACK_LIB CACHE) + UNSET(LAPACKE_INCLUDES CACHE) + UNSET(LAPACKE_ROOT_DIR CACHE) + FIND_PACKAGE(LAPACK) + ENDIF() ELSE(APPLE) # Linux and Windows FIND_PACKAGE(LAPACKE) ENDIF(APPLE) @@ -41,15 +56,30 @@ IF(NOT LAPACK_FOUND) MESSAGE(WARNING "LAPACK not found. Functionality will be disabled") ELSE(NOT LAPACK_FOUND) ADD_DEFINITIONS(-DWITH_CPU_LINEAR_ALGEBRA) + MESSAGE(STATUS "LAPACK libraries found: ${LAPACK_LIBRARIES}") ENDIF() IF(NOT UNIX) ADD_DEFINITIONS(-DAFDLL) ENDIF() +SET(THREADS_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/threads") +IF(EXISTS "${THREADS_SRC_DIR}" AND IS_DIRECTORY "${THREADS_SRC_DIR}" + AND EXISTS "${THREADS_SRC_DIR}/LICENSE") + # threads submodule has been initialized + # Nothing to do +ELSE() + MESSAGE(STATUS "threads submodule unavailable. 
Updating submodules.") + EXECUTE_PROCESS( + COMMAND git submodule update --init --recursive + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + ) +ENDIF() + INCLUDE_DIRECTORIES( ${CMAKE_INCLUDE_PATH} "${CMAKE_SOURCE_DIR}/src/backend/cpu" + "${CMAKE_SOURCE_DIR}/src/backend/cpu/threads" ${FFTW_INCLUDES} ${CBLAS_INCLUDE_DIR} ) diff --git a/src/backend/cpu/approx.cpp b/src/backend/cpu/approx.cpp index 2d3beae942..b817b840b4 100644 --- a/src/backend/cpu/approx.cpp +++ b/src/backend/cpu/approx.cpp @@ -9,329 +9,70 @@ #include #include -#include -#include -#include +#include +#include +#include +#include namespace cpu { - /////////////////////////////////////////////////////////////////////////// - // Approx1 - /////////////////////////////////////////////////////////////////////////// - template - struct approx1_op - { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - return; - } - }; - template - struct approx1_op - { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - dim_t pmId = idx; - if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1]; - - const Tp x = pos[pmId]; - bool gFlag = false; - if (x < 0 || idims[0] < x+1) { // No need to check y - gFlag = true; - } - - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] - + idy * ostrides[1] + idx; - if(gFlag) { - out[omId] = scalar(offGrid); - } else { - dim_t ioff = idw * 
istrides[3] + idz * istrides[2] - + idy * istrides[1]; - const dim_t iMem = round(x) + ioff; - - out[omId] = in[iMem]; - } - } - }; - - template - struct approx1_op - { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - dim_t pmId = idx; - if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1]; - - const Tp x = pos[pmId]; - bool gFlag = false; - if (x < 0 || idims[0] < x+1) { - gFlag = true; - } - - const dim_t grid_x = floor(x); // nearest grid - const Tp off_x = x - grid_x; // fractional offset - - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] - + idy * ostrides[1] + idx; - if(gFlag) { - out[omId] = scalar(offGrid); - } else { - dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x; - - // Check if x and x + 1 are both valid indices - bool cond = (x < idims[0] - 1); - // Compute Left and Right Weighted Values - Ty yl = ((Tp)1.0 - off_x) * in[ioff]; - Ty yr = cond ? (off_x) * in[ioff + 1] : scalar(0); - Ty yo = yl + yr; - // Compute Weight used - Tp wt = cond ? 
(Tp)1.0 : (Tp)(1.0 - off_x); - // Write final value - out[omId] = (yo / wt); - } - } - }; - - template - void approx1_(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid) - { - approx1_op op; - bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1); - - for(dim_t w = 0; w < odims[3]; w++) { - for(dim_t z = 0; z < odims[2]; z++) { - for(dim_t y = 0; y < odims[1]; y++) { - for(dim_t x = 0; x < odims[0]; x++) { - op(out, odims, oElems, in, idims, iElems, pos, pdims, - ostrides, istrides, pstrides, offGrid, pBatch, x, y, z, w); - } - } - } - } - } - - template - Array approx1(const Array &in, const Array &pos, - const af_interp_type method, const float offGrid) - { - af::dim4 odims = in.dims(); - odims[0] = pos.dims()[0]; - - // Create output placeholder - Array out = createEmptyArray(odims); - - switch(method) { - case AF_INTERP_NEAREST: - approx1_ - (out.get(), out.dims(), out.elements(), - in.get(), in.dims(), in.elements(), pos.get(), pos.dims(), - out.strides(), in.strides(), pos.strides(), offGrid); - break; - case AF_INTERP_LINEAR: - approx1_ - (out.get(), out.dims(), out.elements(), - in.get(), in.dims(), in.elements(), pos.get(), pos.dims(), - out.strides(), in.strides(), pos.strides(), offGrid); - break; - default: - break; - } - return out; - } - - /////////////////////////////////////////////////////////////////////////// - // Approx2 - /////////////////////////////////////////////////////////////////////////// - template - struct approx2_op - { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 
&qstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - return; - } - }; - - template - struct approx2_op - { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 &qstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - dim_t pmId = idy * pstrides[1] + idx; - dim_t qmId = idy * qstrides[1] + idx; - if(pBatch) { - pmId += idw * pstrides[3] + idz * pstrides[2]; - qmId += idw * qstrides[3] + idz * qstrides[2]; - } - - bool gFlag = false; - const Tp x = pos[pmId], y = qos[qmId]; - if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { - gFlag = true; - } - - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] - + idy * ostrides[1] + idx; - if(gFlag) { - out[omId] = scalar(offGrid); - } else { - const dim_t grid_x = round(x), grid_y = round(y); // nearest grid - const dim_t imId = idw * istrides[3] + idz * istrides[2] + - grid_y * istrides[1] + grid_x; - out[omId] = in[imId]; - } - } - }; - - template - struct approx2_op - { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 &qstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - dim_t pmId = idy * pstrides[1] + idx; - dim_t qmId = idy * qstrides[1] + idx; - if(pBatch) { - pmId += idw * pstrides[3] + idz * pstrides[2]; - qmId += idw * qstrides[3] + idz * qstrides[2]; - } - - bool gFlag = false; - const 
Tp x = pos[pmId], y = qos[qmId]; - if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { - gFlag = true; - } - - const dim_t grid_x = floor(x), grid_y = floor(y); // nearest grid - const Tp off_x = x - grid_x, off_y = y - grid_y; // fractional offset - - // Check if pVal and pVal + 1 are both valid indices - bool condY = (y < idims[1] - 1); - bool condX = (x < idims[0] - 1); - - // Compute wieghts used - Tp wt00 = ((Tp)1.0 - off_x) * ((Tp)1.0 - off_y); - Tp wt10 = (condY) ? ((Tp)1.0 - off_x) * (off_y) : 0; - Tp wt01 = (condX) ? (off_x) * ((Tp)1.0 - off_y) : 0; - Tp wt11 = (condX && condY) ? (off_x) * (off_y) : 0; - - Tp wt = wt00 + wt10 + wt01 + wt11; - Ty zero = scalar(0); - - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] - + idy * ostrides[1] + idx; - if(gFlag) { - out[omId] = scalar(offGrid); - } else { - dim_t ioff = idw * istrides[3] + idz * istrides[2] - + grid_y * istrides[1] + grid_x; - - // Compute Weighted Values - Ty y00 = wt00 * in[ioff]; - Ty y10 = (condY) ? wt10 * in[ioff + istrides[1]] : zero; - Ty y01 = (condX) ? wt01 * in[ioff + 1] : zero; - Ty y11 = (condX && condY) ? 
wt11 * in[ioff + istrides[1] + 1] : zero; - - Ty yo = y00 + y10 + y01 + y11; - - // Write Final Value - out[omId] = (yo / wt); - } - } - }; - - template - void approx2_(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 &qstrides, - const float offGrid) - { - approx2_op op; - bool pBatch = !(pdims[2] == 1 && pdims[3] == 1); - - for(dim_t w = 0; w < odims[3]; w++) { - for(dim_t z = 0; z < odims[2]; z++) { - for(dim_t y = 0; y < odims[1]; y++) { - for(dim_t x = 0; x < odims[0]; x++) { - op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims, - ostrides, istrides, pstrides, qstrides, offGrid, pBatch, x, y, z, w); - } - } - } - } +template +Array approx1(const Array &in, const Array &pos, + const af_interp_type method, const float offGrid) +{ + in.eval(); + pos.eval(); + + af::dim4 odims = in.dims(); + odims[0] = pos.dims()[0]; + + Array out = createEmptyArray(odims); + + switch(method) { + case AF_INTERP_NEAREST: + getQueue().enqueue(kernel::approx1, + out, in, pos, offGrid); + break; + case AF_INTERP_LINEAR: + getQueue().enqueue(kernel::approx1, + out, in, pos, offGrid); + break; + default: + break; } + return out; +} - template - Array approx2(const Array &in, const Array &pos0, const Array &pos1, - const af_interp_type method, const float offGrid) - { - af::dim4 odims = in.dims(); - odims[0] = pos0.dims()[0]; - odims[1] = pos0.dims()[1]; - - // Create output placeholder - Array out = createEmptyArray(odims); - switch(method) { - case AF_INTERP_NEAREST: - approx2_ - (out.get(), out.dims(), out.elements(), - in.get(), in.dims(), in.elements(), - pos0.get(), pos0.dims(), pos1.get(), pos1.dims(), - out.strides(), in.strides(), pos0.strides(), pos1.strides(), - offGrid); - break; - case AF_INTERP_LINEAR: - approx2_ - (out.get(), 
out.dims(), out.elements(), - in.get(), in.dims(), in.elements(), - pos0.get(), pos0.dims(), pos1.get(), pos1.dims(), - out.strides(), in.strides(), pos0.strides(), pos1.strides(), - offGrid); - break; - default: - break; - } - return out; +template +Array approx2(const Array &in, const Array &pos0, const Array &pos1, + const af_interp_type method, const float offGrid) +{ + in.eval(); + pos0.eval(); + pos1.eval(); + + af::dim4 odims = in.dims(); + odims[0] = pos0.dims()[0]; + odims[1] = pos0.dims()[1]; + + Array out = createEmptyArray(odims); + + switch(method) { + case AF_INTERP_NEAREST: + getQueue().enqueue(kernel::approx2, + out, in, pos0, pos1, offGrid); + break; + case AF_INTERP_LINEAR: + getQueue().enqueue(kernel::approx2, + out, in, pos0, pos1, offGrid); + break; + default: + break; } + return out; +} #define INSTANTIATE(Ty, Tp) \ template Array approx1(const Array &in, const Array &pos, \ @@ -340,8 +81,9 @@ namespace cpu const Array &pos1, const af_interp_type method, \ const float offGrid); \ - INSTANTIATE(float , float ) - INSTANTIATE(double , double) - INSTANTIATE(cfloat , float ) - INSTANTIATE(cdouble, double) +INSTANTIATE(float , float ) +INSTANTIATE(double , double) +INSTANTIATE(cfloat , float ) +INSTANTIATE(cdouble, double) + } diff --git a/src/backend/cpu/assign.cpp b/src/backend/cpu/assign.cpp index 623bd52ac7..463b30c733 100644 --- a/src/backend/cpu/assign.cpp +++ b/src/backend/cpu/assign.cpp @@ -12,34 +12,26 @@ #include #include #include +#include #include -#include - -using af::dim4; +#include +#include namespace cpu { -static inline -dim_t trimIndex(int idx, const dim_t &len) -{ - int ret_val = idx; - int offset = abs(ret_val)%len; - if (ret_val<0) { - ret_val = offset-1; - } else if (ret_val>=(int)len) { - ret_val = len-offset-1; - } - return ret_val; -} +using af::dim4; +using std::vector; template void assign(Array& out, const af_index_t idxrs[], const Array& rhs) { - bool isSeq[4]; - std::vector seqs(4, af_span); - // create seq vector to 
retrieve output - // dimensions, offsets & offsets + out.eval(); + rhs.eval(); + + vector isSeq(4); + vector seqs(4, af_span); + // create seq vector to retrieve output dimensions, offsets & offsets for (dim_t x=0; x<4; ++x) { if (idxrs[x].isSeq) { seqs[x] = idxrs[x].idx.seq; @@ -47,68 +39,17 @@ void assign(Array& out, const af_index_t idxrs[], const Array& rhs) isSeq[x] = idxrs[x].isSeq; } - dim4 dDims = out.getDataDims(); - dim4 pDims = out.dims(); - // retrieve dimensions & strides for array - // to which rhs is being copied to - dim4 dst_offsets = toOffset(seqs, dDims); - dim4 dst_strides = toStride(seqs, dDims); - // retrieve rhs array dimenesions & strides - dim4 src_dims = rhs.dims(); - dim4 src_strides = rhs.strides(); - - std::vector< Array > idxArrs(4, createEmptyArray(dim4())); + vector< Array > idxArrs(4, createEmptyArray(dim4())); // look through indexs to read af_array indexs for (dim_t x=0; x<4; ++x) { if (!isSeq[x]) { idxArrs[x] = castArray(idxrs[x].idx.arr); + idxArrs[x].eval(); } } - // declare pointers to af_array index data - const uint* ptr0 = idxArrs[0].get(); - const uint* ptr1 = idxArrs[1].get(); - const uint* ptr2 = idxArrs[2].get(); - const uint* ptr3 = idxArrs[3].get(); - - const T * src= rhs.get(); - T * dst = out.get(); - - for(dim_t l=0; l, out, rhs, std::move(isSeq), + std::move(seqs), std::move(idxArrs)); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/bilateral.cpp index 2d1e4dddff..abd985768d 100644 --- a/src/backend/cpu/bilateral.cpp +++ b/src/backend/cpu/bilateral.cpp @@ -11,89 +11,25 @@ #include #include #include +#include #include #include #include +#include +#include using af::dim4; namespace cpu { -static inline dim_t clamp(int a, dim_t mn, dim_t mx) -{ - return (a < (int)mn ? mn : (a > (int)mx ? 
mx : a)); -} - -static inline unsigned getIdx(const dim4 &strides, - int i, int j = 0, int k = 0, int l = 0) -{ - return (l * strides[3] + - k * strides[2] + - j * strides[1] + - i * strides[0]); -} - template Array bilateral(const Array &in, const float &s_sigma, const float &c_sigma) { + in.eval(); const dim4 dims = in.dims(); - const dim4 istrides = in.strides(); - Array out = createEmptyArray(dims); - const dim4 ostrides = out.strides(); - - outType *outData = out.get(); - const inType * inData = in.get(); - - // clamp spatical and chromatic sigma's - float space_ = std::min(11.5f, std::max(s_sigma, 0.f)); - float color_ = std::max(c_sigma, 0.f); - const dim_t radius = std::max((dim_t)(space_ * 1.5f), (dim_t)1); - const float svar = space_*space_; - const float cvar = color_*color_; - - for(dim_t b3=0; b3, out, in, s_sigma, c_sigma); return out; } diff --git a/src/backend/cpu/blas.cpp b/src/backend/cpu/blas.cpp index d01998bafb..3ecb502ffa 100644 --- a/src/backend/cpu/blas.cpp +++ b/src/backend/cpu/blas.cpp @@ -11,18 +11,20 @@ #include #include #include -#include #include +#include +#include +#include namespace cpu { - using std::add_const; - using std::add_pointer; - using std::enable_if; - using std::is_floating_point; - using std::remove_const; - using std::conditional; +using std::add_const; +using std::add_pointer; +using std::enable_if; +using std::is_floating_point; +using std::remove_const; +using std::conditional; // Some implementations of BLAS require void* for complex pointers while others use float*/double* // @@ -145,6 +147,9 @@ template Array matmul(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { + lhs.eval(); + rhs.eval(); + CBLAS_TRANSPOSE lOpts = toCblasTranspose(optLhs); CBLAS_TRANSPOSE rOpts = toCblasTranspose(optRhs); @@ -158,77 +163,60 @@ Array matmul(const Array &lhs, const Array &rhs, int N = rDims[bColDim]; int K = lDims[aColDim]; - //FIXME: Leaks on errors. 
- Array out = createEmptyArray(af::dim4(M, N, 1, 1)); - auto alpha = getScale(); - auto beta = getScale(); - - dim4 lStrides = lhs.strides(); - dim4 rStrides = rhs.strides(); using BT = typename blas_base::type; using CBT = const typename blas_base::type; - if(rDims[bColDim] == 1) { - N = lDims[aColDim]; - gemv_func()( - CblasColMajor, lOpts, - lDims[0], lDims[1], - alpha, - reinterpret_cast(lhs.get()), lStrides[1], - reinterpret_cast(rhs.get()), rStrides[0], - beta, - reinterpret_cast(out.get()), 1); - } else { - gemm_func()( - CblasColMajor, lOpts, rOpts, - M, N, K, - alpha, - reinterpret_cast(lhs.get()), lStrides[1], - reinterpret_cast(rhs.get()), rStrides[1], - beta, - reinterpret_cast(out.get()), out.dims()[0]); - } + Array out = createEmptyArray(af::dim4(M, N, 1, 1)); + auto func = [=] (Array output, const Array left, const Array right) { + auto alpha = getScale(); + auto beta = getScale(); + + dim4 lStrides = left.strides(); + dim4 rStrides = right.strides(); + + if(rDims[bColDim] == 1) { + gemv_func()( + CblasColMajor, lOpts, + lDims[0], lDims[1], + alpha, + reinterpret_cast(left.get()), lStrides[1], + reinterpret_cast(right.get()), rStrides[0], + beta, + reinterpret_cast(output.get()), 1); + } else { + gemm_func()( + CblasColMajor, lOpts, rOpts, + M, N, K, + alpha, + reinterpret_cast(left.get()), lStrides[1], + reinterpret_cast(right.get()), rStrides[1], + beta, + reinterpret_cast(output.get()), output.dims()[0]); + } + }; + getQueue().enqueue(func, out, lhs, rhs); return out; } -template T -conj(T x) { return x; } - -template<> cfloat conj (cfloat c) { return std::conj(c); } -template<> cdouble conj(cdouble c) { return std::conj(c); } - -template -Array dot_(const Array &lhs, const Array &rhs, - af_mat_prop optLhs, af_mat_prop optRhs) -{ - int N = lhs.dims()[0]; - - T out = 0; - const T *pL = lhs.get(); - const T *pR = rhs.get(); - - for(int i = 0; i < N; i++) - out += (conjugate ? 
cpu::conj(pL[i]) : pL[i]) * pR[i]; - - if(both_conjugate) out = cpu::conj(out); - - return createValueArray(af::dim4(1), out); -} - template Array dot(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { + lhs.eval(); + rhs.eval(); + + Array out = createEmptyArray(af::dim4(1)); if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) { - return dot_(lhs, rhs, optLhs, optRhs); + getQueue().enqueue(kernel::dot, out, lhs, rhs, optLhs, optRhs); } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) { - return dot_(lhs, rhs, optLhs, optRhs); + getQueue().enqueue(kernel::dot,out, lhs, rhs, optLhs, optRhs); } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) { - return dot_(rhs, lhs, optRhs, optLhs); + getQueue().enqueue(kernel::dot,out, rhs, lhs, optRhs, optLhs); } else { - return dot_(lhs, rhs, optLhs, optRhs); + getQueue().enqueue(kernel::dot,out, lhs, rhs, optLhs, optRhs); } + return out; } #undef BT diff --git a/src/backend/cpu/blas.hpp b/src/backend/cpu/blas.hpp index 117d3a2145..3f5b7451ad 100644 --- a/src/backend/cpu/blas.hpp +++ b/src/backend/cpu/blas.hpp @@ -10,17 +10,18 @@ #include #include #include +#include -#ifdef __APPLE__ -#include -#else #ifdef USE_MKL -#include + #include #else -extern "C" { -#include -} -#endif + #ifdef __APPLE__ + #include + #else + extern "C" { + #include + } + #endif #endif // TODO: Ask upstream for a more official way to detect it diff --git a/src/backend/cpu/cholesky.cpp b/src/backend/cpu/cholesky.cpp index 57beaa4146..5e393f0082 100644 --- a/src/backend/cpu/cholesky.cpp +++ b/src/backend/cpu/cholesky.cpp @@ -18,8 +18,9 @@ #include #include #include - #include +#include +#include namespace cpu { @@ -46,6 +47,8 @@ CH_FUNC(potrf , cdouble, z) template Array cholesky(int *info, const Array &in, const bool is_upper) { + in.eval(); + Array out = copyArray(in); *info = cholesky_inplace(out, is_upper); @@ -58,6 +61,8 @@ Array cholesky(int *info, const Array &in, const bool is_upper) template int 
cholesky_inplace(Array &in, const bool is_upper) { + in.eval(); + dim4 iDims = in.dims(); int N = iDims[0]; @@ -65,8 +70,13 @@ int cholesky_inplace(Array &in, const bool is_upper) if(is_upper) uplo = 'U'; - int info = potrf_func()(AF_LAPACK_COL_MAJOR, uplo, - N, in.get(), in.strides()[1]); + int info = 0; + auto func = [&] (int& info, Array& in) { + info = potrf_func()(AF_LAPACK_COL_MAJOR, uplo, N, in.get(), in.strides()[1]); + }; + + getQueue().enqueue(func, info, in); + getQueue().sync(); return info; } diff --git a/src/backend/cpu/convolve.cpp b/src/backend/cpu/convolve.cpp index 77d7daa5cd..8218a3f9a3 100644 --- a/src/backend/cpu/convolve.cpp +++ b/src/backend/cpu/convolve.cpp @@ -14,176 +14,23 @@ #include #include #include +#include +#include +#include using af::dim4; namespace cpu { -template -void one2one_1d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims, - dim4 const &sDims, dim4 const &fDims, dim4 const &sStrides) -{ - dim_t start = (expand ? 0 : fDims[0]/2); - dim_t end = (expand ? oDims[0] : start + sDims[0]); - for(dim_t i=start; i=0 &&iIdx -void one2one_2d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims, - dim4 const &sDims, dim4 const &fDims, dim4 const &oStrides, - dim4 const &sStrides, dim4 const &fStrides) -{ - dim_t jStart = (expand ? 0 : fDims[1]/2); - dim_t jEnd = (expand ? oDims[1] : jStart + sDims[1]); - dim_t iStart = (expand ? 0 : fDims[0]/2); - dim_t iEnd = (expand ? oDims[0] : iStart + sDims[0]); - - for(dim_t j=jStart; j=0 && jIdx=0 && iIdx -void one2one_3d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims, - dim4 const &sDims, dim4 const &fDims, dim4 const &oStrides, - dim4 const &sStrides, dim4 const &fStrides) -{ - dim_t kStart = (expand ? 0 : fDims[2]/2); - dim_t kEnd = (expand ? oDims[2] : kStart + sDims[2]); - dim_t jStart = (expand ? 0 : fDims[1]/2); - dim_t jEnd = (expand ? oDims[1] : jStart + sDims[1]); - dim_t iStart = (expand ? 0 : fDims[0]/2); - dim_t iEnd = (expand ? 
oDims[0] : iStart + sDims[0]); - - for(dim_t k=kStart; k=0 && kIdx=0 && jIdx=0 && iIdx -void convolve_nd(T *optr, T const *iptr, accT const *fptr, - dim4 const &oDims, dim4 const &sDims, dim4 const &fDims, - dim4 const &oStrides, dim4 const &sStrides, dim4 const &fStrides, - ConvolveBatchKind kind) -{ - dim_t out_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ - dim_t in_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ - dim_t filt_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ - dim_t batch[4] = {0, 1, 1, 1}; /* first value is never used, and declared for code simplicity */ - - for (dim_t i=1; i<4; ++i) { - switch(kind) { - case CONVOLVE_BATCH_SIGNAL: - out_step[i] = oStrides[i]; - in_step[i] = sStrides[i]; - if (i>=baseDim) batch[i] = sDims[i]; - break; - case CONVOLVE_BATCH_SAME: - out_step[i] = oStrides[i]; - in_step[i] = sStrides[i]; - filt_step[i] = fStrides[i]; - if (i>=baseDim) batch[i] = sDims[i]; - break; - case CONVOLVE_BATCH_KERNEL: - out_step[i] = oStrides[i]; - filt_step[i] = fStrides[i]; - if (i>=baseDim) batch[i] = fDims[i]; - break; - default: - break; - } - } - - for (dim_t b3=0; b3(out, in, filt, oDims, sDims, fDims, sStrides); break; - case 2: one2one_2d(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break; - case 3: one2one_3d(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break; - } - } - } - } -} - template Array convolve(Array const& signal, Array const& filter, ConvolveBatchKind kind) { + signal.eval(); + filter.eval(); + auto sDims = signal.dims(); auto fDims = filter.dims(); - auto sStrides = signal.strides(); dim4 oDims(1); if (expand) { @@ -204,99 +51,37 @@ Array convolve(Array const& signal, Array const& filter, ConvolveBat Array out = createEmptyArray(oDims); - convolve_nd(out.get(), signal.get(), filter.get(), - oDims, sDims, fDims, out.strides(), sStrides, 
filter.strides(), kind); + getQueue().enqueue(kernel::convolve_nd,out, signal, filter, kind); return out; } -template -void convolve2_separable(T *optr, T const *iptr, accT const *fptr, - dim4 const &oDims, dim4 const &sDims, dim4 const &orgDims, dim_t fDim, - dim4 const &oStrides, dim4 const &sStrides, dim_t fStride) -{ - for(dim_t j=0; j>1); - - for(dim_t i=0; i>1); - - accT accum = scalar(0); - - for(dim_t f=0; f=0 && offi=0 && cj(0)); - } else { - dim_t offj = cj - f; - bool isCIValid = ci>=0 && ci=0 && offj(0)); - } - - accum += accT(s_val * f_val); - } - optr[iOff+jOff] = T(accum); - } - } -} - template Array convolve2(Array const& signal, Array const& c_filter, Array const& r_filter) { - auto sDims = signal.dims(); - auto cfDims = c_filter.dims(); - auto rfDims = r_filter.dims(); - auto sStrides = signal.strides(); - - dim_t cflen = (dim_t)cfDims.elements(); - dim_t rflen = (dim_t)rfDims.elements(); + signal.eval(); + c_filter.eval(); + r_filter.eval(); + auto sDims = signal.dims(); dim4 tDims = sDims; dim4 oDims = sDims; if (expand) { + auto cfDims = c_filter.dims(); + auto rfDims = r_filter.dims(); + + dim_t cflen = (dim_t)cfDims.elements(); + dim_t rflen = (dim_t)rfDims.elements(); // separable convolve only does CONVOLVE_BATCH_NONE and standard batch(CONVOLVE_BATCH_SIGNAL) tDims[0] += cflen - 1; oDims[0] += cflen - 1; oDims[1] += rflen - 1; } - Array temp = createEmptyArray(tDims); Array out = createEmptyArray(oDims); - auto tStrides = temp.strides(); - auto oStrides = out.strides(); - - for (dim_t b3=0; b3(tptr, iptr, c_filter.get(), - tDims, sDims, sDims, cflen, - tStrides, sStrides, c_filter.strides()[0]); - - convolve2_separable(optr, tptr, r_filter.get(), - oDims, tDims, sDims, rflen, - oStrides, tStrides, r_filter.strides()[0]); - } - } + getQueue().enqueue(kernel::convolve2, out, signal, c_filter, r_filter, tDims); return out; } diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index 87e4480a36..27e80f8afb 100644 --- 
a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp @@ -18,139 +18,79 @@ #include #include #include +#include +#include +#include namespace cpu { - template - static void stridedCopy(T* dst, const dim4& ostrides, const T* src, const dim4 &dims, const dim4 &strides, unsigned dim) - { - if(dim == 0) { - if(strides[dim] == 1) { - //FIXME: Check for errors / exceptions - memcpy(dst, src, dims[dim] * sizeof(T)); - } else { - for(dim_t i = 0; i < dims[dim]; i++) { - dst[i] = src[strides[dim]*i]; - } - } - } else { - for(dim_t i = dims[dim]; i > 0; i--) { - stridedCopy(dst, ostrides, src, dims, strides, dim - 1); - src += strides[dim]; - dst += ostrides[dim]; - } - } - } - - // Assigns to single elements - template - void copyData(T *to, const Array &from) - { - if(from.isOwner()) { - // FIXME: Check for errors / exceptions - memcpy(to, from.get(), from.elements()*sizeof(T)); - } else { - dim4 ostrides = calcStrides(from.dims()); - stridedCopy(to, ostrides, from.get(), from.dims(), from.strides(), from.ndims() - 1); - } - } - - template - Array copyArray(const Array &A) - { - Array out = createEmptyArray(A.dims()); - copyData(out.get(), A); - return out; - } - template - static void copy(Array &dst, const Array &src, outType default_value, double factor) - { - dim4 src_dims = src.dims(); - dim4 dst_dims = dst.dims(); - dim4 src_strides = src.strides(); - dim4 dst_strides = dst.strides(); - - const inType * src_ptr = src.get(); - outType * dst_ptr = dst.get(); - - dim_t trgt_l = std::min(dst_dims[3], src_dims[3]); - dim_t trgt_k = std::min(dst_dims[2], src_dims[2]); - dim_t trgt_j = std::min(dst_dims[1], src_dims[1]); - dim_t trgt_i = std::min(dst_dims[0], src_dims[0]); - - for(dim_t l=0; l +void copyData(T *to, const Array &from) +{ + from.eval(); + getQueue().sync(); + if(from.isLinear()) { + // FIXME: Check for errors / exceptions + memcpy(to, from.get(), from.elements()*sizeof(T)); + } else { + dim4 ostrides = calcStrides(from.dims()); + kernel::stridedCopy(to, 
ostrides, from.get(), from.dims(), from.strides(), from.ndims() - 1); } +} - template - void multiply_inplace(Array &in, double val) - { - copy(in, in, 0, val); - } +template +Array copyArray(const Array &A) +{ + A.eval(); + Array out = createEmptyArray(A.dims()); + getQueue().enqueue(kernel::copy, out, A, scalar(0), 1.0); + return out; +} - template - Array - padArray(Array const &in, dim4 const &dims, - outType default_value, double factor) - { - Array ret = createValueArray(dims, default_value); - copy(ret, in, outType(default_value), factor); - return ret; - } +template +void multiply_inplace(Array &in, double val) +{ + in.eval(); + getQueue().enqueue(kernel::copy, in, in, 0, val); +} - template - void copyArray(Array &out, Array const &in) - { - copy(out, in, scalar(0), 1.0); - } +template +Array padArray(Array const &in, dim4 const &dims, + outType default_value, double factor) +{ + Array ret = createValueArray(dims, default_value); + ret.eval(); + in.eval(); + getQueue().enqueue(kernel::copy, ret, in, outType(default_value), factor); + return ret; +} +template +void copyArray(Array &out, Array const &in) +{ + out.eval(); + in.eval(); + getQueue().enqueue(kernel::copy, out, in, scalar(0), 1.0); +} #define INSTANTIATE(T) \ template void copyData (T *data, const Array &from); \ template Array copyArray(const Array &A); \ template void multiply_inplace (Array &in, double norm); \ - INSTANTIATE(float ) - INSTANTIATE(double ) - INSTANTIATE(cfloat ) - INSTANTIATE(cdouble) - INSTANTIATE(int ) - INSTANTIATE(uint ) - INSTANTIATE(uchar ) - INSTANTIATE(char ) - INSTANTIATE(intl ) - INSTANTIATE(uintl ) - INSTANTIATE(short ) - INSTANTIATE(ushort ) +INSTANTIATE(float ) +INSTANTIATE(double ) +INSTANTIATE(cfloat ) +INSTANTIATE(cdouble) +INSTANTIATE(int ) +INSTANTIATE(uint ) +INSTANTIATE(uchar ) +INSTANTIATE(char ) +INSTANTIATE(intl ) +INSTANTIATE(uintl ) +INSTANTIATE(short ) +INSTANTIATE(ushort ) #define INSTANTIATE_PAD_ARRAY(SRC_T) \ @@ -179,16 +119,16 @@ namespace cpu 
template void copyArray(Array &dst, Array const &src); \ template void copyArray(Array &dst, Array const &src); - INSTANTIATE_PAD_ARRAY(float ) - INSTANTIATE_PAD_ARRAY(double) - INSTANTIATE_PAD_ARRAY(int ) - INSTANTIATE_PAD_ARRAY(uint ) - INSTANTIATE_PAD_ARRAY(intl ) - INSTANTIATE_PAD_ARRAY(uintl ) - INSTANTIATE_PAD_ARRAY(uchar ) - INSTANTIATE_PAD_ARRAY(char ) - INSTANTIATE_PAD_ARRAY(ushort) - INSTANTIATE_PAD_ARRAY(short ) +INSTANTIATE_PAD_ARRAY(float ) +INSTANTIATE_PAD_ARRAY(double) +INSTANTIATE_PAD_ARRAY(int ) +INSTANTIATE_PAD_ARRAY(uint ) +INSTANTIATE_PAD_ARRAY(intl ) +INSTANTIATE_PAD_ARRAY(uintl ) +INSTANTIATE_PAD_ARRAY(uchar ) +INSTANTIATE_PAD_ARRAY(char ) +INSTANTIATE_PAD_ARRAY(ushort) +INSTANTIATE_PAD_ARRAY(short ) #define INSTANTIATE_PAD_ARRAY_COMPLEX(SRC_T) \ template Array padArray(Array const &src, dim4 const &dims, cfloat default_value, double factor); \ @@ -196,8 +136,8 @@ namespace cpu template void copyArray(Array &dst, Array const &src); \ template void copyArray(Array &dst, Array const &src); - INSTANTIATE_PAD_ARRAY_COMPLEX(cfloat ) - INSTANTIATE_PAD_ARRAY_COMPLEX(cdouble) +INSTANTIATE_PAD_ARRAY_COMPLEX(cfloat ) +INSTANTIATE_PAD_ARRAY_COMPLEX(cdouble) #define SPECILIAZE_UNUSED_COPYARRAY(SRC_T, DST_T) \ template<> void copyArray(Array &out, Array const &in) \ @@ -205,25 +145,25 @@ namespace cpu CPU_NOT_SUPPORTED();\ } - SPECILIAZE_UNUSED_COPYARRAY(cfloat , double) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , float) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , uchar) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , char) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , uint) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , int) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , intl) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , uintl) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , short) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , ushort) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, double) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, float) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, uchar) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, char) - 
SPECILIAZE_UNUSED_COPYARRAY(cdouble, uint) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, int) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, intl) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, uintl) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, short) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, ushort) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , double) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , float) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , uchar) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , char) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , uint) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , int) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , intl) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , uintl) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , short) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , ushort) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, double) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, float) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, uchar) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, char) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, uint) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, int) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, intl) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, uintl) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, short) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, ushort) } diff --git a/src/backend/cpu/diagonal.cpp b/src/backend/cpu/diagonal.cpp index d949a24437..c818f82795 100644 --- a/src/backend/cpu/diagonal.cpp +++ b/src/backend/cpu/diagonal.cpp @@ -10,83 +10,61 @@ #include #include #include +#include #include #include #include #include +#include +#include +#include namespace cpu { - template - Array diagCreate(const Array &in, const int num) - { - int size = in.dims()[0] + std::abs(num); - int batch = in.dims()[1]; - Array out = createEmptyArray(dim4(size, size, batch)); - const T *iptr = in.get(); - T *optr = out.get(); - - for (int k = 0; k < batch; k++) { - for (int j = 0; j < size; j++) { - for (int i = 0; i < size; i++) { - T val = scalar(0); - if (i == j - num) { - val = (num > 0) ? 
iptr[i] : iptr[j]; - } - optr[i + j * out.strides()[1]] = val; - } - } - optr += out.strides()[2]; - iptr += in.strides()[1]; - } - - return out; - } +template +Array diagCreate(const Array &in, const int num) +{ + in.eval(); - template - Array diagExtract(const Array &in, const int num) - { - const dim_t *idims = in.dims().get(); - dim_t size = std::max(idims[0], idims[1]) - std::abs(num); - Array out = createEmptyArray(dim4(size, 1, idims[2], idims[3])); + int size = in.dims()[0] + std::abs(num); + int batch = in.dims()[1]; + Array out = createEmptyArray(dim4(size, size, batch)); - const dim_t *odims = out.dims().get(); + getQueue().enqueue(kernel::diagCreate, out, in, num); - const int i_off = (num > 0) ? (num * in.strides()[1]) : (-num); + return out; +} - for (int l = 0; l < (int)odims[3]; l++) { +template +Array diagExtract(const Array &in, const int num) +{ + in.eval(); - for (int k = 0; k < (int)odims[2]; k++) { - const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off; - T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2]; + const dim4 idims = in.dims(); + dim_t size = std::max(idims[0], idims[1]) - std::abs(num); + Array out = createEmptyArray(dim4(size, 1, idims[2], idims[3])); - for (int i = 0; i < (int)odims[0]; i++) { - T val = scalar(0); - if (i < idims[0] && i < idims[1]) val = iptr[i * in.strides()[1] + i]; - optr[i] = val; - } - } - } + getQueue().enqueue(kernel::diagExtract, out, in, num); - return out; - } + return out; +} #define INSTANTIATE_DIAGONAL(T) \ template Array diagExtract (const Array &in, const int num); \ template Array diagCreate (const Array &in, const int num); - INSTANTIATE_DIAGONAL(float) - INSTANTIATE_DIAGONAL(double) - INSTANTIATE_DIAGONAL(cfloat) - INSTANTIATE_DIAGONAL(cdouble) - INSTANTIATE_DIAGONAL(int) - INSTANTIATE_DIAGONAL(uint) - INSTANTIATE_DIAGONAL(intl) - INSTANTIATE_DIAGONAL(uintl) - INSTANTIATE_DIAGONAL(char) - INSTANTIATE_DIAGONAL(uchar) - INSTANTIATE_DIAGONAL(short) - 
INSTANTIATE_DIAGONAL(ushort) +INSTANTIATE_DIAGONAL(float) +INSTANTIATE_DIAGONAL(double) +INSTANTIATE_DIAGONAL(cfloat) +INSTANTIATE_DIAGONAL(cdouble) +INSTANTIATE_DIAGONAL(int) +INSTANTIATE_DIAGONAL(uint) +INSTANTIATE_DIAGONAL(intl) +INSTANTIATE_DIAGONAL(uintl) +INSTANTIATE_DIAGONAL(char) +INSTANTIATE_DIAGONAL(uchar) +INSTANTIATE_DIAGONAL(short) +INSTANTIATE_DIAGONAL(ushort) } diff --git a/src/backend/cpu/diff.cpp b/src/backend/cpu/diff.cpp index 063a761baf..1e374e95da 100644 --- a/src/backend/cpu/diff.cpp +++ b/src/backend/cpu/diff.cpp @@ -9,117 +9,60 @@ #include #include -#include -#include +#include +#include +#include namespace cpu { - unsigned getIdx(af::dim4 strides, af::dim4 offs, int i, int j = 0, int k = 0, int l = 0) - { - return (l * strides[3] + - k * strides[2] + - j * strides[1] + - i); - } - template - Array diff1(const Array &in, const int dim) - { - // Bool for dimension - bool is_dim0 = dim == 0; - bool is_dim1 = dim == 1; - bool is_dim2 = dim == 2; - bool is_dim3 = dim == 3; - - // Decrement dimension of select dimension - af::dim4 dims = in.dims(); - dims[dim]--; - - // Create output placeholder - Array outArray = createValueArray(dims, (T)0); +template +Array diff1(const Array &in, const int dim) +{ + in.eval(); - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = outArray.get(); + // Decrement dimension of select dimension + af::dim4 dims = in.dims(); + dims[dim]--; - // TODO: Improve this - for(dim_t l = 0; l < dims[3]; l++) { - for(dim_t k = 0; k < dims[2]; k++) { - for(dim_t j = 0; j < dims[1]; j++) { - for(dim_t i = 0; i < dims[0]; i++) { - // Operation: out[index] = in[index + 1 * dim_size] - in[index] - int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); - int jdx = getIdx(in.strides(), in.offsets(), - i + is_dim0, j + is_dim1, - k + is_dim2, l + is_dim3); - int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l); - outPtr[odx] = inPtr[jdx] - inPtr[idx]; - } - } - } - } + Array outArray = 
createEmptyArray(dims); - return outArray; - } + getQueue().enqueue(kernel::diff1, outArray, in, dim); - template - Array diff2(const Array &in, const int dim) - { - // Bool for dimension - bool is_dim0 = dim == 0; - bool is_dim1 = dim == 1; - bool is_dim2 = dim == 2; - bool is_dim3 = dim == 3; + return outArray; +} - // Decrement dimension of select dimension - af::dim4 dims = in.dims(); - dims[dim] -= 2; +template +Array diff2(const Array &in, const int dim) +{ + in.eval(); - // Create output placeholder - Array outArray = createValueArray(dims, (T)0); + // Decrement dimension of select dimension + af::dim4 dims = in.dims(); + dims[dim] -= 2; - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = outArray.get(); + Array outArray = createEmptyArray(dims); - // TODO: Improve this - for(dim_t l = 0; l < dims[3]; l++) { - for(dim_t k = 0; k < dims[2]; k++) { - for(dim_t j = 0; j < dims[1]; j++) { - for(dim_t i = 0; i < dims[0]; i++) { - // Operation: out[index] = in[index + 1 * dim_size] - in[index] - int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); - int jdx = getIdx(in.strides(), in.offsets(), - i + is_dim0, j + is_dim1, - k + is_dim2, l + is_dim3); - int kdx = getIdx(in.strides(), in.offsets(), - i + 2 * is_dim0, j + 2 * is_dim1, - k + 2 * is_dim2, l + 2 * is_dim3); - int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l); - outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx]; - } - } - } - } + getQueue().enqueue(kernel::diff2, outArray, in, dim); - return outArray; - } + return outArray; +} #define INSTANTIATE(T) \ template Array diff1 (const Array &in, const int dim); \ template Array diff2 (const Array &in, const int dim); \ +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(ushort) +INSTANTIATE(short) - INSTANTIATE(float) - INSTANTIATE(double) - 
INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(ushort) - INSTANTIATE(short) } diff --git a/src/backend/cpu/exampleFunction.cpp b/src/backend/cpu/exampleFunction.cpp index a9e7bca9eb..0eb86462e1 100644 --- a/src/backend/cpu/exampleFunction.cpp +++ b/src/backend/cpu/exampleFunction.cpp @@ -24,6 +24,13 @@ namespace cpu template Array exampleFunction(const Array &in, const af_someenum_t method) { + in.eval(); // All input Arrays should call eval mandatorily + // in CPU backend function implementations. Since + // the cpu fns are asynchronous launches, any Arrays + // that are either views/JIT nodes needs to evaluated + // before they are passed onto functions that are + // enqueued onto the queues. + dim4 outputDims; // this should be '= in.dims();' in most cases // but would definitely depend on the type of // algorithm you are implementing. @@ -37,7 +44,7 @@ Array exampleFunction(const Array &in, const af_someenum_t method) //dim4 in_dims = in.dims(); // you can retrieve dimensions - //dim4 in_offsets = in.offsets(); // you can retrieve offsets - used when given array + //dim_t in_offset = in.getOffset(); // you can retrieve the offset - used when given array // is an sub-array pointing to some other array and // doesn't have memory of its own @@ -70,4 +77,3 @@ INSTANTIATE(cfloat) INSTANTIATE(cdouble) } - diff --git a/src/backend/cpu/fast.cpp b/src/backend/cpu/fast.cpp index 1c8069c24d..954f457cf4 100644 --- a/src/backend/cpu/fast.cpp +++ b/src/backend/cpu/fast.cpp @@ -14,240 +14,23 @@ #include #include #include +#include +#include +#include using af::dim4; namespace cpu { -inline int clamp(int f, int a, int b) -{ - return std::max(a, std::min(f, b)); -} - -inline int idx_y(int i) -{ - if (i >= 8) - return clamp(-(i-8-4), -3, 3); - - return clamp(i-4, -3, 3); -} - -inline int idx_x(int i) -{ - if (i < 12) - return idx_y(i+4); - - return 
idx_y(i-12); -} - -inline int idx(int y, int x, unsigned idim0) -{ - return x * idim0 + y; -} - -// test_greater() -// Tests if a pixel x > p + thr -inline int test_greater(float x, float p, float thr) -{ - return (x >= p + thr); -} - -// test_smaller() -// Tests if a pixel x < p - thr -inline int test_smaller(float x, float p, float thr) -{ - return (x <= p - thr); -} - -// test_pixel() -// Returns -1 when x < p - thr -// Returns 0 when x >= p - thr && x <= p + thr -// Returns 1 when x > p + thr -template -inline int test_pixel(const T* image, const float p, float thr, int y, int x, unsigned idim0) -{ - return -test_smaller((float)image[idx(y,x,idim0)], p, thr) | test_greater((float)image[idx(y,x,idim0)], p, thr); -} - -// abs_diff() -// Returns absolute difference of x and y -inline int abs_diff(int x, int y) -{ - return abs(x - y); -} -inline unsigned abs_diff(unsigned x, unsigned y) -{ - return (unsigned)abs((int)x - (int)y); -} -inline float abs_diff(float x, float y) -{ - return fabs(x - y); -} -inline double abs_diff(double x, double y) -{ - return fabs(x - y); -} - -template -void locate_features( - const Array &in, - Array &score, - Array &x_out, - Array &y_out, - Array &score_out, - unsigned* count, - const float thr, - const unsigned arc_length, - const unsigned nonmax, - const unsigned max_feat, - const unsigned edge) -{ - dim4 in_dims = in.dims(); - const T* in_ptr = in.get(); - - for (int y = edge; y < (int)(in_dims[0] - edge); y++) { - for (int x = edge; x < (int)(in_dims[1] - edge); x++) { - float p = in_ptr[idx(y, x, in_dims[0])]; - - // Start by testing opposite pixels of the circle that will result in - // a non-kepoint - int d; - d = test_pixel(in_ptr, p, thr, y-3, x, in_dims[0]) | test_pixel(in_ptr, p, thr, y+3, x, in_dims[0]); - if (d == 0) - continue; - - d &= test_pixel(in_ptr, p, thr, y-2, x+2, in_dims[0]) | test_pixel(in_ptr, p, thr, y+2, x-2, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y , x+3, in_dims[0]) | test_pixel(in_ptr, p, thr, 
y , x-3, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y+2, x+2, in_dims[0]) | test_pixel(in_ptr, p, thr, y-2, x-2, in_dims[0]); - if (d == 0) - continue; - - d &= test_pixel(in_ptr, p, thr, y-3, x+1, in_dims[0]) | test_pixel(in_ptr, p, thr, y+3, x-1, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y-1, x+3, in_dims[0]) | test_pixel(in_ptr, p, thr, y+1, x-3, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y+1, x+3, in_dims[0]) | test_pixel(in_ptr, p, thr, y-1, x-3, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y+3, x+1, in_dims[0]) | test_pixel(in_ptr, p, thr, y-3, x-1, in_dims[0]); - if (d == 0) - continue; - - int sum = 0; - - // Sum responses [-1, 0 or 1] of first arc_length pixels - for (int i = 0; i < static_cast(arc_length); i++) - sum += test_pixel(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]); - - // Test maximum and mininmum responses of first segment of arc_length - // pixels - int max_sum = 0, min_sum = 0; - max_sum = std::max(max_sum, sum); - min_sum = std::min(min_sum, sum); - - // Sum responses and test the remaining 16-arc_length pixels of the circle - for (int i = arc_length; i < 16; i++) { - sum -= test_pixel(in_ptr, p, thr, y+idx_y(i-arc_length), x+idx_x(i-arc_length), in_dims[0]); - sum += test_pixel(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]); - max_sum = std::max(max_sum, sum); - min_sum = std::min(min_sum, sum); - } - - // To completely test all possible segments, it's necessary to test - // segments that include the top junction of the circle - for (int i = 0; i < static_cast(arc_length-1); i++) { - sum -= test_pixel(in_ptr, p, thr, y+idx_y(16-arc_length+i), x+idx_x(16-arc_length+i), in_dims[0]); - sum += test_pixel(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]); - max_sum = std::max(max_sum, sum); - min_sum = std::min(min_sum, sum); - } - - float s_bright = 0, s_dark = 0; - for (int i = 0; i < 16; i++) { - float p_x = (float)in_ptr[idx(y+idx_y(i), x+idx_x(i), in_dims[0])]; - - s_bright += test_greater(p_x, p, thr) * 
(abs_diff(p_x, p) - thr); - s_dark += test_smaller(p_x, p, thr) * (abs_diff(p, p_x) - thr); - } - - // If sum at some point was equal to (+-)arc_length, there is a segment - // that for which all pixels are much brighter or much brighter than - // central pixel p. - if (max_sum == static_cast(arc_length) || min_sum == -static_cast(arc_length)) { - unsigned j = *count; - ++*count; - if (j < max_feat) { - float *x_out_ptr = x_out.get(); - float *y_out_ptr = y_out.get(); - float *score_out_ptr = score_out.get(); - x_out_ptr[j] = static_cast(x); - y_out_ptr[j] = static_cast(y); - score_out_ptr[j] = static_cast(std::max(s_bright, s_dark)); - if (nonmax == 1) { - float* score_ptr = score.get(); - score_ptr[idx(y, x, in_dims[0])] = std::max(s_bright, s_dark); - } - } - } - } - } -} - -void non_maximal( - const Array &score, - const Array &x_in, - const Array &y_in, - Array &x_out, - Array &y_out, - Array &score_out, - unsigned* count, - const unsigned total_feat, - const unsigned edge) -{ - const float *score_ptr = score.get(); - const float *x_in_ptr = x_in.get(); - const float *y_in_ptr = y_in.get(); - - dim4 score_dims = score.dims(); - - for (unsigned k = 0; k < total_feat; k++) { - unsigned x = static_cast(round(x_in_ptr[k])); - unsigned y = static_cast(round(y_in_ptr[k])); - - float v = score_ptr[y + score_dims[0] * x]; - float max_v; - max_v = std::max(score_ptr[y-1 + score_dims[0] * (x-1)], score_ptr[y-1 + score_dims[0] * x]); - max_v = std::max(max_v, score_ptr[y-1 + score_dims[0] * (x+1)]); - max_v = std::max(max_v, score_ptr[y + score_dims[0] * (x-1)]); - max_v = std::max(max_v, score_ptr[y + score_dims[0] * (x+1)]); - max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x-1)]); - max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x) ]); - max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x+1)]); - - if (y >= score_dims[1] - edge - 1 || y <= edge + 1 || - x >= score_dims[0] - edge - 1 || x <= edge + 1) - continue; - - // Stores keypoint to 
feat_out if it's response is maximum compared to - // its 8-neighborhood - if (v > max_v) { - unsigned j = *count; - ++*count; - - float *x_out_ptr = x_out.get(); - float *y_out_ptr = y_out.get(); - float *score_out_ptr = score_out.get(); - - x_out_ptr[j] = static_cast(x); - y_out_ptr[j] = static_cast(y); - score_out_ptr[j] = static_cast(v); - } - } -} - template unsigned fast(Array &x_out, Array &y_out, Array &score_out, const Array &in, const float thr, const unsigned arc_length, const bool nonmax, const float feature_ratio, const unsigned edge) { + in.eval(); + dim4 in_dims = in.dims(); const unsigned max_feat = ceil(in.elements() * feature_ratio); @@ -257,7 +40,9 @@ unsigned fast(Array &x_out, Array &y_out, Array &score_out, if (nonmax == 1) { dim4 V_dims(in_dims[0], in_dims[1]); V = createValueArray(V_dims, (float)0); + V.eval(); } + getQueue().sync(); // Arrays containing all features detected before non-maximal suppression. dim4 max_feat_dims(max_feat); @@ -268,7 +53,7 @@ unsigned fast(Array &x_out, Array &y_out, Array &score_out, // Feature counter unsigned count = 0; - locate_features(in, V, x, y, score, &count, thr, arc_length, + kernel::locate_features(in, V, x, y, score, &count, thr, arc_length, nonmax, max_feat, edge); // If more features than max_feat were detected, feat wasn't populated @@ -282,13 +67,12 @@ unsigned fast(Array &x_out, Array &y_out, Array &score_out, Array score_total = createEmptyArray(af::dim4()); if (nonmax == 1) { - x_total = createEmptyArray(feat_found_dims); y_total = createEmptyArray(feat_found_dims); score_total = createEmptyArray(feat_found_dims); count = 0; - non_maximal(V, x, y, + kernel::non_maximal(V, x, y, x_total, y_total, score_total, &count, feat_found, edge); diff --git a/src/backend/cpu/fft.cpp b/src/backend/cpu/fft.cpp index e41c8a1658..3c1d10a4f3 100644 --- a/src/backend/cpu/fft.cpp +++ b/src/backend/cpu/fft.cpp @@ -12,151 +12,34 @@ #include #include #include -#include -#include +#include #include #include 
+#include +#include using af::dim4; namespace cpu { -template -void computeDims(int rdims[rank], const dim4 &idims) -{ - for (int i = 0; i < rank; i++) { - rdims[i] = idims[(rank -1) - i]; - } -} - -template -struct fftw_transform; - -#define TRANSFORM(PRE, TY) \ - template<> \ - struct fftw_transform \ - { \ - typedef PRE##_plan plan_t; \ - typedef PRE##_complex ctype_t; \ - \ - template \ - plan_t create(Args... args) \ - { return PRE##_plan_many_dft(args...); } \ - void execute(plan_t plan) { return PRE##_execute(plan); } \ - void destroy(plan_t plan) { return PRE##_destroy_plan(plan); } \ - }; \ - - -TRANSFORM(fftwf, cfloat) -TRANSFORM(fftw, cdouble) - template void fft_inplace(Array &in) { - int t_dims[rank]; - int in_embed[rank]; - - const dim4 idims = in.dims(); - - computeDims(t_dims , idims); - computeDims(in_embed , in.getDataDims()); - - const dim4 istrides = in.strides(); - - typedef typename fftw_transform::ctype_t ctype_t; - typename fftw_transform::plan_t plan; - - fftw_transform transform; - - int batch = 1; - for (int i = rank; i < 4; i++) { - batch *= idims[i]; - } - - plan = transform.create(rank, - t_dims, - (int)batch, - (ctype_t *)in.get(), - in_embed, (int)istrides[0], - (int)istrides[rank], - (ctype_t *)in.get(), - in_embed, (int)istrides[0], - (int)istrides[rank], - direction ? FFTW_FORWARD : FFTW_BACKWARD, - FFTW_ESTIMATE); - - transform.execute(plan); - transform.destroy(plan); + in.eval(); + getQueue().enqueue(kernel::fft_inplace, in); } -template -struct fftw_real_transform; - -#define TRANSFORM_REAL(PRE, To, Ti, POST) \ - template<> \ - struct fftw_real_transform \ - { \ - typedef PRE##_plan plan_t; \ - typedef PRE##_complex ctype_t; \ - \ - template \ - plan_t create(Args... 
args) \ - { return PRE##_plan_many_dft_##POST(args...); } \ - void execute(plan_t plan) { return PRE##_execute(plan); } \ - void destroy(plan_t plan) { return PRE##_destroy_plan(plan); } \ - }; \ - - -TRANSFORM_REAL(fftwf, cfloat , float , r2c) -TRANSFORM_REAL(fftw , cdouble, double, r2c) -TRANSFORM_REAL(fftwf, float , cfloat , c2r) -TRANSFORM_REAL(fftw , double, cdouble, c2r) - template Array fft_r2c(const Array &in) { - dim4 idims = in.dims(); - dim4 odims = in.dims(); + in.eval(); + dim4 odims = in.dims(); odims[0] = odims[0] / 2 + 1; - Array out = createEmptyArray(odims); - int t_dims[rank]; - int in_embed[rank]; - int out_embed[rank]; - - computeDims(t_dims , idims); - computeDims(in_embed , in.getDataDims()); - computeDims(out_embed , out.getDataDims()); - - const dim4 istrides = in.strides(); - const dim4 ostrides = out.strides(); - - typedef typename fftw_real_transform::ctype_t ctype_t; - typename fftw_real_transform::plan_t plan; - - fftw_real_transform transform; - - int batch = 1; - for (int i = rank; i < 4; i++) { - batch *= idims[i]; - } - - plan = transform.create(rank, - t_dims, - (int)batch, - (Tr *)in.get(), - in_embed, (int)istrides[0], - (int)istrides[rank], - (ctype_t *)out.get(), - out_embed, (int)ostrides[0], - (int)ostrides[rank], - FFTW_ESTIMATE); - - transform.execute(plan); - transform.destroy(plan); + getQueue().enqueue(kernel::fft_r2c, out, in); return out; } @@ -164,42 +47,11 @@ Array fft_r2c(const Array &in) template Array fft_c2r(const Array &in, const dim4 &odims) { - Array out = createEmptyArray(odims); + in.eval(); - int t_dims[rank]; - int in_embed[rank]; - int out_embed[rank]; - - computeDims(t_dims , odims); - computeDims(in_embed , in.getDataDims()); - computeDims(out_embed , out.getDataDims()); - - const dim4 istrides = in.strides(); - const dim4 ostrides = out.strides(); - - typedef typename fftw_real_transform::ctype_t ctype_t; - typename fftw_real_transform::plan_t plan; - - fftw_real_transform transform; - - int batch = 
1; - for (int i = rank; i < 4; i++) { - batch *= odims[i]; - } - - plan = transform.create(rank, - t_dims, - (int)batch, - (ctype_t *)in.get(), - in_embed, (int)istrides[0], - (int)istrides[rank], - (Tr *)out.get(), - out_embed, (int)ostrides[0], - (int)ostrides[rank], - FFTW_ESTIMATE); + Array out = createEmptyArray(odims); + getQueue().enqueue(kernel::fft_c2r, out, in, odims); - transform.execute(plan); - transform.destroy(plan); return out; } @@ -211,8 +63,8 @@ Array fft_c2r(const Array &in, const dim4 &odims) template void fft_inplace(Array &in); \ template void fft_inplace(Array &in); - INSTANTIATE(cfloat ) - INSTANTIATE(cdouble) +INSTANTIATE(cfloat ) +INSTANTIATE(cdouble) #define INSTANTIATE_REAL(Tr, Tc) \ template Array fft_r2c(const Array &in); \ @@ -222,6 +74,7 @@ Array fft_c2r(const Array &in, const dim4 &odims) template Array fft_c2r(const Array &in, const dim4 &odims); \ template Array fft_c2r(const Array &in, const dim4 &odims); \ - INSTANTIATE_REAL(float , cfloat ) - INSTANTIATE_REAL(double, cdouble) +INSTANTIATE_REAL(float , cfloat ) +INSTANTIATE_REAL(double, cdouble) + } diff --git a/src/backend/cpu/fftconvolve.cpp b/src/backend/cpu/fftconvolve.cpp index f76f3a0d3f..3b4b864452 100644 --- a/src/backend/cpu/fftconvolve.cpp +++ b/src/backend/cpu/fftconvolve.cpp @@ -17,208 +17,20 @@ #include #include #include +#include +#include +#include namespace cpu { -template -void packData(To* out_ptr, const af::dim4& od, const af::dim4& os, - Array const& in) -{ - const af::dim4 id = in.dims(); - const af::dim4 is = in.strides(); - const Ti* in_ptr = in.get(); - - int id0_half = divup(id[0], 2); - bool odd_id0 = (id[0] % 2 == 1); - - for (int d3 = 0; d3 < (int)od[3]; d3++) { - for (int d2 = 0; d2 < (int)od[2]; d2++) { - for (int d1 = 0; d1 < (int)od[1]; d1++) { - for (int d0 = 0; d0 < (int)od[0] / 2; d0++) { - const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; - - if (d0 < (int)id0_half && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) { - const 
dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0; - out_ptr[oidx] = (To)in_ptr[iidx]; - if (d0 == id0_half-1 && odd_id0) - out_ptr[oidx+1] = (To)0; - else - out_ptr[oidx+1] = (To)in_ptr[iidx+id0_half]; - } - else { - // Pad remaining elements with 0s - out_ptr[oidx] = (To)0; - out_ptr[oidx+1] = (To)0; - } - } - } - } - } -} - -template -void padArray(To* out_ptr, const af::dim4& od, const af::dim4& os, - Array const& in) -{ - const af::dim4 id = in.dims(); - const af::dim4 is = in.strides(); - const Ti* in_ptr = in.get(); - - for (int d3 = 0; d3 < (int)od[3]; d3++) { - for (int d2 = 0; d2 < (int)od[2]; d2++) { - for (int d1 = 0; d1 < (int)od[1]; d1++) { - for (int d0 = 0; d0 < (int)od[0] / 2; d0++) { - const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; - - if (d0 < (int)id[0] && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) { - // Copy input elements to real elements, set imaginary elements to 0 - const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0; - out_ptr[oidx] = (To)in_ptr[iidx]; - out_ptr[oidx+1] = (To)0; - } - else { - // Pad remaining of the matrix to 0s - out_ptr[oidx] = (To)0; - out_ptr[oidx+1] = (To)0; - } - } - } - } - } -} - -template -void complexMultiply(T* out_ptr, const af::dim4& od, const af::dim4& os, - T* in1_ptr, const af::dim4& i1d, const af::dim4& i1s, - T* in2_ptr, const af::dim4& i2d, const af::dim4& i2s, - ConvolveBatchKind kind) -{ - for (int d3 = 0; d3 < (int)od[3]; d3++) { - for (int d2 = 0; d2 < (int)od[2]; d2++) { - for (int d1 = 0; d1 < (int)od[1]; d1++) { - for (int d0 = 0; d0 < (int)od[0] / 2; d0++) { - if (kind == CONVOLVE_BATCH_NONE || kind == CONVOLVE_BATCH_SAME) { - // Complex multiply each signal to equivalent filter - const int ridx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; - const int iidx = ridx + 1; - - T a = in1_ptr[ridx]; - T b = in1_ptr[iidx]; - T c = in2_ptr[ridx]; - T d = in2_ptr[iidx]; - - T ac = a*c; - T bd = b*d; - - out_ptr[ridx] = ac - bd; - out_ptr[iidx] = (a+b) * (c+d) - ac - bd; - } - else if 
(kind == CONVOLVE_BATCH_SIGNAL) { - // Complex multiply all signals to filter - const int ridx1 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; - const int iidx1 = ridx1 + 1; - const int ridx2 = ridx1 % (i2s[3] * i2d[3]); - const int iidx2 = iidx1 % (i2s[3] * i2d[3]); - - T a = in1_ptr[ridx1]; - T b = in1_ptr[iidx1]; - T c = in2_ptr[ridx2]; - T d = in2_ptr[iidx2]; - - T ac = a*c; - T bd = b*d; - - out_ptr[ridx1] = ac - bd; - out_ptr[iidx1] = (a+b) * (c+d) - ac - bd; - } - else if (kind == CONVOLVE_BATCH_KERNEL) { - // Complex multiply signal to all filters - const int ridx2 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; - const int iidx2 = ridx2 + 1; - const int ridx1 = ridx2 % (i1s[3] * i1d[3]); - const int iidx1 = iidx2 % (i1s[3] * i1d[3]); - - T a = in1_ptr[ridx1]; - T b = in1_ptr[iidx1]; - T c = in2_ptr[ridx2]; - T d = in2_ptr[iidx2]; - - T ac = a*c; - T bd = b*d; - - out_ptr[ridx2] = ac - bd; - out_ptr[iidx2] = (a+b) * (c+d) - ac - bd; - } - } - } - } - } -} - -template -void reorderOutput(To* out_ptr, const af::dim4& od, const af::dim4& os, - const Ti* in_ptr, const af::dim4& id, const af::dim4& is, - const af::dim4& fd, const int half_di0, const int baseDim, - const int fftScale, const bool expand) -{ - for (int d3 = 0; d3 < (int)od[3]; d3++) { - for (int d2 = 0; d2 < (int)od[2]; d2++) { - for (int d1 = 0; d1 < (int)od[1]; d1++) { - for (int d0 = 0; d0 < (int)od[0]; d0++) { - int id0, id1, id2, id3; - if (expand) { - id0 = d0; - id1 = d1 * is[1]; - id2 = d2 * is[2]; - id3 = d3 * is[3]; - } - else { - id0 = d0 + fd[0]/2; - id1 = (d1 + (baseDim > 1)*(fd[1]/2)) * is[1]; - id2 = (d2 + (baseDim > 2)*(fd[2]/2)) * is[2]; - id3 = d3 * is[3]; - } - - int oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0; - - // Divide output elements to cuFFT resulting scale, round result if output - // type is single or double precision floating-point - if (id0 < half_di0) { - // Copy top elements - int iidx = id3 + id2 + id1 + id0 * 2; - if (roundOut) - out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] 
/ fftScale)); - else - out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale); - } - else if (id0 < half_di0 + (int)fd[0] - 1) { - // Add signal and filter elements to central part - int iidx1 = id3 + id2 + id1 + id0 * 2; - int iidx2 = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1; - if (roundOut) - out_ptr[oidx] = (To)roundf((float)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale)); - else - out_ptr[oidx] = (To)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale); - } - else { - // Copy bottom elements - const int iidx = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1; - if (roundOut) - out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale)); - else - out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale); - } - } - } - } - } -} - template Array fftconvolve(Array const& signal, Array const& filter, const bool expand, ConvolveBatchKind kind) { + signal.eval(); + filter.eval(); + const af::dim4 sd = signal.dims(); const af::dim4 fd = filter.dims(); @@ -249,9 +61,6 @@ Array fftconvolve(Array const& signal, Array const& filter, packed_dims[baseDim] = (sbatch + fbatch); Array packed = createEmptyArray(packed_dims); - convT *packed_ptr = packed.get(); - - const af::dim4 packed_strides = packed.strides(); sig_tmp_dims[0] = filter_tmp_dims[0] = packed_dims[0]; sig_tmp_strides[0] = filter_tmp_strides[0] = 1; @@ -270,107 +79,117 @@ Array fftconvolve(Array const& signal, Array const& filter, filter_tmp_strides[k] = filter_tmp_strides[k - 1] * filter_tmp_dims[k - 1]; } - // Calculate memory offsets for packed signal and filter - convT *sig_tmp_ptr = packed_ptr; - convT *filter_tmp_ptr = packed_ptr + sig_tmp_strides[3] * sig_tmp_dims[3]; - // Number of packed complex elements in dimension 0 dim_t sig_half_d0 = divup(sd[0], 2); // Pack signal in a complex matrix where first dimension is half the input // (allows faster FFT computation) and pad array to a power of 2 with 0s - packData(sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, signal); + getQueue().enqueue(kernel::packData, packed, sig_tmp_dims, 
sig_tmp_strides, signal); // Pad filter array with 0s - padArray(filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, filter); - - // Compute forward FFT - if (isDouble) { - fftw_plan plan = fftw_plan_many_dft(baseDim, - fft_dims, - packed_dims[baseDim], - (fftw_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - (fftw_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - FFTW_FORWARD, - FFTW_ESTIMATE); - - fftw_execute(plan); - fftw_destroy_plan(plan); - } - else { - fftwf_plan plan = fftwf_plan_many_dft(baseDim, - fft_dims, - packed_dims[baseDim], - (fftwf_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - (fftwf_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - FFTW_FORWARD, - FFTW_ESTIMATE); - - fftwf_execute(plan); - fftwf_destroy_plan(plan); - } + const dim_t offset = sig_tmp_strides[3]*sig_tmp_dims[3]; + getQueue().enqueue(kernel::padArray, packed, filter_tmp_dims, filter_tmp_strides, + filter, offset); + + dim4 fftDims(1, 1, 1, 1); + for (int i=0; i packed, const dim4 fftDims) { + int fft_dims[baseDim]; + for (int i=0; i(filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, - sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, - filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, - kind); - else - complexMultiply(sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, - sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, - filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, - kind); - - // Compute inverse FFT - if (isDouble) { - fftw_plan plan = fftw_plan_many_dft(baseDim, - fft_dims, - packed_dims[baseDim], - (fftw_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - (fftw_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - FFTW_BACKWARD, - FFTW_ESTIMATE); - - fftw_execute(plan); - fftw_destroy_plan(plan); - } - else { - fftwf_plan plan = fftwf_plan_many_dft(baseDim, - fft_dims, - 
packed_dims[baseDim], - (fftwf_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - (fftwf_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - FFTW_BACKWARD, - FFTW_ESTIMATE); - - fftwf_execute(plan); - fftwf_destroy_plan(plan); - } + getQueue().enqueue(kernel::complexMultiply, packed, + sig_tmp_dims, sig_tmp_strides, + filter_tmp_dims, filter_tmp_strides, + kind, offset); + + auto upstream_idft = [=] (Array packed, const dim4 fftDims) { + int fft_dims[baseDim]; + for (int i=0; i fftconvolve(Array const& signal, Array const& filter, } Array out = createEmptyArray(oDims); - T* out_ptr = out.get(); - const af::dim4 out_dims = out.dims(); - const af::dim4 out_strides = out.strides(); - const af::dim4 filter_dims = filter.dims(); - - // Reorder the output - if (kind == CONVOLVE_BATCH_KERNEL) { - reorderOutput - (out_ptr, out_dims, out_strides, - filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, - filter_dims, sig_half_d0, baseDim, fftScale, expand); - } - else { - reorderOutput - (out_ptr, out_dims, out_strides, - sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, - filter_dims, sig_half_d0, baseDim, fftScale, expand); - } + getQueue().enqueue(kernel::reorder, out, packed, filter, + sig_half_d0, fftScale, sig_tmp_dims, sig_tmp_strides, filter_tmp_dims, + filter_tmp_strides, expand, kind); return out; } diff --git a/src/backend/cpu/gradient.cpp b/src/backend/cpu/gradient.cpp index 8ab2fe46fc..aa417f49e1 100644 --- a/src/backend/cpu/gradient.cpp +++ b/src/backend/cpu/gradient.cpp @@ -12,83 +12,29 @@ #include #include #include +#include +#include +#include namespace cpu { - template - void gradient(Array &grad0, Array &grad1, const Array &in) - { - const af::dim4 dims = in.dims(); - T *d_grad0 = grad0.get(); - T *d_grad1 = grad1.get(); - const T *d_in = in.get(); - - const af::dim4 inst = in.strides(); - const af::dim4 g0st = grad0.strides(); - const af::dim4 g1st = grad1.strides(); - - T v5 = scalar(0.5); - T 
v1 = scalar(1.0); - - for(dim_t idw = 0; idw < dims[3]; idw++) { - const dim_t inW = idw * inst[3]; - const dim_t g0W = idw * g0st[3]; - const dim_t g1W = idw * g1st[3]; - for(dim_t idz = 0; idz < dims[2]; idz++) { - const dim_t inZW = inW + idz * inst[2]; - const dim_t g0ZW = g0W + idz * g0st[2]; - const dim_t g1ZW = g1W + idz * g1st[2]; - dim_t xl, xr, yl,yr; - T f0, f1; - for(dim_t idy = 0; idy < dims[1]; idy++) { - const dim_t inYZW = inZW + idy * inst[1]; - const dim_t g0YZW = g0ZW + idy * g0st[1]; - const dim_t g1YZW = g1ZW + idy * g1st[1]; - if(idy == 0) { - yl = inYZW + inst[1]; - yr = inYZW; - f1 = v1; - } else if(idy == dims[1] - 1) { - yl = inYZW; - yr = inYZW - inst[1]; - f1 = v1; - } else { - yl = inYZW + inst[1]; - yr = inYZW - inst[1]; - f1 = v5; - } - for(dim_t idx = 0; idx < dims[0]; idx++) { - const dim_t inMem = inYZW + idx; - const dim_t g0Mem = g0YZW + idx; - const dim_t g1Mem = g1YZW + idx; - if(idx == 0) { - xl = inMem + 1; - xr = inMem; - f0 = v1; - } else if(idx == dims[0] - 1) { - xl = inMem; - xr = inMem - 1; - f0 = v1; - } else { - xl = inMem + 1; - xr = inMem - 1; - f0 = v5; - } +template +void gradient(Array &grad0, Array &grad1, const Array &in) +{ + grad0.eval(); + grad1.eval(); + in.eval(); - d_grad0[g0Mem] = f0 * (d_in[xl] - d_in[xr]); - d_grad1[g1Mem] = f1 * (d_in[yl + idx] - d_in[yr + idx]); - } - } - } - } - } + getQueue().enqueue(kernel::gradient, grad0, grad1, in); +} #define INSTANTIATE(T) \ template void gradient(Array &grad0, Array &grad1, const Array &in); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) + } diff --git a/src/backend/cpu/harris.cpp b/src/backend/cpu/harris.cpp index d16c56a8b2..b5ea0ca20e 100644 --- a/src/backend/cpu/harris.cpp +++ b/src/backend/cpu/harris.cpp @@ -12,142 +12,28 @@ #include #include #include -#include #include #include #include #include #include #include +#include 
+#include +#include using af::dim4; namespace cpu { -template -void gaussian1D(T* out, const int dim, double sigma=0.0) -{ - if(!(sigma>0)) sigma = 0.25*dim; - - T sum = (T)0; - for(int i=0;i -void second_order_deriv( - T* ixx_out, - T* ixy_out, - T* iyy_out, - const unsigned in_len, - const T* ix_in, - const T* iy_in) -{ - for (unsigned x = 0; x < in_len; x++) { - ixx_out[x] = ix_in[x] * ix_in[x]; - ixy_out[x] = ix_in[x] * iy_in[x]; - iyy_out[x] = iy_in[x] * iy_in[x]; - } -} - -template -void harris_responses( - T* resp_out, - const unsigned idim0, - const unsigned idim1, - const T* ixx_in, - const T* ixy_in, - const T* iyy_in, - const float k_thr, - const unsigned border_len) -{ - const unsigned r = border_len; - - for (unsigned x = r; x < idim1 - r; x++) { - for (unsigned y = r; y < idim0 - r; y++) { - const unsigned idx = x * idim0 + y; - - // Calculates matrix trace and determinant - T tr = ixx_in[idx] + iyy_in[idx]; - T det = ixx_in[idx] * iyy_in[idx] - ixy_in[idx] * ixy_in[idx]; - - // Calculates local Harris response - resp_out[idx] = det - k_thr * (tr*tr); - } - } -} - -template -void non_maximal( - float* x_out, - float* y_out, - float* resp_out, - unsigned* count, - const unsigned idim0, - const unsigned idim1, - const T* resp_in, - const float min_resp, - const unsigned border_len, - const unsigned max_corners) -{ - // Responses on the border don't have 8-neighbors to compare, discard them - const unsigned r = border_len + 1; - - for (unsigned x = r; x < idim1 - r; x++) { - for (unsigned y = r; y < idim0 - r; y++) { - const T v = resp_in[x * idim0 + y]; - - // Find maximum neighborhood response - T max_v; - max_v = max(resp_in[(x-1) * idim0 + y-1], resp_in[x * idim0 + y-1]); - max_v = max(max_v, resp_in[(x+1) * idim0 + y-1]); - max_v = max(max_v, resp_in[(x-1) * idim0 + y ]); - max_v = max(max_v, resp_in[(x+1) * idim0 + y ]); - max_v = max(max_v, resp_in[(x-1) * idim0 + y+1]); - max_v = max(max_v, resp_in[(x) * idim0 + y+1]); - max_v = max(max_v, 
resp_in[(x+1) * idim0 + y+1]); - - // Stores corner to {x,y,resp}_out if it's response is maximum compared - // to its 8-neighborhood and greater or equal minimum response - if (v > max_v && v >= (T)min_resp) { - const unsigned idx = *count; - *count += 1; - if (idx < max_corners) { - x_out[idx] = (float)x; - y_out[idx] = (float)y; - resp_out[idx] = (float)v; - } - } - } - } -} - -static void keep_corners(float* x_out, float* y_out, float* resp_out, - const float* x_in, const float* y_in, const float* resp_in, - const unsigned* resp_idx, const unsigned n_corners) -{ - // Keep only the first n_feat features - for (unsigned f = 0; f < n_corners; f++) { - x_out[f] = x_in[resp_idx[f]]; - y_out[f] = y_in[resp_idx[f]]; - resp_out[f] = resp_in[f]; - } -} - template unsigned harris(Array &x_out, Array &y_out, Array &resp_out, const Array &in, const unsigned max_corners, const float min_response, const float sigma, const unsigned filter_len, const float k_thr) { + in.eval(); + dim4 idims = in.dims(); // Window filter @@ -156,8 +42,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out if (sigma < 0.5f) { for (unsigned i = 0; i < filter_len; i++) h_filter[i] = (T)1.f / (filter_len); - } - else { + } else { gaussian1D(h_filter, (int)filter_len, sigma); } Array filter = createDeviceDataArray(dim4(filter_len), (const void*)h_filter); @@ -168,15 +53,14 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out Array iy = createEmptyArray(idims); // Compute first order derivatives - gradient(iy, ix, in); + getQueue().enqueue(gradient, iy, ix, in); Array ixx = createEmptyArray(idims); Array ixy = createEmptyArray(idims); Array iyy = createEmptyArray(idims); // Compute second-order derivatives - second_order_deriv(ixx.get(), ixy.get(), iyy.get(), - in.elements(), ix.get(), iy.get()); + getQueue().enqueue(kernel::second_order_deriv, ixx, ixy, iyy, in.elements(), ix, iy); // Convolve second-order derivatives with proper window filter ixx = convolve2(ixx, filter, 
filter); @@ -185,26 +69,22 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out const unsigned corner_lim = in.elements() * 0.2f; - float* x_corners = memAlloc(corner_lim); - float* y_corners = memAlloc(corner_lim); - float* resp_corners = memAlloc(corner_lim); + Array responses = createEmptyArray(dim4(in.elements())); - T* resp = memAlloc(in.elements()); + getQueue().enqueue(kernel::harris_responses, responses, idims[0], idims[1], + ixx, ixy, iyy, k_thr, border_len); - // Calculate Harris responses for all pixels - harris_responses(resp, - idims[0], idims[1], - ixx.get(), ixy.get(), iyy.get(), - k_thr, border_len); + Array xCorners = createEmptyArray(dim4(corner_lim)); + Array yCorners = createEmptyArray(dim4(corner_lim)); + Array respCorners = createEmptyArray(dim4(corner_lim)); const unsigned min_r = (max_corners > 0) ? 0.f : min_response; - unsigned corners_found = 0; // Performs non-maximal suppression - non_maximal(x_corners, y_corners, resp_corners, &corners_found, - idims[0], idims[1], resp, min_r, border_len, corner_lim); - - memFree(resp); + getQueue().sync(); + unsigned corners_found = 0; + kernel::non_maximal(xCorners, yCorners, respCorners, &corners_found, + idims[0], idims[1], responses, min_r, border_len, corner_lim); const unsigned corners_out = (max_corners > 0) ? 
min(corners_found, max_corners) : @@ -213,42 +93,42 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out return 0; if (max_corners > 0 && corners_found > corners_out) { - Array harris_responses = createDeviceDataArray(dim4(corners_found), (void*)resp_corners); + respCorners.resetDims(dim4(corners_found)); Array harris_sorted = createEmptyArray(dim4(corners_found)); Array harris_idx = createEmptyArray(dim4(corners_found)); // Sort Harris responses - sort_index(harris_sorted, harris_idx, harris_responses, 0); + sort_index(harris_sorted, harris_idx, respCorners, 0); x_out = createEmptyArray(dim4(corners_out)); y_out = createEmptyArray(dim4(corners_out)); resp_out = createEmptyArray(dim4(corners_out)); // Keep only the corners with higher Harris responses - keep_corners(x_out.get(), y_out.get(), resp_out.get(), - x_corners, y_corners, harris_sorted.get(), harris_idx.get(), - corners_out); - - memFree(x_corners); - memFree(y_corners); - } - else if (max_corners == 0 && corners_found < corner_lim) { + getQueue().enqueue(kernel::keep_corners, x_out, y_out, resp_out, xCorners, yCorners, + harris_sorted, harris_idx, corners_out); + } else if (max_corners == 0 && corners_found < corner_lim) { x_out = createEmptyArray(dim4(corners_out)); y_out = createEmptyArray(dim4(corners_out)); resp_out = createEmptyArray(dim4(corners_out)); - memcpy(x_out.get(), x_corners, corners_out * sizeof(float)); - memcpy(y_out.get(), y_corners, corners_out * sizeof(float)); - memcpy(resp_out.get(), resp_corners, corners_out * sizeof(float)); - - memFree(x_corners); - memFree(y_corners); - memFree(resp_corners); - } - else { - x_out = createDeviceDataArray(dim4(corners_out), (void*)x_corners); - y_out = createDeviceDataArray(dim4(corners_out), (void*)y_corners); - resp_out = createDeviceDataArray(dim4(corners_out), (void*)resp_corners); + auto copyFunc = [=](Array x_out, Array y_out, + Array outResponses, const Array x_crnrs, + const Array y_crnrs, const Array inResponses, + const 
unsigned corners_out) { + memcpy(x_out.get(), x_crnrs.get(), corners_out * sizeof(float)); + memcpy(y_out.get(), y_crnrs.get(), corners_out * sizeof(float)); + memcpy(outResponses.get(), inResponses.get(), corners_out * sizeof(float)); + }; + getQueue().enqueue(copyFunc, x_out, y_out, resp_out, + xCorners, yCorners, respCorners, corners_out); + } else { + x_out = xCorners; + y_out = yCorners; + resp_out = respCorners; + x_out.resetDims(dim4(corners_out)); + y_out.resetDims(dim4(corners_out)); + resp_out.resetDims(dim4(corners_out)); } return corners_out; diff --git a/src/backend/cpu/hist_graphics.cpp b/src/backend/cpu/hist_graphics.cpp index 21d3fdf941..ad7d69067d 100644 --- a/src/backend/cpu/hist_graphics.cpp +++ b/src/backend/cpu/hist_graphics.cpp @@ -11,6 +11,8 @@ #include #include +#include +#include namespace cpu { @@ -18,6 +20,8 @@ namespace cpu template void copy_histogram(const Array &data, const fg::Histogram* hist) { + data.eval(); + getQueue().sync(); CheckGL("Begin copy_histogram"); glBindBuffer(GL_ARRAY_BUFFER, hist->vbo()); diff --git a/src/backend/cpu/histogram.cpp b/src/backend/cpu/histogram.cpp index e382a0ee87..3c30402b47 100644 --- a/src/backend/cpu/histogram.cpp +++ b/src/backend/cpu/histogram.cpp @@ -12,6 +12,9 @@ #include #include #include +#include +#include +#include using af::dim4; @@ -19,33 +22,19 @@ namespace cpu { template -Array histogram(const Array &in, const unsigned &nbins, const double &minval, const double &maxval) +Array histogram(const Array &in, + const unsigned &nbins, + const double &minval, const double &maxval) { - float step = (maxval - minval)/(float)nbins; + in.eval(); const dim4 inDims = in.dims(); - dim4 iStrides = in.strides(); dim4 outDims = dim4(nbins,1,inDims[2],inDims[3]); Array out = createValueArray(outDims, outType(0)); - dim4 oStrides = out.strides(); - dim_t nElems = inDims[0]*inDims[1]; + out.eval(); - outType *outData = out.get(); - const inType* inData= in.get(); - - for(dim_t b3 = 0; b3 < outDims[3]; 
b3++) { - for(dim_t b2 = 0; b2 < outDims[2]; b2++) { - for(dim_t i=0; i, + out, in, nbins, minval, maxval); return out; } diff --git a/src/backend/cpu/homography.cpp b/src/backend/cpu/homography.cpp index d20f0ca00c..4d131cf695 100644 --- a/src/backend/cpu/homography.cpp +++ b/src/backend/cpu/homography.cpp @@ -15,13 +15,11 @@ #include #include #include -#include #include -#include -#include #include - #include +#include +#include using af::dim4; @@ -154,12 +152,9 @@ unsigned updateIterations(float inlier_ratio, unsigned iter) } template -int computeHomography(T* H_ptr, - const float* rnd_ptr, - const float* x_src_ptr, - const float* y_src_ptr, - const float* x_dst_ptr, - const float* y_dst_ptr) +int computeHomography(T* H_ptr, const float* rnd_ptr, + const float* x_src_ptr, const float* y_src_ptr, + const float* x_dst_ptr, const float* y_dst_ptr) { if ((unsigned)rnd_ptr[0] == (unsigned)rnd_ptr[1] || (unsigned)rnd_ptr[0] == (unsigned)rnd_ptr[2] || (unsigned)rnd_ptr[0] == (unsigned)rnd_ptr[3] || (unsigned)rnd_ptr[1] == (unsigned)rnd_ptr[2] || @@ -192,6 +187,8 @@ int computeHomography(T* H_ptr, float dst_scale = sqrt(2.0f) / sqrt(dst_var); Array A = createValueArray(af::dim4(9, 9), (T)0); + A.eval(); + getQueue().sync(); af::dim4 Adims = A.dims(); T* A_ptr = A.get(); @@ -217,6 +214,8 @@ int computeHomography(T* H_ptr, } Array V = createValueArray(af::dim4(Adims[1], Adims[1]), (T)0); + V.eval(); + getQueue().sync(); JacobiSVD(A.get(), V.get(), 9, 9); af::dim4 Vdims = V.dims(); @@ -262,6 +261,8 @@ int findBestHomography(Array &bestH, const float* y_dst_ptr = y_dst.get(); Array H = createValueArray(af::dim4(9, iterations), (T)0); + H.eval(); + getQueue().sync(); const af::dim4 rdims = rnd.dims(); const af::dim4 Hdims = H.dims(); @@ -278,8 +279,7 @@ int findBestHomography(Array &bestH, const unsigned ridx = rdims[0] * i; const float* rnd_ptr = rnd.get() + ridx; - if (computeHomography(H_ptr, rnd_ptr, x_src_ptr, y_src_ptr, - x_dst_ptr, y_dst_ptr)) + if 
(computeHomography(H_ptr, rnd_ptr, x_src_ptr, y_src_ptr, x_dst_ptr, y_dst_ptr)) continue; if (htype == AF_HOMOGRAPHY_RANSAC) { @@ -320,7 +320,6 @@ int findBestHomography(Array &bestH, minMedian = median; bestIdx = i; } - } } @@ -355,6 +354,11 @@ int homography(Array &bestH, const float inlier_thr, const unsigned iterations) { + x_src.eval(); + y_src.eval(); + x_dst.eval(); + y_dst.eval(); + const af::dim4 idims = x_src.dims(); const unsigned nsamples = idims[0]; @@ -366,6 +370,8 @@ int homography(Array &bestH, Array frnd = randu(rdims); Array fctr = createValueArray(rdims, (float)nsamples); Array rnd = arithOp(frnd, fctr, rdims); + rnd.eval(); + getQueue().sync(); return findBestHomography(bestH, x_src, y_src, x_dst, y_dst, rnd, iter, nsamples, inlier_thr, htype); } diff --git a/src/backend/cpu/hsv_rgb.cpp b/src/backend/cpu/hsv_rgb.cpp index 82f404fa95..404491766c 100644 --- a/src/backend/cpu/hsv_rgb.cpp +++ b/src/backend/cpu/hsv_rgb.cpp @@ -11,8 +11,9 @@ #include #include #include -#include -#include +#include +#include +#include using af::dim4; @@ -22,54 +23,11 @@ namespace cpu template Array hsv2rgb(const Array& in) { - const dim4 dims = in.dims(); - const dim4 strides = in.strides(); - Array out = createEmptyArray(dims); - dim_t obStride = out.strides()[3]; - dim_t coff = strides[2]; - dim_t bCount = dims[3]; + in.eval(); - for(dim_t b=0; b out = createEmptyArray(in.dims()); - for(dim_t j=0; j, out, in); return out; } @@ -77,53 +35,11 @@ Array hsv2rgb(const Array& in) template Array rgb2hsv(const Array& in) { - const dim4 dims = in.dims(); - const dim4 strides = in.strides(); - Array out = createEmptyArray(dims); - dim4 oStrides = out.strides(); - dim_t bCount = dims[3]; - - for(dim_t b=0; b out = createEmptyArray(in.dims()); - dst[oIdx0] = H; - dst[oIdx1] = (Cmax==0.0f ? 
0 : delta/Cmax); - dst[oIdx2] = Cmax; - } - } - } + getQueue().enqueue(kernel::rgb2hsv, out, in); return out; } diff --git a/src/backend/cpu/identity.cpp b/src/backend/cpu/identity.cpp index 2973ae4409..c5e11029fc 100644 --- a/src/backend/cpu/identity.cpp +++ b/src/backend/cpu/identity.cpp @@ -7,47 +7,40 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include #include -#include #include #include -#include +#include +#include +#include namespace cpu { - template - Array identity(const dim4& dims) - { - Array out = createEmptyArray(dims); - T *ptr = out.get(); - const dim_t *out_dims = out.dims().get(); - - for (dim_t k = 0; k < out_dims[2] * out_dims[3]; k++) { - for (dim_t j = 0; j < out_dims[1]; j++) { - for (dim_t i = 0; i < out_dims[0]; i++) { - ptr[j * out_dims[0] + i] = (i == j) ? scalar(1) : scalar(0); - } - } - ptr += out_dims[0] * out_dims[1]; - } - return out; - } + +template +Array identity(const dim4& dims) +{ + Array out = createEmptyArray(dims); + + getQueue().enqueue(kernel::identity, out); + + return out; +} #define INSTANTIATE_IDENTITY(T) \ template Array identity (const af::dim4 &dims); - INSTANTIATE_IDENTITY(float) - INSTANTIATE_IDENTITY(double) - INSTANTIATE_IDENTITY(cfloat) - INSTANTIATE_IDENTITY(cdouble) - INSTANTIATE_IDENTITY(int) - INSTANTIATE_IDENTITY(uint) - INSTANTIATE_IDENTITY(intl) - INSTANTIATE_IDENTITY(uintl) - INSTANTIATE_IDENTITY(char) - INSTANTIATE_IDENTITY(uchar) - INSTANTIATE_IDENTITY(short) - INSTANTIATE_IDENTITY(ushort) +INSTANTIATE_IDENTITY(float) +INSTANTIATE_IDENTITY(double) +INSTANTIATE_IDENTITY(cfloat) +INSTANTIATE_IDENTITY(cdouble) +INSTANTIATE_IDENTITY(int) +INSTANTIATE_IDENTITY(uint) +INSTANTIATE_IDENTITY(intl) +INSTANTIATE_IDENTITY(uintl) +INSTANTIATE_IDENTITY(char) +INSTANTIATE_IDENTITY(uchar) +INSTANTIATE_IDENTITY(short) +INSTANTIATE_IDENTITY(ushort) } diff --git a/src/backend/cpu/iir.cpp b/src/backend/cpu/iir.cpp index 615da2238d..049212ad69 100644 --- 
a/src/backend/cpu/iir.cpp +++ b/src/backend/cpu/iir.cpp @@ -12,81 +12,49 @@ #include #include #include -#include -#include -#include #include +#include +#include +#include using af::dim4; namespace cpu { - template - Array iir(const Array &b, const Array &a, const Array &x) - { - T h_a0 = a.get()[0]; - Array a0 = createValueArray(b.dims(), h_a0); - ConvolveBatchKind type = x.ndims() == 1 ? CONVOLVE_BATCH_NONE : CONVOLVE_BATCH_SAME; - if (x.ndims() != b.ndims()) { - type = (x.ndims() < b.ndims()) ? CONVOLVE_BATCH_KERNEL : CONVOLVE_BATCH_SIGNAL; - } - - // Extract the first N elements - Array c = convolve(x, b, type); - dim4 cdims = c.dims(); - cdims[0] = x.dims()[0]; - c.resetDims(cdims); - - int num_a = a.dims()[0]; - - dim4 ydims = c.dims(); - Array y = createEmptyArray(ydims); - - for (int l = 0; l < (int)ydims[3]; l++) { - dim_t yidx3 = l * y.strides()[3]; - dim_t cidx3 = l * c.strides()[3]; - dim_t aidx3 = l * a.strides()[3]; - - for (int k = 0; k < (int)ydims[2]; k++) { - - dim_t yidx2 = k * y.strides()[2] + yidx3; - dim_t cidx2 = k * c.strides()[2] + cidx3; - dim_t aidx2 = k * a.strides()[2] + aidx3; - - for (int j = 0; j < (int)ydims[1]; j++) { - - dim_t yidx1 = j * y.strides()[1] + yidx2; - dim_t cidx1 = j * c.strides()[1] + cidx2; - dim_t aidx1 = j * a.strides()[1] + aidx2; +template +Array iir(const Array &b, const Array &a, const Array &x) +{ + b.eval(); + a.eval(); + x.eval(); - std::vector h_z(num_a); + ConvolveBatchKind type = x.ndims() == 1 ? CONVOLVE_BATCH_NONE : CONVOLVE_BATCH_SAME; + if (x.ndims() != b.ndims()) { + type = (x.ndims() < b.ndims()) ? CONVOLVE_BATCH_KERNEL : CONVOLVE_BATCH_SIGNAL; + } - const T *h_a = a.get() + (a.ndims() > 1 ? 
aidx1 : 0); - T *h_c = c.get() + cidx1; - T *h_y = y.get() + yidx1; + // Extract the first N elements + Array c = convolve(x, b, type); + dim4 cdims = c.dims(); + cdims[0] = x.dims()[0]; + c.resetDims(cdims); - for (int i = 0; i < (int)ydims[0]; i++) { + Array y = createEmptyArray(c.dims()); - T y = h_y[i] = (h_c[i] + h_z[0]) / h_a[0]; - for (int ii = 1; ii < num_a; ii++) { - h_z[ii - 1] = h_z[ii] - h_a[ii] * y; - } - } - } - } - } + getQueue().enqueue(kernel::iir, y, c, a); - return y; - } + return y; +} #define INSTANTIATE(T) \ template Array iir(const Array &b, \ const Array &a, \ const Array &x); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) + } diff --git a/src/backend/cpu/image.cpp b/src/backend/cpu/image.cpp index 947afa2351..b71ba23c12 100644 --- a/src/backend/cpu/image.cpp +++ b/src/backend/cpu/image.cpp @@ -15,39 +15,43 @@ #include #include #include -#include -#include #include +#include +#include using af::dim4; namespace cpu { - template - void copy_image(const Array &in, const fg::Image* image) - { - CheckGL("Before CopyArrayToPBO"); - const T *d_X = in.get(); - size_t data_size = image->size(); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, image->pbo()); - glBufferSubData(GL_PIXEL_UNPACK_BUFFER, 0, data_size, d_X); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - CheckGL("In CopyArrayToPBO"); - } - - #define INSTANTIATE(T) \ - template void copy_image(const Array &in, const fg::Image* image); - - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(ushort) - INSTANTIATE(short) + +template +void copy_image(const Array &in, const fg::Image* image) +{ + in.eval(); + getQueue().sync(); + CheckGL("Before CopyArrayToPBO"); + const T *d_X = in.get(); + size_t data_size = image->size(); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, image->pbo()); + 
glBufferSubData(GL_PIXEL_UNPACK_BUFFER, 0, data_size, d_X); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + CheckGL("In CopyArrayToPBO"); +} + +#define INSTANTIATE(T) \ + template void copy_image(const Array &in, const fg::Image* image); + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(ushort) +INSTANTIATE(short) + } #endif // WITH_GRAPHICS diff --git a/src/backend/cpu/index.cpp b/src/backend/cpu/index.cpp index e6d3daba4e..a2cdac888f 100644 --- a/src/backend/cpu/index.cpp +++ b/src/backend/cpu/index.cpp @@ -13,98 +13,52 @@ #include #include #include -#include #include +#include +#include +#include +#include +using std::vector; using af::dim4; namespace cpu { -static inline -dim_t trimIndex(dim_t idx, const dim_t &len) -{ - dim_t ret_val = idx; - dim_t offset = abs(ret_val)%len; - if (ret_val<0) { - ret_val = offset-1; - } else if (ret_val>=len) { - ret_val = len-offset-1; - } - return ret_val; -} - template Array index(const Array& in, const af_index_t idxrs[]) { - bool isSeq[4]; - std::vector seqs(4, af_span); + in.eval(); + + vector isSeq(4); + vector seqs(4, af_span); // create seq vector to retrieve output // dimensions, offsets & offsets - for (dim_t x=0; x<4; ++x) { + for (unsigned x=0; x > idxArrs(4, createEmptyArray(dim4())); + vector< Array > idxArrs(4, createEmptyArray(dim4())); // look through indexs to read af_array indexs - for (dim_t x=0; x<4; ++x) { + for (unsigned x=0; x(idxrs[x].idx.arr); + idxArrs[x].eval(); // set output array ith dimension value oDims[x] = idxArrs[x].elements(); } } Array out = createEmptyArray(oDims); - dim4 oStrides= out.strides(); - - const T *src = in.get(); - T *dst = out.get(); - - const uint* ptr0 = idxArrs[0].get(); - const uint* ptr1 = idxArrs[1].get(); - const uint* ptr2 = idxArrs[2].get(); - const uint* ptr3 = idxArrs[3].get(); - - for (dim_t l=0; l, out, in, std::move(isSeq), std::move(seqs), std::move(idxArrs)); return out; } diff 
--git a/src/backend/cpu/inverse.cpp b/src/backend/cpu/inverse.cpp index 129823b963..ea7d7ee828 100644 --- a/src/backend/cpu/inverse.cpp +++ b/src/backend/cpu/inverse.cpp @@ -23,6 +23,8 @@ #include #include #include +#include +#include namespace cpu { @@ -48,6 +50,7 @@ INV_FUNC(getri , cdouble, z) template Array inverse(const Array &in) { + in.eval(); int M = in.dims()[0]; int N = in.dims()[1]; @@ -58,12 +61,14 @@ Array inverse(const Array &in) } Array A = copyArray(in); - Array pivot = lu_inplace(A, false); - getri_func()(AF_LAPACK_COL_MAJOR, M, - A.get(), A.strides()[1], - pivot.get()); + auto func = [=] (Array A, Array pivot, int M) { + getri_func()(AF_LAPACK_COL_MAJOR, M, + A.get(), A.strides()[1], + pivot.get()); + }; + getQueue().enqueue(func, A, pivot, M); return A; } diff --git a/src/backend/cpu/iota.cpp b/src/backend/cpu/iota.cpp index 47bcb924e4..db19708b46 100644 --- a/src/backend/cpu/iota.cpp +++ b/src/backend/cpu/iota.cpp @@ -10,63 +10,38 @@ #include #include #include -#include -#include -#include -#include +#include +#include +#include using namespace std; namespace cpu { - /////////////////////////////////////////////////////////////////////////// - // Kernel Functions - /////////////////////////////////////////////////////////////////////////// - template - void iota(T *out, const dim4 &dims, const dim4 &strides, const dim4 &sdims, const dim4 &tdims) - { - for(dim_t w = 0; w < dims[3]; w++) { - dim_t offW = w * strides[3]; - T valW = (w % sdims[3]) * sdims[0] * sdims[1] * sdims[2]; - for(dim_t z = 0; z < dims[2]; z++) { - dim_t offWZ = offW + z * strides[2]; - T valZ = valW + (z % sdims[2]) * sdims[0] * sdims[1]; - for(dim_t y = 0; y < dims[1]; y++) { - dim_t offWZY = offWZ + y * strides[1]; - T valY = valZ + (y % sdims[1]) * sdims[0]; - for(dim_t x = 0; x < dims[0]; x++) { - dim_t id = offWZY + x; - out[id] = valY + (x % sdims[0]); - } - } - } - } - } - /////////////////////////////////////////////////////////////////////////// - // Wrapper 
Functions - /////////////////////////////////////////////////////////////////////////// - template - Array iota(const dim4 &dims, const dim4 &tile_dims) - { - dim4 outdims = dims * tile_dims; +template +Array iota(const dim4 &dims, const dim4 &tile_dims) +{ + dim4 outdims = dims * tile_dims; + + Array out = createEmptyArray(outdims); - Array out = createEmptyArray(outdims); - iota(out.get(), out.dims(), out.strides(), dims, tile_dims); + getQueue().enqueue(kernel::iota, out, dims, tile_dims); - return out; - } + return out; +} #define INSTANTIATE(T) \ template Array iota(const af::dim4 &dims, const af::dim4 &tile_dims); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) + } diff --git a/src/backend/cpu/ireduce.cpp b/src/backend/cpu/ireduce.cpp index 2928af9620..a40fbdf958 100644 --- a/src/backend/cpu/ireduce.cpp +++ b/src/backend/cpu/ireduce.cpp @@ -13,192 +13,100 @@ #include #include #include +#include +#include +#include using af::dim4; namespace cpu { - template double cabs(const T in) { return (double)in; } - static double cabs(const char in) { return (double)(in > 0); } - static double cabs(const cfloat &in) { return (double)abs(in); } - static double cabs(const cdouble &in) { return (double)abs(in); } - - template - struct MinMaxOp - { - T m_val; - uint m_idx; - MinMaxOp(T val, uint idx) : - m_val(val), m_idx(idx) - { - } - void operator()(T val, uint idx) - { - if (cabs(val) < cabs(m_val) || - (cabs(val) == cabs(m_val) && - idx > m_idx)) { - m_val = val; - m_idx = idx; - } - } - }; - - template - struct MinMaxOp - { - T m_val; - uint m_idx; - MinMaxOp(T val, uint idx) : - m_val(val), m_idx(idx) - { - } +template +using ireduce_dim_func = 
std::function, Array, const dim_t, + const Array, const dim_t, const int)>; - void operator()(T val, uint idx) - { - if (cabs(val) > cabs(m_val) || - (cabs(val) == cabs(m_val) && - idx <= m_idx)) { - m_val = val; - m_idx = idx; - } - } - }; - - template - struct ireduce_dim - { - void operator()(T *out, const dim4 ostrides, const dim4 odims, - uint *loc, - const T *in , const dim4 istrides, const dim4 idims, - const int dim) - { - const int D1 = D - 1; - for (dim_t i = 0; i < odims[D1]; i++) { - ireduce_dim()(out + i * ostrides[D1], - ostrides, odims, - loc + i * ostrides[D1], - in + i * istrides[D1], - istrides, idims, - dim); - } - } - }; - - template - struct ireduce_dim - { - void operator()(T *out, const dim4 ostrides, const dim4 odims, - uint *loc, - const T *in , const dim4 istrides, const dim4 idims, - const int dim) - { - - dim_t stride = istrides[dim]; - MinMaxOp Op(in[0], 0); - for (dim_t i = 0; i < idims[dim]; i++) { - Op(in[i * stride], i); - } +template +void ireduce(Array &out, Array &loc, const Array &in, const int dim) +{ + out.eval(); + loc.eval(); + in.eval(); + + dim4 odims = in.dims(); + odims[dim] = 1; + static const ireduce_dim_func ireduce_funcs[] = { kernel::ireduce_dim() + , kernel::ireduce_dim() + , kernel::ireduce_dim() + , kernel::ireduce_dim()}; + + getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim); +} - *out = Op.m_val; - *loc = Op.m_idx; - } - }; - - template - void ireduce(Array &out, Array &loc, - const Array &in, const int dim) - { - dim4 odims = in.dims(); - odims[dim] = 1; - - switch (in.ndims()) { - case 1: - ireduce_dim()(out.get(), out.strides(), out.dims(), - loc.get(), - in.get(), in.strides(), in.dims(), dim); - break; - - case 2: - ireduce_dim()(out.get(), out.strides(), out.dims(), - loc.get(), - in.get(), in.strides(), in.dims(), dim); - break; - - case 3: - ireduce_dim()(out.get(), out.strides(), out.dims(), - loc.get(), - in.get(), in.strides(), in.dims(), dim); - break; - - case 4: - 
ireduce_dim()(out.get(), out.strides(), out.dims(), - loc.get(), - in.get(), in.strides(), in.dims(), dim); - break; - } - } +template +T ireduce_all(unsigned *loc, const Array &in) +{ + in.eval(); + getQueue().sync(); - template - T ireduce_all(unsigned *loc, const Array &in) - { - af::dim4 dims = in.dims(); - af::dim4 strides = in.strides(); - const T *inPtr = in.get(); + af::dim4 dims = in.dims(); + af::dim4 strides = in.strides(); + const T *inPtr = in.get(); - MinMaxOp Op(inPtr[0], 0); + kernel::MinMaxOp Op(inPtr[0], 0); - for(dim_t l = 0; l < dims[3]; l++) { - dim_t off3 = l * strides[3]; + for(dim_t l = 0; l < dims[3]; l++) { + dim_t off3 = l * strides[3]; - for(dim_t k = 0; k < dims[2]; k++) { - dim_t off2 = k * strides[2]; + for(dim_t k = 0; k < dims[2]; k++) { + dim_t off2 = k * strides[2]; - for(dim_t j = 0; j < dims[1]; j++) { - dim_t off1 = j * strides[1]; + for(dim_t j = 0; j < dims[1]; j++) { + dim_t off1 = j * strides[1]; - for(dim_t i = 0; i < dims[0]; i++) { - dim_t idx = i + off1 + off2 + off3; - Op(inPtr[idx], idx); - } + for(dim_t i = 0; i < dims[0]; i++) { + dim_t idx = i + off1 + off2 + off3; + Op(inPtr[idx], idx); } } } - - *loc = Op.m_idx; - return Op.m_val; } + *loc = Op.m_idx; + return Op.m_val; +} + #define INSTANTIATE(ROp, T) \ template void ireduce(Array &out, Array &loc, \ const Array &in, const int dim); \ template T ireduce_all(unsigned *loc, const Array &in); \ - //min - INSTANTIATE(af_min_t, float ) - INSTANTIATE(af_min_t, double ) - INSTANTIATE(af_min_t, cfloat ) - INSTANTIATE(af_min_t, cdouble) - INSTANTIATE(af_min_t, int ) - INSTANTIATE(af_min_t, uint ) - INSTANTIATE(af_min_t, intl ) - INSTANTIATE(af_min_t, uintl ) - INSTANTIATE(af_min_t, char ) - INSTANTIATE(af_min_t, uchar ) - INSTANTIATE(af_min_t, short ) - INSTANTIATE(af_min_t, ushort ) - - //max - INSTANTIATE(af_max_t, float ) - INSTANTIATE(af_max_t, double ) - INSTANTIATE(af_max_t, cfloat ) - INSTANTIATE(af_max_t, cdouble) - INSTANTIATE(af_max_t, int ) - 
INSTANTIATE(af_max_t, uint ) - INSTANTIATE(af_max_t, intl ) - INSTANTIATE(af_max_t, uintl ) - INSTANTIATE(af_max_t, char ) - INSTANTIATE(af_max_t, uchar ) - INSTANTIATE(af_max_t, short ) - INSTANTIATE(af_max_t, ushort ) +//min +INSTANTIATE(af_min_t, float ) +INSTANTIATE(af_min_t, double ) +INSTANTIATE(af_min_t, cfloat ) +INSTANTIATE(af_min_t, cdouble) +INSTANTIATE(af_min_t, int ) +INSTANTIATE(af_min_t, uint ) +INSTANTIATE(af_min_t, intl ) +INSTANTIATE(af_min_t, uintl ) +INSTANTIATE(af_min_t, char ) +INSTANTIATE(af_min_t, uchar ) +INSTANTIATE(af_min_t, short ) +INSTANTIATE(af_min_t, ushort ) + +//max +INSTANTIATE(af_max_t, float ) +INSTANTIATE(af_max_t, double ) +INSTANTIATE(af_max_t, cfloat ) +INSTANTIATE(af_max_t, cdouble) +INSTANTIATE(af_max_t, int ) +INSTANTIATE(af_max_t, uint ) +INSTANTIATE(af_max_t, intl ) +INSTANTIATE(af_max_t, uintl ) +INSTANTIATE(af_max_t, char ) +INSTANTIATE(af_max_t, uchar ) +INSTANTIATE(af_max_t, short ) +INSTANTIATE(af_max_t, ushort ) + } diff --git a/src/backend/cpu/join.cpp b/src/backend/cpu/join.cpp index 78d2a51ab4..0a5b99cd13 100644 --- a/src/backend/cpu/join.cpp +++ b/src/backend/cpu/join.cpp @@ -9,243 +9,136 @@ #include #include -#include -#include +#include +#include +#include namespace cpu { - template - void join_append(To *out, const Tx *X, const af::dim4 &offset, - const af::dim4 &odims, const af::dim4 &xdims, - const af::dim4 &ost, const af::dim4 &xst) - { - for(dim_t ow = 0; ow < xdims[3]; ow++) { - const dim_t xW = ow * xst[3]; - const dim_t oW = (ow + offset[3]) * ost[3]; - - for(dim_t oz = 0; oz < xdims[2]; oz++) { - const dim_t xZW = xW + oz * xst[2]; - const dim_t oZW = oW + (oz + offset[2]) * ost[2]; - - for(dim_t oy = 0; oy < xdims[1]; oy++) { - const dim_t xYZW = xZW + oy * xst[1]; - const dim_t oYZW = oZW + (oy + offset[1]) * ost[1]; - - for(dim_t ox = 0; ox < xdims[0]; ox++) { - const dim_t iMem = xYZW + ox; - const dim_t oMem = oYZW + (ox + offset[0]); - out[oMem] = X[iMem]; - } - } - } + +template +Array 
join(const int dim, const Array &first, const Array &second) +{ + first.eval(); + second.eval(); + + // All dimensions except join dimension must be equal + // Compute output dims + af::dim4 odims; + af::dim4 fdims = first.dims(); + af::dim4 sdims = second.dims(); + + for(int i = 0; i < 4; i++) { + if(i == dim) { + odims[i] = fdims[i] + sdims[i]; + } else { + odims[i] = fdims[i]; } } - template - af::dim4 calcOffset(const af::dim4 dims) - { - af::dim4 offset; - offset[0] = (dim == 0) ? dims[0] : 0; - offset[1] = (dim == 1) ? dims[1] : 0; - offset[2] = (dim == 2) ? dims[2] : 0; - offset[3] = (dim == 3) ? dims[3] : 0; - return offset; - } + Array out = createEmptyArray(odims); - template - Array join(const int dim, const Array &first, const Array &second) - { - // All dimensions except join dimension must be equal - // Compute output dims - af::dim4 odims; - af::dim4 fdims = first.dims(); - af::dim4 sdims = second.dims(); - - for(int i = 0; i < 4; i++) { - if(i == dim) { - odims[i] = fdims[i] + sdims[i]; - } else { - odims[i] = fdims[i]; - } - } + getQueue().enqueue(kernel::join, out, dim, first, second); - Array out = createEmptyArray(odims); - - Tx* outPtr = out.get(); - const Tx* fptr = first.get(); - const Ty* sptr = second.get(); - - af::dim4 zero(0,0,0,0); - - switch(dim) { - case 0: - join_append(outPtr, fptr, zero, - odims, fdims, out.strides(), first.strides()); - join_append(outPtr, sptr, calcOffset<0>(fdims), - odims, sdims, out.strides(), second.strides()); - break; - case 1: - join_append(outPtr, fptr, zero, - odims, fdims, out.strides(), first.strides()); - join_append(outPtr, sptr, calcOffset<1>(fdims), - odims, sdims, out.strides(), second.strides()); - break; - case 2: - join_append(outPtr, fptr, zero, - odims, fdims, out.strides(), first.strides()); - join_append(outPtr, sptr, calcOffset<2>(fdims), - odims, sdims, out.strides(), second.strides()); - break; - case 3: - join_append(outPtr, fptr, zero, - odims, fdims, out.strides(), first.strides()); - 
join_append(outPtr, sptr, calcOffset<3>(fdims), - odims, sdims, out.strides(), second.strides()); - break; - } + return out; +} - return out; +template +Array join(const int dim, const std::vector> &inputs) +{ + for (unsigned i=0; i idims(n_arrays); + + dim_t dim_size = 0; + for(unsigned i = 0; i < idims.size(); i++) { + idims[i] = inputs[i].dims(); + dim_size += idims[i][dim]; } - template - void join_wrapper(const int dim, Array &out, const std::vector> &inputs) - { - af::dim4 zero(0,0,0,0); - af::dim4 d = zero; - switch(dim) { - case 0: - join_append(out.get(), inputs[0].get(), zero, - out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); - for(int i = 1; i < n_arrays; i++) { - d += inputs[i - 1].dims(); - join_append(out.get(), inputs[i].get(), calcOffset<0>(d), - out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); - } - break; - case 1: - join_append(out.get(), inputs[0].get(), zero, - out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); - for(int i = 1; i < n_arrays; i++) { - d += inputs[i - 1].dims(); - join_append(out.get(), inputs[i].get(), calcOffset<1>(d), - out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); - } - break; - case 2: - join_append(out.get(), inputs[0].get(), zero, - out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); - for(int i = 1; i < n_arrays; i++) { - d += inputs[i - 1].dims(); - join_append(out.get(), inputs[i].get(), calcOffset<2>(d), - out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); - } - break; - case 3: - join_append(out.get(), inputs[0].get(), zero, - out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); - for(int i = 1; i < n_arrays; i++) { - d += inputs[i - 1].dims(); - join_append(out.get(), inputs[i].get(), calcOffset<3>(d), - out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); - } - break; + for(int i = 0; i < 4; i++) { + if(i == dim) { + odims[i] = dim_size; + } else { + odims[i] = idims[0][i]; } } - 
template - Array join(const int dim, const std::vector> &inputs) - { - // All dimensions except join dimension must be equal - // Compute output dims - af::dim4 odims; - const dim_t n_arrays = inputs.size(); - std::vector idims(n_arrays); - - dim_t dim_size = 0; - for(int i = 0; i < (int)idims.size(); i++) { - idims[i] = inputs[i].dims(); - dim_size += idims[i][dim]; - } - - for(int i = 0; i < 4; i++) { - if(i == dim) { - odims[i] = dim_size; - } else { - odims[i] = idims[0][i]; - } - } - - Array out = createEmptyArray(odims); - - switch(n_arrays) { - case 1: - join_wrapper(dim, out, inputs); - break; - case 2: - join_wrapper(dim, out, inputs); - break; - case 3: - join_wrapper(dim, out, inputs); - break; - case 4: - join_wrapper(dim, out, inputs); - break; - case 5: - join_wrapper(dim, out, inputs); - break; - case 6: - join_wrapper(dim, out, inputs); - break; - case 7: - join_wrapper(dim, out, inputs); - break; - case 8: - join_wrapper(dim, out, inputs); - break; - case 9: - join_wrapper(dim, out, inputs); - break; - case 10: - join_wrapper(dim, out, inputs); - break; - } - - return out; + Array out = createEmptyArray(odims); + + switch(n_arrays) { + case 1: + getQueue().enqueue(kernel::join, dim, out, inputs); + break; + case 2: + getQueue().enqueue(kernel::join, dim, out, inputs); + break; + case 3: + getQueue().enqueue(kernel::join, dim, out, inputs); + break; + case 4: + getQueue().enqueue(kernel::join, dim, out, inputs); + break; + case 5: + getQueue().enqueue(kernel::join, dim, out, inputs); + break; + case 6: + getQueue().enqueue(kernel::join, dim, out, inputs); + break; + case 7: + getQueue().enqueue(kernel::join, dim, out, inputs); + break; + case 8: + getQueue().enqueue(kernel::join, dim, out, inputs); + break; + case 9: + getQueue().enqueue(kernel::join, dim, out, inputs); + break; + case 10: + getQueue().enqueue(kernel::join, dim, out, inputs); + break; } + return out; +} + #define INSTANTIATE(Tx, Ty) \ template Array join(const int dim, const Array 
&first, const Array &second); - INSTANTIATE(float, float) - INSTANTIATE(double, double) - INSTANTIATE(cfloat, cfloat) - INSTANTIATE(cdouble, cdouble) - INSTANTIATE(int, int) - INSTANTIATE(uint, uint) - INSTANTIATE(intl, intl) - INSTANTIATE(uintl, uintl) - INSTANTIATE(uchar, uchar) - INSTANTIATE(char, char) - INSTANTIATE(ushort, ushort) - INSTANTIATE(short, short) +INSTANTIATE(float, float) +INSTANTIATE(double, double) +INSTANTIATE(cfloat, cfloat) +INSTANTIATE(cdouble, cdouble) +INSTANTIATE(int, int) +INSTANTIATE(uint, uint) +INSTANTIATE(intl, intl) +INSTANTIATE(uintl, uintl) +INSTANTIATE(uchar, uchar) +INSTANTIATE(char, char) +INSTANTIATE(ushort, ushort) +INSTANTIATE(short, short) #undef INSTANTIATE #define INSTANTIATE(T) \ template Array join(const int dim, const std::vector> &inputs); - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(ushort) - INSTANTIATE(short) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(ushort) +INSTANTIATE(short) #undef INSTANTIATE } diff --git a/src/backend/cpu/kernel/Array.hpp b/src/backend/cpu/kernel/Array.hpp new file mode 100644 index 0000000000..08ade502e5 --- /dev/null +++ b/src/backend/cpu/kernel/Array.hpp @@ -0,0 +1,58 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void evalArray(Array in) +{ + in.setId(cpu::getActiveDeviceId()); + T *ptr = in.data.get(); + + af::dim4 odims = in.dims(); + af::dim4 ostrs = in.strides(); + + bool is_linear = in.node->isLinear(odims.get()); + + if (is_linear) { + int num = in.elements(); + for (int i = 0; i < num; i++) { + ptr[i] = *(T *)in.node->calc(i); + } + } else { + for (int w = 0; w < (int)odims[3]; w++) { + dim_t offw = w * ostrs[3]; + + for (int z = 0; z < (int)odims[2]; z++) { + dim_t offz = z * ostrs[2] + offw; + + for (int y = 0; y < (int)odims[1]; y++) { + dim_t offy = y * ostrs[1] + offz; + + for (int x = 0; x < (int)odims[0]; x++) { + dim_t id = x + offy; + + ptr[id] = *(T *)in.node->calc(x, y, z, w); + } + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/approx1.hpp b/src/backend/cpu/kernel/approx1.hpp new file mode 100644 index 0000000000..ab12ebc813 --- /dev/null +++ b/src/backend/cpu/kernel/approx1.hpp @@ -0,0 +1,144 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +struct approx1_op +{ + void operator()(InT *out, af::dim4 const & odims, dim_t const oElems, + InT const * const in, af::dim4 const & idims, dim_t const iElems, + LocT const * const pos, af::dim4 const & pdims, + af::dim4 const & ostrides, af::dim4 const & istrides, af::dim4 const & pstrides, + float const offGrid, bool const pBatch, + dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw) + { + return; + } +}; + +template +struct approx1_op +{ + void operator()(InT *out, af::dim4 const & odims, dim_t const oElems, + InT const * const in, af::dim4 const & idims, dim_t const iElems, + LocT const * const pos, af::dim4 const & pdims, + af::dim4 const & ostrides, af::dim4 const & istrides, af::dim4 const & pstrides, + float const offGrid, bool const pBatch, + dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw) + { + dim_t pmId = idx; + if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1]; + + LocT const x = pos[pmId]; + bool gFlag = false; + if (x < 0 || idims[0] < x+1) { // No need to check y + gFlag = true; + } + + dim_t const omId = idw * ostrides[3] + idz * ostrides[2] + + idy * ostrides[1] + idx; + if(gFlag) { + out[omId] = scalar(offGrid); + } else { + dim_t ioff = idw * istrides[3] + idz * istrides[2] + + idy * istrides[1]; + dim_t const iMem = round(x) + ioff; + + out[omId] = in[iMem]; + } + } +}; + +template +struct approx1_op +{ + void operator()(InT *out, af::dim4 const & odims, dim_t const oElems, + InT const * const in, af::dim4 const & idims, dim_t const iElems, + LocT const * const pos, af::dim4 const & pdims, + af::dim4 const & ostrides, af::dim4 const & istrides, af::dim4 const & pstrides, + float const offGrid, bool const pBatch, + dim_t 
const idx, dim_t const idy, dim_t const idz, dim_t const idw) + { + dim_t pmId = idx; + if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1]; + + LocT const x = pos[pmId]; + bool gFlag = false; + if (x < 0 || idims[0] < x+1) { + gFlag = true; + } + + dim_t const grid_x = floor(x); // nearest grid + LocT const off_x = x - grid_x; // fractional offset + + dim_t const omId = idw * ostrides[3] + idz * ostrides[2] + + idy * ostrides[1] + idx; + if(gFlag) { + out[omId] = scalar(offGrid); + } else { + dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x; + + // Check if x and x + 1 are both valid indices + bool cond = (x < idims[0] - 1); + // Compute Left and Right Weighted Values + InT yl = ((LocT)1.0 - off_x) * in[ioff]; + InT yr = cond ? (off_x) * in[ioff + 1] : scalar(0); + InT yo = yl + yr; + // Compute Weight used + LocT wt = cond ? (LocT)1.0 : (LocT)(1.0 - off_x); + // Write final value + out[omId] = (yo / wt); + } + } +}; + +template +void approx1(Array output, Array const input, + Array const position, float const offGrid) +{ + InT * out = output.get(); + InT const * const in = input.get(); + LocT const * const pos = position.get(); + + af::dim4 const odims = output.dims(); + af::dim4 const idims = input.dims(); + af::dim4 const pdims = position.dims(); + af::dim4 const ostrides = output.strides(); + af::dim4 const istrides = input.strides(); + af::dim4 const pstrides = position.strides(); + + dim_t const oElems = output.elements(); + dim_t const iElems = input.elements(); + + approx1_op op; + bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1); + + for(dim_t w = 0; w < odims[3]; w++) { + for(dim_t z = 0; z < odims[2]; z++) { + for(dim_t y = 0; y < odims[1]; y++) { + for(dim_t x = 0; x < odims[0]; x++) { + op(out, odims, oElems, in, idims, iElems, pos, pdims, + ostrides, istrides, pstrides, offGrid, pBatch, x, y, z, w); + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/approx2.hpp 
b/src/backend/cpu/kernel/approx2.hpp new file mode 100644 index 0000000000..b5115e2e49 --- /dev/null +++ b/src/backend/cpu/kernel/approx2.hpp @@ -0,0 +1,170 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +struct approx2_op +{ + void operator()(InT *out, af::dim4 const & odims, dim_t const oElems, + InT const * const in, af::dim4 const & idims, dim_t const iElems, + LocT const * const pos, af::dim4 const & pdims, LocT const * const qos, af::dim4 const & qdims, + af::dim4 const & ostrides, af::dim4 const & istrides, + af::dim4 const & pstrides, af::dim4 const & qstrides, + float const offGrid, bool const pBatch, + dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw) + { + return; + } +}; + +template +struct approx2_op +{ + void operator()(InT *out, af::dim4 const & odims, dim_t const oElems, + InT const * const in, af::dim4 const & idims, dim_t const iElems, + LocT const * const pos, af::dim4 const & pdims, LocT const * const qos, af::dim4 const & qdims, + af::dim4 const & ostrides, af::dim4 const & istrides, + af::dim4 const & pstrides, af::dim4 const & qstrides, + float const offGrid, bool const pBatch, + dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw) + { + dim_t pmId = idy * pstrides[1] + idx; + dim_t qmId = idy * qstrides[1] + idx; + if(pBatch) { + pmId += idw * pstrides[3] + idz * pstrides[2]; + qmId += idw * qstrides[3] + idz * qstrides[2]; + } + + bool gFlag = false; + LocT const x = pos[pmId], y = qos[qmId]; + if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { + gFlag = true; + } + + dim_t const omId = idw * ostrides[3] + idz * 
ostrides[2] + + idy * ostrides[1] + idx; + if(gFlag) { + out[omId] = scalar(offGrid); + } else { + dim_t const grid_x = round(x), grid_y = round(y); // nearest grid + dim_t const imId = idw * istrides[3] + idz * istrides[2] + + grid_y * istrides[1] + grid_x; + out[omId] = in[imId]; + } + } +}; + +template +struct approx2_op +{ + void operator()(InT *out, af::dim4 const & odims, dim_t const oElems, + InT const * const in, af::dim4 const & idims, dim_t const iElems, + LocT const * const pos, af::dim4 const & pdims, LocT const * const qos, af::dim4 const & qdims, + af::dim4 const & ostrides, af::dim4 const & istrides, + af::dim4 const & pstrides, af::dim4 const & qstrides, + float const offGrid, bool const pBatch, + dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw) + { + dim_t pmId = idy * pstrides[1] + idx; + dim_t qmId = idy * qstrides[1] + idx; + if(pBatch) { + pmId += idw * pstrides[3] + idz * pstrides[2]; + qmId += idw * qstrides[3] + idz * qstrides[2]; + } + + bool gFlag = false; + LocT const x = pos[pmId], y = qos[qmId]; + if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { + gFlag = true; + } + + dim_t const grid_x = floor(x), grid_y = floor(y); // nearest grid + LocT const off_x = x - grid_x, off_y = y - grid_y; // fractional offset + + // Check if pVal and pVal + 1 are both valid indices + bool condY = (y < idims[1] - 1); + bool condX = (x < idims[0] - 1); + + // Compute wieghts used + LocT wt00 = ((LocT)1.0 - off_x) * ((LocT)1.0 - off_y); + LocT wt10 = (condY) ? ((LocT)1.0 - off_x) * (off_y) : 0; + LocT wt01 = (condX) ? (off_x) * ((LocT)1.0 - off_y) : 0; + LocT wt11 = (condX && condY) ? 
(off_x) * (off_y) : 0; + + LocT wt = wt00 + wt10 + wt01 + wt11; + InT zero = scalar(0); + + dim_t const omId = idw * ostrides[3] + idz * ostrides[2] + + idy * ostrides[1] + idx; + if(gFlag) { + out[omId] = scalar(offGrid); + } else { + dim_t ioff = idw * istrides[3] + idz * istrides[2] + + grid_y * istrides[1] + grid_x; + + // Compute Weighted Values + InT y00 = wt00 * in[ioff]; + InT y10 = (condY) ? wt10 * in[ioff + istrides[1]] : zero; + InT y01 = (condX) ? wt01 * in[ioff + 1] : zero; + InT y11 = (condX && condY) ? wt11 * in[ioff + istrides[1] + 1] : zero; + + InT yo = y00 + y10 + y01 + y11; + + // Write Final Value + out[omId] = (yo / wt); + } + } +}; + +template +void approx2(Array output, Array const input, + Array const position, Array const qosition, + float const offGrid) +{ + InT * out = output.get(); + InT const * const in = input.get(); + LocT const * const pos = position.get(); + LocT const * const qos = qosition.get(); + af::dim4 const odims = output.dims(); + af::dim4 const idims = input.dims(); + af::dim4 const pdims = position.dims(); + af::dim4 const qdims = qosition.dims(); + af::dim4 const ostrides = output.strides(); + af::dim4 const istrides = input.strides(); + af::dim4 const pstrides = position.strides(); + af::dim4 const qstrides = qosition.strides(); + dim_t const oElems = output.elements(); + dim_t const iElems = input.elements(); + + approx2_op op; + bool pBatch = !(pdims[2] == 1 && pdims[3] == 1); + + for(dim_t w = 0; w < odims[3]; w++) { + for(dim_t z = 0; z < odims[2]; z++) { + for(dim_t y = 0; y < odims[1]; y++) { + for(dim_t x = 0; x < odims[0]; x++) { + op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims, + ostrides, istrides, pstrides, qstrides, offGrid, pBatch, x, y, z, w); + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/assign.hpp b/src/backend/cpu/kernel/assign.hpp new file mode 100644 index 0000000000..86befaf74e --- /dev/null +++ b/src/backend/cpu/kernel/assign.hpp @@ -0,0 +1,80 @@ 
+/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void assign(Array out, Array const rhs, std::vector const isSeq, + std::vector const seqs, std::vector< Array > const idxArrs) +{ + af::dim4 dDims = out.getDataDims(); + af::dim4 pDims = out.dims(); + // retrieve dimensions & strides for array to which rhs is being copied to + af::dim4 dst_offsets = toOffset(seqs, dDims); + af::dim4 dst_strides = toStride(seqs, dDims); + // retrieve rhs array dimenesions & strides + af::dim4 src_dims = rhs.dims(); + af::dim4 src_strides = rhs.strides(); + // declare pointers to af_array index data + uint const * const ptr0 = idxArrs[0].get(); + uint const * const ptr1 = idxArrs[1].get(); + uint const * const ptr2 = idxArrs[2].get(); + uint const * const ptr3 = idxArrs[3].get(); + + const T * src= rhs.get(); + T * dst = out.get(); + + for(dim_t l=0; l +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void bilateral(Array out, Array const in, float const s_sigma, float const c_sigma) +{ + af::dim4 const dims = in.dims(); + af::dim4 const istrides = in.strides(); + af::dim4 const ostrides = out.strides(); + + OutT *outData = out.get(); + InT const * inData = in.get(); + + // clamp spatical and chromatic sigma's + float space_ = std::min(11.5f, std::max(s_sigma, 0.f)); + float color_ = std::max(c_sigma, 0.f); + dim_t const radius = std::max((dim_t)(space_ * 1.5f), (dim_t)1); + float const svar = space_*space_; + float const cvar = color_*color_; + + for(dim_t b3=0; b3 +#include + +namespace cpu +{ +namespace kernel +{ + +template +void one2one_1d(InT *optr, InT 
const * const iptr, AccT const * const fptr, af::dim4 const & oDims, + af::dim4 const & sDims, af::dim4 const & fDims, af::dim4 const & sStrides) +{ + dim_t start = (Expand ? 0 : fDims[0]/2); + dim_t end = (Expand ? oDims[0] : start + sDims[0]); + for(dim_t i=start; i=0 &&iIdx +void one2one_2d(InT *optr, InT const * const iptr, AccT const * const fptr, af::dim4 const & oDims, + af::dim4 const & sDims, af::dim4 const & fDims, af::dim4 const & oStrides, + af::dim4 const & sStrides, af::dim4 const & fStrides) +{ + dim_t jStart = (Expand ? 0 : fDims[1]/2); + dim_t jEnd = (Expand ? oDims[1] : jStart + sDims[1]); + dim_t iStart = (Expand ? 0 : fDims[0]/2); + dim_t iEnd = (Expand ? oDims[0] : iStart + sDims[0]); + + for(dim_t j=jStart; j=0 && jIdx=0 && iIdx +void one2one_3d(InT *optr, InT const * const iptr, AccT const * const fptr, af::dim4 const & oDims, + af::dim4 const & sDims, af::dim4 const & fDims, af::dim4 const & oStrides, + af::dim4 const & sStrides, af::dim4 const & fStrides) +{ + dim_t kStart = (Expand ? 0 : fDims[2]/2); + dim_t kEnd = (Expand ? oDims[2] : kStart + sDims[2]); + dim_t jStart = (Expand ? 0 : fDims[1]/2); + dim_t jEnd = (Expand ? oDims[1] : jStart + sDims[1]); + dim_t iStart = (Expand ? 0 : fDims[0]/2); + dim_t iEnd = (Expand ? 
oDims[0] : iStart + sDims[0]); + + for(dim_t k=kStart; k=0 && kIdx=0 && jIdx=0 && iIdx +void convolve_nd(Array out, Array const signal, Array const filter, ConvolveBatchKind kind) +{ + InT * optr = out.get(); + InT const * const iptr = signal.get(); + AccT const * const fptr = filter.get(); + + af::dim4 const oDims = out.dims(); + af::dim4 const sDims = signal.dims(); + af::dim4 const fDims = filter.dims(); + + af::dim4 const oStrides = out.strides(); + af::dim4 const sStrides = signal.strides(); + af::dim4 const fStrides = filter.strides(); + + dim_t out_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ + dim_t in_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ + dim_t filt_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ + dim_t batch[4] = {0, 1, 1, 1}; /* first value is never used, and declared for code simplicity */ + + for (dim_t i=1; i<4; ++i) { + switch(kind) { + case CONVOLVE_BATCH_SIGNAL: + out_step[i] = oStrides[i]; + in_step[i] = sStrides[i]; + if (i>=baseDim) batch[i] = sDims[i]; + break; + case CONVOLVE_BATCH_SAME: + out_step[i] = oStrides[i]; + in_step[i] = sStrides[i]; + filt_step[i] = fStrides[i]; + if (i>=baseDim) batch[i] = sDims[i]; + break; + case CONVOLVE_BATCH_KERNEL: + out_step[i] = oStrides[i]; + filt_step[i] = fStrides[i]; + if (i>=baseDim) batch[i] = fDims[i]; + break; + default: + break; + } + } + + for (dim_t b3=0; b3(out, in, filt, oDims, sDims, fDims, sStrides); break; + case 2: one2one_2d(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break; + case 3: one2one_3d(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break; + } + } + } + } +} + +template +void convolve2_separable(InT *optr, InT const * const iptr, AccT const * const fptr, + af::dim4 const & oDims, af::dim4 const & sDims, af::dim4 const & orgDims, dim_t fDim, + af::dim4 const & oStrides, af::dim4 const & sStrides, 
dim_t fStride) +{ + for(dim_t j=0; j>1); + + for(dim_t i=0; i>1); + + AccT accum = scalar(0); + + for(dim_t f=0; f=0 && offi=0 && cj(0)); + } else { + dim_t offj = cj - f; + bool isCIValid = ci>=0 && ci=0 && offj(0)); + } + + accum += AccT(s_val * f_val); + } + optr[iOff+jOff] = InT(accum); + } + } +} + +template +void convolve2(Array out, Array const signal, + Array const c_filter, Array const r_filter, + af::dim4 const tDims) +{ + Array temp = createEmptyArray(tDims); + + dim_t cflen = (dim_t)c_filter.elements(); + dim_t rflen = (dim_t)r_filter.elements(); + + auto oDims = out.dims(); + auto sDims = signal.dims(); + + auto oStrides = out.strides(); + auto sStrides = signal.strides(); + auto tStrides = temp.strides(); + + for (dim_t b3=0; b3(tptr, iptr, c_filter.get(), + tDims, sDims, sDims, cflen, + tStrides, sStrides, c_filter.strides()[0]); + + convolve2_separable(optr, tptr, r_filter.get(), + oDims, tDims, sDims, rflen, + oStrides, tStrides, r_filter.strides()[0]); + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/copy.hpp b/src/backend/cpu/kernel/copy.hpp new file mode 100644 index 0000000000..70d6705ec2 --- /dev/null +++ b/src/backend/cpu/kernel/copy.hpp @@ -0,0 +1,90 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void stridedCopy(T* dst, af::dim4 const & ostrides, T const * src, + af::dim4 const & dims, af::dim4 const & strides, unsigned dim) +{ + if(dim == 0) { + if(strides[dim] == 1) { + //FIXME: Check for errors / exceptions + memcpy(dst, src, dims[dim] * sizeof(T)); + } else { + for(dim_t i = 0; i < dims[dim]; i++) { + dst[i] = src[strides[dim]*i]; + } + } + } else { + for(dim_t i = dims[dim]; i > 0; i--) { + stridedCopy(dst, ostrides, src, dims, strides, dim - 1); + src += strides[dim]; + dst += ostrides[dim]; + } + } +} + +template +void copy(Array dst, Array const src, OutT default_value, double factor) +{ + af::dim4 src_dims = src.dims(); + af::dim4 dst_dims = dst.dims(); + af::dim4 src_strides = src.strides(); + af::dim4 dst_strides = dst.strides(); + + InT const * const src_ptr = src.get(); + OutT * dst_ptr = dst.get(); + + dim_t trgt_l = std::min(dst_dims[3], src_dims[3]); + dim_t trgt_k = std::min(dst_dims[2], src_dims[2]); + dim_t trgt_j = std::min(dst_dims[1], src_dims[1]); + dim_t trgt_i = std::min(dst_dims[0], src_dims[0]); + + for(dim_t l=0; l +#include + +namespace cpu +{ +namespace kernel +{ + +template +void diagCreate(Array out, Array const in, int const num) +{ + int batch = in.dims()[1]; + int size = out.dims()[0]; + + T const * iptr = in.get(); + T * optr = out.get(); + + for (int k = 0; k < batch; k++) { + for (int j = 0; j < size; j++) { + for (int i = 0; i < size; i++) { + T val = scalar(0); + if (i == j - num) { + val = (num > 0) ? 
iptr[i] : iptr[j]; + } + optr[i + j * out.strides()[1]] = val; + } + } + optr += out.strides()[2]; + iptr += in.strides()[1]; + } +} + +template +void diagExtract(Array out, Array const in, int const num) +{ + dim4 const odims = out.dims(); + dim4 const idims = in.dims(); + + int const i_off = (num > 0) ? (num * in.strides()[1]) : (-num); + + for (int l = 0; l < (int)odims[3]; l++) { + + for (int k = 0; k < (int)odims[2]; k++) { + const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off; + T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2]; + + for (int i = 0; i < (int)odims[0]; i++) { + T val = scalar(0); + if (i < idims[0] && i < idims[1]) val = iptr[i * in.strides()[1] + i]; + optr[i] = val; + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/diff.hpp b/src/backend/cpu/kernel/diff.hpp new file mode 100644 index 0000000000..1a3d7ba110 --- /dev/null +++ b/src/backend/cpu/kernel/diff.hpp @@ -0,0 +1,86 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void diff1(Array out, Array const in, int const dim) +{ + af::dim4 dims = out.dims(); + // Bool for dimension + bool is_dim0 = dim == 0; + bool is_dim1 = dim == 1; + bool is_dim2 = dim == 2; + bool is_dim3 = dim == 3; + + T const * const inPtr = in.get(); + T * outPtr = out.get(); + + // TODO: Improve this + for(dim_t l = 0; l < dims[3]; l++) { + for(dim_t k = 0; k < dims[2]; k++) { + for(dim_t j = 0; j < dims[1]; j++) { + for(dim_t i = 0; i < dims[0]; i++) { + // Operation: out[index] = in[index + 1 * dim_size] - in[index] + int idx = getIdx(in.strides(), i, j, k, l); + int jdx = getIdx(in.strides(), + i + is_dim0, j + is_dim1, + k + is_dim2, l + is_dim3); + int odx = getIdx(out.strides(), i, j, k, l); + outPtr[odx] = inPtr[jdx] - inPtr[idx]; + } + } + } + } +} + +template +void diff2(Array out, Array const in, int const dim) +{ + af::dim4 dims = out.dims(); + // Bool for dimension + bool is_dim0 = dim == 0; + bool is_dim1 = dim == 1; + bool is_dim2 = dim == 2; + bool is_dim3 = dim == 3; + + T const * const inPtr = in.get(); + T * outPtr = out.get(); + + // TODO: Improve this + for(dim_t l = 0; l < dims[3]; l++) { + for(dim_t k = 0; k < dims[2]; k++) { + for(dim_t j = 0; j < dims[1]; j++) { + for(dim_t i = 0; i < dims[0]; i++) { + // Operation: out[index] = in[index + 1 * dim_size] - in[index] + int idx = getIdx(in.strides(), i, j, k, l); + int jdx = getIdx(in.strides(), + i + is_dim0, j + is_dim1, + k + is_dim2, l + is_dim3); + int kdx = getIdx(in.strides(), + i + 2 * is_dim0, j + 2 * is_dim1, + k + 2 * is_dim2, l + 2 * is_dim3); + int odx = getIdx(out.strides(), i, j, k, l); + outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx]; + } + } + } + } +} + +} +} diff --git 
a/src/backend/cpu/kernel/dot.hpp b/src/backend/cpu/kernel/dot.hpp new file mode 100644 index 0000000000..71f2c6f959 --- /dev/null +++ b/src/backend/cpu/kernel/dot.hpp @@ -0,0 +1,46 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template T +conj(T x) { return x; } + +template<> cfloat conj (cfloat c) { return std::conj(c); } +template<> cdouble conj(cdouble c) { return std::conj(c); } + +template +void dot(Array output, const Array lhs, const Array rhs, + af_mat_prop optLhs, af_mat_prop optRhs) +{ + int N = lhs.dims()[0]; + + T out = 0; + const T *pL = lhs.get(); + const T *pR = rhs.get(); + + for(int i = 0; i < N; i++) + out += (conjugate ? kernel::conj(pL[i]) : pL[i]) * pR[i]; + + if(both_conjugate) out = kernel::conj(out); + + *output.get() = out; + +} + +} +} diff --git a/src/backend/cpu/kernel/fast.hpp b/src/backend/cpu/kernel/fast.hpp new file mode 100644 index 0000000000..02da3e4d33 --- /dev/null +++ b/src/backend/cpu/kernel/fast.hpp @@ -0,0 +1,224 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +inline int idx_y(int i) +{ + if (i >= 8) + return clamp(-(i-8-4), -3, 3); + + return clamp(i-4, -3, 3); +} + +inline int idx_x(int i) +{ + if (i < 12) + return idx_y(i+4); + + return idx_y(i-12); +} + +inline int idx(int y, int x, unsigned idim0) +{ + return x * idim0 + y; +} + +// test_greater() +// Tests if a pixel x > p + thr +inline int test_greater(float x, float p, float thr) +{ + return (x >= p + thr); +} + +// test_smaller() +// Tests if a pixel x < p - thr +inline int test_smaller(float x, float p, float thr) +{ + return (x <= p - thr); +} + +// test_pixel() +// Returns -1 when x < p - thr +// Returns 0 when x >= p - thr && x <= p + thr +// Returns 1 when x > p + thr +template +inline int test_pixel(const T* image, const float p, float thr, int y, int x, unsigned idim0) +{ + return -test_smaller((float)image[idx(y,x,idim0)], p, thr) | test_greater((float)image[idx(y,x,idim0)], p, thr); +} + +// abs_diff() +// Returns absolute difference of x and y +inline int abs_diff(int x, int y) +{ + return abs(x - y); +} +inline unsigned abs_diff(unsigned x, unsigned y) +{ + return (unsigned)abs((int)x - (int)y); +} +inline float abs_diff(float x, float y) +{ + return fabs(x - y); +} +inline double abs_diff(double x, double y) +{ + return fabs(x - y); +} + +template +void locate_features(Array const & in, Array & score, + Array & x_out, Array & y_out, + Array & score_out, unsigned* count, float const thr, + unsigned const arc_length, unsigned const nonmax, + unsigned const max_feat, unsigned const edge) +{ + af::dim4 in_dims = in.dims(); + T const * in_ptr = in.get(); + + for (int y = edge; y < (int)(in_dims[0] - edge); y++) { + for (int x = edge; x < (int)(in_dims[1] - edge); x++) { + float p = in_ptr[idx(y, x, 
in_dims[0])]; + + // Start by testing opposite pixels of the circle that will result in + // a non-kepoint + int d; + d = test_pixel(in_ptr, p, thr, y-3, x, in_dims[0]) | test_pixel(in_ptr, p, thr, y+3, x, in_dims[0]); + if (d == 0) + continue; + + d &= test_pixel(in_ptr, p, thr, y-2, x+2, in_dims[0]) | test_pixel(in_ptr, p, thr, y+2, x-2, in_dims[0]); + d &= test_pixel(in_ptr, p, thr, y , x+3, in_dims[0]) | test_pixel(in_ptr, p, thr, y , x-3, in_dims[0]); + d &= test_pixel(in_ptr, p, thr, y+2, x+2, in_dims[0]) | test_pixel(in_ptr, p, thr, y-2, x-2, in_dims[0]); + if (d == 0) + continue; + + d &= test_pixel(in_ptr, p, thr, y-3, x+1, in_dims[0]) | test_pixel(in_ptr, p, thr, y+3, x-1, in_dims[0]); + d &= test_pixel(in_ptr, p, thr, y-1, x+3, in_dims[0]) | test_pixel(in_ptr, p, thr, y+1, x-3, in_dims[0]); + d &= test_pixel(in_ptr, p, thr, y+1, x+3, in_dims[0]) | test_pixel(in_ptr, p, thr, y-1, x-3, in_dims[0]); + d &= test_pixel(in_ptr, p, thr, y+3, x+1, in_dims[0]) | test_pixel(in_ptr, p, thr, y-3, x-1, in_dims[0]); + if (d == 0) + continue; + + int sum = 0; + + // Sum responses [-1, 0 or 1] of first arc_length pixels + for (int i = 0; i < static_cast(arc_length); i++) + sum += test_pixel(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]); + + // Test maximum and mininmum responses of first segment of arc_length + // pixels + int max_sum = 0, min_sum = 0; + max_sum = std::max(max_sum, sum); + min_sum = std::min(min_sum, sum); + + // Sum responses and test the remaining 16-arc_length pixels of the circle + for (int i = arc_length; i < 16; i++) { + sum -= test_pixel(in_ptr, p, thr, y+idx_y(i-arc_length), x+idx_x(i-arc_length), in_dims[0]); + sum += test_pixel(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]); + max_sum = std::max(max_sum, sum); + min_sum = std::min(min_sum, sum); + } + + // To completely test all possible segments, it's necessary to test + // segments that include the top junction of the circle + for (int i = 0; i < static_cast(arc_length-1); i++) { 
+ sum -= test_pixel(in_ptr, p, thr, y+idx_y(16-arc_length+i), x+idx_x(16-arc_length+i), in_dims[0]); + sum += test_pixel(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]); + max_sum = std::max(max_sum, sum); + min_sum = std::min(min_sum, sum); + } + + float s_bright = 0, s_dark = 0; + for (int i = 0; i < 16; i++) { + float p_x = (float)in_ptr[idx(y+idx_y(i), x+idx_x(i), in_dims[0])]; + + s_bright += test_greater(p_x, p, thr) * (abs_diff(p_x, p) - thr); + s_dark += test_smaller(p_x, p, thr) * (abs_diff(p, p_x) - thr); + } + + // If sum at some point was equal to (+-)arc_length, there is a segment + // that for which all pixels are much brighter or much brighter than + // central pixel p. + if (max_sum == static_cast(arc_length) || min_sum == -static_cast(arc_length)) { + unsigned j = *count; + ++*count; + if (j < max_feat) { + float *x_out_ptr = x_out.get(); + float *y_out_ptr = y_out.get(); + float *score_out_ptr = score_out.get(); + x_out_ptr[j] = static_cast(x); + y_out_ptr[j] = static_cast(y); + score_out_ptr[j] = static_cast(std::max(s_bright, s_dark)); + if (nonmax == 1) { + float* score_ptr = score.get(); + score_ptr[idx(y, x, in_dims[0])] = std::max(s_bright, s_dark); + } + } + } + } + } +} + +void non_maximal(Array const & score, const Array & x_in, const Array & y_in, + Array & x_out, Array & y_out, Array & score_out, + unsigned* count, unsigned const total_feat, unsigned const edge) +{ + float const * score_ptr = score.get(); + float const * x_in_ptr = x_in.get(); + float const * y_in_ptr = y_in.get(); + + af::dim4 score_dims = score.dims(); + + for (unsigned k = 0; k < total_feat; k++) { + unsigned x = static_cast(round(x_in_ptr[k])); + unsigned y = static_cast(round(y_in_ptr[k])); + + float v = score_ptr[y + score_dims[0] * x]; + float max_v; + max_v = std::max(score_ptr[y-1 + score_dims[0] * (x-1)], score_ptr[y-1 + score_dims[0] * x]); + max_v = std::max(max_v, score_ptr[y-1 + score_dims[0] * (x+1)]); + max_v = std::max(max_v, score_ptr[y + 
score_dims[0] * (x-1)]); + max_v = std::max(max_v, score_ptr[y + score_dims[0] * (x+1)]); + max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x-1)]); + max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x) ]); + max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x+1)]); + + if (y >= score_dims[1] - edge - 1 || y <= edge + 1 || + x >= score_dims[0] - edge - 1 || x <= edge + 1) + continue; + + // Stores keypoint to feat_out if it's response is maximum compared to + // its 8-neighborhood + if (v > max_v) { + unsigned j = *count; + ++*count; + + float *x_out_ptr = x_out.get(); + float *y_out_ptr = y_out.get(); + float *score_out_ptr = score_out.get(); + + x_out_ptr[j] = static_cast(x); + y_out_ptr[j] = static_cast(y); + score_out_ptr[j] = static_cast(v); + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/fft.hpp b/src/backend/cpu/kernel/fft.hpp new file mode 100644 index 0000000000..906c8ef5f5 --- /dev/null +++ b/src/backend/cpu/kernel/fft.hpp @@ -0,0 +1,192 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void computeDims(int rdims[rank], const af::dim4 &idims) +{ + for (int i = 0; i < rank; i++) { + rdims[i] = idims[(rank -1) - i]; + } +} + +template +struct fftw_transform; + +#define TRANSFORM(PRE, TY) \ + template<> \ + struct fftw_transform \ + { \ + typedef PRE##_plan plan_t; \ + typedef PRE##_complex ctype_t; \ + \ + template \ + plan_t create(Args... 
args) \ + { return PRE##_plan_many_dft(args...); } \ + void execute(plan_t plan) { return PRE##_execute(plan); } \ + void destroy(plan_t plan) { return PRE##_destroy_plan(plan); } \ + }; \ + + +TRANSFORM(fftwf, cfloat) +TRANSFORM(fftw, cdouble) + +template +struct fftw_real_transform; + +#define TRANSFORM_REAL(PRE, To, Ti, POST) \ + template<> \ + struct fftw_real_transform \ + { \ + typedef PRE##_plan plan_t; \ + typedef PRE##_complex ctype_t; \ + \ + template \ + plan_t create(Args... args) \ + { return PRE##_plan_many_dft_##POST(args...); } \ + void execute(plan_t plan) { return PRE##_execute(plan); } \ + void destroy(plan_t plan) { return PRE##_destroy_plan(plan); } \ + }; \ + + +TRANSFORM_REAL(fftwf, cfloat , float , r2c) +TRANSFORM_REAL(fftw , cdouble, double, r2c) +TRANSFORM_REAL(fftwf, float , cfloat , c2r) +TRANSFORM_REAL(fftw , double, cdouble, c2r) + + +template +void fft_inplace(Array in) +{ + int t_dims[rank]; + int in_embed[rank]; + + const af::dim4 idims = in.dims(); + + computeDims(t_dims , idims); + computeDims(in_embed , in.getDataDims()); + + const af::dim4 istrides = in.strides(); + + typedef typename fftw_transform::ctype_t ctype_t; + typename fftw_transform::plan_t plan; + + fftw_transform transform; + + int batch = 1; + for (int i = rank; i < 4; i++) { + batch *= idims[i]; + } + + plan = transform.create(rank, + t_dims, + (int)batch, + (ctype_t *)in.get(), + in_embed, (int)istrides[0], + (int)istrides[rank], + (ctype_t *)in.get(), + in_embed, (int)istrides[0], + (int)istrides[rank], + direction ? 
FFTW_FORWARD : FFTW_BACKWARD, + FFTW_ESTIMATE); + + transform.execute(plan); + transform.destroy(plan); +} + +template +void fft_r2c(Array out, const Array in) +{ + af::dim4 idims = in.dims(); + + int t_dims[rank]; + int in_embed[rank]; + int out_embed[rank]; + + computeDims(t_dims , idims); + computeDims(in_embed , in.getDataDims()); + computeDims(out_embed , out.getDataDims()); + + const af::dim4 istrides = in.strides(); + const af::dim4 ostrides = out.strides(); + + typedef typename fftw_real_transform::ctype_t ctype_t; + typename fftw_real_transform::plan_t plan; + + fftw_real_transform transform; + + int batch = 1; + for (int i = rank; i < 4; i++) { + batch *= idims[i]; + } + + plan = transform.create(rank, + t_dims, + (int)batch, + (Tr *)in.get(), + in_embed, (int)istrides[0], + (int)istrides[rank], + (ctype_t *)out.get(), + out_embed, (int)ostrides[0], + (int)ostrides[rank], + FFTW_ESTIMATE); + + transform.execute(plan); + transform.destroy(plan); +} + +template +void fft_c2r(Array out, const Array in, const af::dim4 odims) +{ + int t_dims[rank]; + int in_embed[rank]; + int out_embed[rank]; + + computeDims(t_dims , odims); + computeDims(in_embed , in.getDataDims()); + computeDims(out_embed , out.getDataDims()); + + const af::dim4 istrides = in.strides(); + const af::dim4 ostrides = out.strides(); + + typedef typename fftw_real_transform::ctype_t ctype_t; + typename fftw_real_transform::plan_t plan; + + fftw_real_transform transform; + + int batch = 1; + for (int i = rank; i < 4; i++) { + batch *= odims[i]; + } + + plan = transform.create(rank, + t_dims, + (int)batch, + (ctype_t *)in.get(), + in_embed, (int)istrides[0], + (int)istrides[rank], + (Tr *)out.get(), + out_embed, (int)ostrides[0], + (int)ostrides[rank], + FFTW_ESTIMATE); + + transform.execute(plan); + transform.destroy(plan); +} + +} +} diff --git a/src/backend/cpu/kernel/fftconvolve.hpp b/src/backend/cpu/kernel/fftconvolve.hpp new file mode 100644 index 0000000000..ad586f7d28 --- /dev/null +++ 
b/src/backend/cpu/kernel/fftconvolve.hpp @@ -0,0 +1,256 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void packData(Array out, const af::dim4 od, const af::dim4 os, Array const in) +{ + To* out_ptr = out.get(); + + const af::dim4 id = in.dims(); + const af::dim4 is = in.strides(); + const Ti* in_ptr = in.get(); + + int id0_half = divup(id[0], 2); + bool odd_id0 = (id[0] % 2 == 1); + + for (int d3 = 0; d3 < (int)od[3]; d3++) { + for (int d2 = 0; d2 < (int)od[2]; d2++) { + for (int d1 = 0; d1 < (int)od[1]; d1++) { + for (int d0 = 0; d0 < (int)od[0] / 2; d0++) { + const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; + + if (d0 < (int)id0_half && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) { + const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0; + out_ptr[oidx] = (To)in_ptr[iidx]; + if (d0 == id0_half-1 && odd_id0) + out_ptr[oidx+1] = (To)0; + else + out_ptr[oidx+1] = (To)in_ptr[iidx+id0_half]; + } + else { + // Pad remaining elements with 0s + out_ptr[oidx] = (To)0; + out_ptr[oidx+1] = (To)0; + } + } + } + } + } +} + +template +void padArray(Array out, const af::dim4 od, const af::dim4 os, + Array const in, const dim_t offset) +{ + To* out_ptr = out.get() + offset; + const af::dim4 id = in.dims(); + const af::dim4 is = in.strides(); + const Ti* in_ptr = in.get(); + + for (int d3 = 0; d3 < (int)od[3]; d3++) { + for (int d2 = 0; d2 < (int)od[2]; d2++) { + for (int d1 = 0; d1 < (int)od[1]; d1++) { + for (int d0 = 0; d0 < (int)od[0] / 2; d0++) { + const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; + + if (d0 < (int)id[0] && d1 < (int)id[1] && d2 < 
(int)id[2] && d3 < (int)id[3]) { + // Copy input elements to real elements, set imaginary elements to 0 + const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0; + out_ptr[oidx] = (To)in_ptr[iidx]; + out_ptr[oidx+1] = (To)0; + } + else { + // Pad remaining of the matrix to 0s + out_ptr[oidx] = (To)0; + out_ptr[oidx+1] = (To)0; + } + } + } + } + } +} + +template +void complexMultiply(Array packed, const af::dim4 sig_dims, const af::dim4 sig_strides, + const af::dim4 fit_dims, const af::dim4 fit_strides, + ConvolveBatchKind kind, const dim_t offset) +{ + T* out_ptr = packed.get() + (kind==CONVOLVE_BATCH_KERNEL? offset : 0); + T* in1_ptr = packed.get(); + T* in2_ptr = packed.get() + offset; + + const af::dim4& od = (kind==CONVOLVE_BATCH_KERNEL ? fit_dims : sig_dims); + const af::dim4& os = (kind==CONVOLVE_BATCH_KERNEL ? fit_strides : sig_strides); + const af::dim4& i1d = sig_dims; + const af::dim4& i2d = fit_dims; + const af::dim4& i1s = sig_strides; + const af::dim4& i2s = fit_strides; + + for (int d3 = 0; d3 < (int)od[3]; d3++) { + for (int d2 = 0; d2 < (int)od[2]; d2++) { + for (int d1 = 0; d1 < (int)od[1]; d1++) { + for (int d0 = 0; d0 < (int)od[0] / 2; d0++) { + if (kind == CONVOLVE_BATCH_NONE || kind == CONVOLVE_BATCH_SAME) { + // Complex multiply each signal to equivalent filter + const int ridx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; + const int iidx = ridx + 1; + + T a = in1_ptr[ridx]; + T b = in1_ptr[iidx]; + T c = in2_ptr[ridx]; + T d = in2_ptr[iidx]; + + T ac = a*c; + T bd = b*d; + + out_ptr[ridx] = ac - bd; + out_ptr[iidx] = (a+b) * (c+d) - ac - bd; + } + else if (kind == CONVOLVE_BATCH_SIGNAL) { + // Complex multiply all signals to filter + const int ridx1 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; + const int iidx1 = ridx1 + 1; + const int ridx2 = ridx1 % (i2s[3] * i2d[3]); + const int iidx2 = iidx1 % (i2s[3] * i2d[3]); + + T a = in1_ptr[ridx1]; + T b = in1_ptr[iidx1]; + T c = in2_ptr[ridx2]; + T d = in2_ptr[iidx2]; + + T ac = a*c; + T bd = b*d; + + 
out_ptr[ridx1] = ac - bd; + out_ptr[iidx1] = (a+b) * (c+d) - ac - bd; + } + else if (kind == CONVOLVE_BATCH_KERNEL) { + // Complex multiply signal to all filters + const int ridx2 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; + const int iidx2 = ridx2 + 1; + const int ridx1 = ridx2 % (i1s[3] * i1d[3]); + const int iidx1 = iidx2 % (i1s[3] * i1d[3]); + + T a = in1_ptr[ridx1]; + T b = in1_ptr[iidx1]; + T c = in2_ptr[ridx2]; + T d = in2_ptr[iidx2]; + + T ac = a*c; + T bd = b*d; + + out_ptr[ridx2] = ac - bd; + out_ptr[iidx2] = (a+b) * (c+d) - ac - bd; + } + } + } + } + } +} + +template +void reorderHelper(To* out_ptr, const af::dim4& od, const af::dim4& os, + const Ti* in_ptr, const af::dim4& id, const af::dim4& is, + const af::dim4& fd, const int half_di0, const int baseDim, + const int fftScale, const bool expand) +{ + for (int d3 = 0; d3 < (int)od[3]; d3++) { + for (int d2 = 0; d2 < (int)od[2]; d2++) { + for (int d1 = 0; d1 < (int)od[1]; d1++) { + for (int d0 = 0; d0 < (int)od[0]; d0++) { + int id0, id1, id2, id3; + if (expand) { + id0 = d0; + id1 = d1 * is[1]; + id2 = d2 * is[2]; + id3 = d3 * is[3]; + } + else { + id0 = d0 + fd[0]/2; + id1 = (d1 + (baseDim > 1)*(fd[1]/2)) * is[1]; + id2 = (d2 + (baseDim > 2)*(fd[2]/2)) * is[2]; + id3 = d3 * is[3]; + } + + int oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0; + + // Divide output elements to cuFFT resulting scale, round result if output + // type is single or double precision floating-point + if (id0 < half_di0) { + // Copy top elements + int iidx = id3 + id2 + id1 + id0 * 2; + if (roundOut) + out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale)); + else + out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale); + } + else if (id0 < half_di0 + (int)fd[0] - 1) { + // Add signal and filter elements to central part + int iidx1 = id3 + id2 + id1 + id0 * 2; + int iidx2 = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1; + if (roundOut) + out_ptr[oidx] = (To)roundf((float)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale)); + else + out_ptr[oidx] = 
(To)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale); + } + else { + // Copy bottom elements + const int iidx = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1; + if (roundOut) + out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale)); + else + out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale); + } + } + } + } + } +} + +template +void reorder(Array out, Array packed, + const Array filter, const dim_t sig_half_d0, const dim_t fftScale, + const dim4 sig_tmp_dims, const dim4 sig_tmp_strides, + const dim4 filter_tmp_dims, const dim4 filter_tmp_strides, + bool expand, ConvolveBatchKind kind) +{ + T* out_ptr = out.get(); + const af::dim4 out_dims = out.dims(); + const af::dim4 out_strides = out.strides(); + + const af::dim4 filter_dims = filter.dims(); + + convT* packed_ptr = packed.get(); + convT* sig_tmp_ptr = packed_ptr; + convT* filter_tmp_ptr = packed_ptr + sig_tmp_strides[3] * sig_tmp_dims[3]; + + // Reorder the output + if (kind == CONVOLVE_BATCH_KERNEL) { + reorderHelper(out_ptr, out_dims, out_strides, + filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, + filter_dims, sig_half_d0, baseDim, fftScale, expand); + } else { + reorderHelper(out_ptr, out_dims, out_strides, + sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, + filter_dims, sig_half_d0, baseDim, fftScale, expand); + } +} + +} +} diff --git a/src/backend/cpu/kernel/gradient.hpp b/src/backend/cpu/kernel/gradient.hpp new file mode 100644 index 0000000000..1ab01abb0f --- /dev/null +++ b/src/backend/cpu/kernel/gradient.hpp @@ -0,0 +1,89 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void gradient(Array grad0, Array grad1, Array const in) +{ + const af::dim4 dims = in.dims(); + + T *d_grad0 = grad0.get(); + T *d_grad1 = grad1.get(); + const T *d_in = in.get(); + + const af::dim4 inst = in.strides(); + const af::dim4 g0st = grad0.strides(); + const af::dim4 g1st = grad1.strides(); + + T v5 = scalar(0.5); + T v1 = scalar(1.0); + + for(dim_t idw = 0; idw < dims[3]; idw++) { + const dim_t inW = idw * inst[3]; + const dim_t g0W = idw * g0st[3]; + const dim_t g1W = idw * g1st[3]; + for(dim_t idz = 0; idz < dims[2]; idz++) { + const dim_t inZW = inW + idz * inst[2]; + const dim_t g0ZW = g0W + idz * g0st[2]; + const dim_t g1ZW = g1W + idz * g1st[2]; + dim_t xl, xr, yl,yr; + T f0, f1; + for(dim_t idy = 0; idy < dims[1]; idy++) { + const dim_t inYZW = inZW + idy * inst[1]; + const dim_t g0YZW = g0ZW + idy * g0st[1]; + const dim_t g1YZW = g1ZW + idy * g1st[1]; + if(idy == 0) { + yl = inYZW + inst[1]; + yr = inYZW; + f1 = v1; + } else if(idy == dims[1] - 1) { + yl = inYZW; + yr = inYZW - inst[1]; + f1 = v1; + } else { + yl = inYZW + inst[1]; + yr = inYZW - inst[1]; + f1 = v5; + } + for(dim_t idx = 0; idx < dims[0]; idx++) { + const dim_t inMem = inYZW + idx; + const dim_t g0Mem = g0YZW + idx; + const dim_t g1Mem = g1YZW + idx; + if(idx == 0) { + xl = inMem + 1; + xr = inMem; + f0 = v1; + } else if(idx == dims[0] - 1) { + xl = inMem; + xr = inMem - 1; + f0 = v1; + } else { + xl = inMem + 1; + xr = inMem - 1; + f0 = v5; + } + + d_grad0[g0Mem] = f0 * (d_in[xl] - d_in[xr]); + d_grad1[g1Mem] = f1 * (d_in[yl + idx] - d_in[yr + idx]); + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/harris.hpp b/src/backend/cpu/kernel/harris.hpp new file mode 100644 index 0000000000..183cf37e77 --- /dev/null +++ 
b/src/backend/cpu/kernel/harris.hpp @@ -0,0 +1,124 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void second_order_deriv(Array ixx, Array ixy, Array iyy, + const unsigned in_len, const Array ix, const Array iy) +{ + T* ixx_out = ixx.get(); + T* ixy_out = ixy.get(); + T* iyy_out = iyy.get(); + const T* ix_in = ix.get(); + const T* iy_in = iy.get(); + for (unsigned x = 0; x < in_len; x++) { + ixx_out[x] = ix_in[x] * ix_in[x]; + ixy_out[x] = ix_in[x] * iy_in[x]; + iyy_out[x] = iy_in[x] * iy_in[x]; + } +} + +template +void harris_responses(Array resp, const unsigned idim0, const unsigned idim1, + const Array ixx, const Array ixy, const Array iyy, + const float k_thr, const unsigned border_len) +{ + T* resp_out = resp.get(); + const T* ixx_in = ixx.get(); + const T* ixy_in = ixy.get(); + const T* iyy_in = iyy.get(); + const unsigned r = border_len; + + for (unsigned x = r; x < idim1 - r; x++) { + for (unsigned y = r; y < idim0 - r; y++) { + const unsigned idx = x * idim0 + y; + + // Calculates matrix trace and determinant + T tr = ixx_in[idx] + iyy_in[idx]; + T det = ixx_in[idx] * iyy_in[idx] - ixy_in[idx] * ixy_in[idx]; + + // Calculates local Harris response + resp_out[idx] = det - k_thr * (tr*tr); + } + } +} + +template +void non_maximal(Array xOut, Array yOut, Array respOut, unsigned* count, + const unsigned idim0, const unsigned idim1, const Array respIn, + const float min_resp, const unsigned border_len, const unsigned max_corners) +{ + float* x_out = xOut.get(); + float* y_out = yOut.get(); + float* resp_out = respOut.get(); + const T* resp_in = respIn.get(); + // Responses 
on the border don't have 8-neighbors to compare, discard them + const unsigned r = border_len + 1; + + for (unsigned x = r; x < idim1 - r; x++) { + for (unsigned y = r; y < idim0 - r; y++) { + const T v = resp_in[x * idim0 + y]; + + // Find maximum neighborhood response + T max_v; + max_v = max(resp_in[(x-1) * idim0 + y-1], resp_in[x * idim0 + y-1]); + max_v = max(max_v, resp_in[(x+1) * idim0 + y-1]); + max_v = max(max_v, resp_in[(x-1) * idim0 + y ]); + max_v = max(max_v, resp_in[(x+1) * idim0 + y ]); + max_v = max(max_v, resp_in[(x-1) * idim0 + y+1]); + max_v = max(max_v, resp_in[(x) * idim0 + y+1]); + max_v = max(max_v, resp_in[(x+1) * idim0 + y+1]); + + // Stores corner to {x,y,resp}_out if it's response is maximum compared + // to its 8-neighborhood and greater or equal minimum response + if (v > max_v && v >= (T)min_resp) { + const unsigned idx = *count; + *count += 1; + if (idx < max_corners) { + x_out[idx] = (float)x; + y_out[idx] = (float)y; + resp_out[idx] = (float)v; + } + } + } + } +} + +static void keep_corners(Array xOut, Array yOut, Array respOut, + const Array xIn, const Array yIn, + const Array respIn, const Array respIdx, + const unsigned n_corners) +{ + float* x_out = xOut.get(); + float* y_out = yOut.get(); + float* resp_out = respOut.get(); + const float* x_in = xIn.get(); + const float* y_in = yIn.get(); + const float* resp_in = respIn.get(); + const uint* resp_idx = respIdx.get(); + + // Keep only the first n_feat features + for (unsigned f = 0; f < n_corners; f++) { + x_out[f] = x_in[resp_idx[f]]; + y_out[f] = y_in[resp_idx[f]]; + resp_out[f] = resp_in[f]; + } +} + +} +} diff --git a/src/backend/cpu/kernel/histogram.hpp b/src/backend/cpu/kernel/histogram.hpp new file mode 100644 index 0000000000..9b9b897c02 --- /dev/null +++ b/src/backend/cpu/kernel/histogram.hpp @@ -0,0 +1,49 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. 
+ * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void histogram(Array out, Array const in, + unsigned const nbins, double const minval, double const maxval) +{ + dim4 const outDims = out.dims(); + float const step = (maxval - minval)/(float)nbins; + dim4 const inDims = in.dims(); + dim4 const iStrides = in.strides(); + dim4 const oStrides = out.strides(); + dim_t const nElems = inDims[0]*inDims[1]; + + OutT *outData = out.get(); + const InT* inData= in.get(); + + for(dim_t b3 = 0; b3 < outDims[3]; b3++) { + for(dim_t b2 = 0; b2 < outDims[2]; b2++) { + for(dim_t i=0; i +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void hsv2rgb(Array out, Array const in) +{ + const af::dim4 dims = in.dims(); + const af::dim4 strides = in.strides(); + dim_t obStride = out.strides()[3]; + dim_t coff = strides[2]; + dim_t bCount = dims[3]; + + for(dim_t b=0; b +void rgb2hsv(Array out, Array const in) +{ + const af::dim4 dims = in.dims(); + const af::dim4 strides = in.strides(); + af::dim4 oStrides = out.strides(); + dim_t bCount = dims[3]; + + for(dim_t b=0; b +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void identity(Array out) +{ + T *ptr = out.get(); + const af::dim4 out_dims = out.dims(); + + for (dim_t k = 0; k < out_dims[2] * out_dims[3]; k++) { + for (dim_t j = 0; j < out_dims[1]; j++) { + for (dim_t i = 0; i < out_dims[0]; i++) { + ptr[j * out_dims[0] + i] = (i == j) ? 
scalar(1) : scalar(0); + } + } + ptr += out_dims[0] * out_dims[1]; + } +} + +} +} diff --git a/src/backend/cpu/kernel/iir.hpp b/src/backend/cpu/kernel/iir.hpp new file mode 100644 index 0000000000..5182094fc2 --- /dev/null +++ b/src/backend/cpu/kernel/iir.hpp @@ -0,0 +1,61 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void iir(Array y, Array c, Array const a) +{ + dim4 ydims = c.dims(); + int num_a = a.dims()[0]; + + for (int l = 0; l < (int)ydims[3]; l++) { + dim_t yidx3 = l * y.strides()[3]; + dim_t cidx3 = l * c.strides()[3]; + dim_t aidx3 = l * a.strides()[3]; + + for (int k = 0; k < (int)ydims[2]; k++) { + + dim_t yidx2 = k * y.strides()[2] + yidx3; + dim_t cidx2 = k * c.strides()[2] + cidx3; + dim_t aidx2 = k * a.strides()[2] + aidx3; + + for (int j = 0; j < (int)ydims[1]; j++) { + + dim_t yidx1 = j * y.strides()[1] + yidx2; + dim_t cidx1 = j * c.strides()[1] + cidx2; + dim_t aidx1 = j * a.strides()[1] + aidx2; + + std::vector h_z(num_a); + + const T *h_a = a.get() + (a.ndims() > 1 ? aidx1 : 0); + T *h_c = c.get() + cidx1; + T *h_y = y.get() + yidx1; + + for (int i = 0; i < (int)ydims[0]; i++) { + + T y = h_y[i] = (h_c[i] + h_z[0]) / h_a[0]; + for (int ii = 1; ii < num_a; ii++) { + h_z[ii - 1] = h_z[ii] - h_a[ii] * y; + } + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/index.hpp b/src/backend/cpu/kernel/index.hpp new file mode 100644 index 0000000000..343d7ae4e7 --- /dev/null +++ b/src/backend/cpu/kernel/index.hpp @@ -0,0 +1,71 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. 
+ * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void index(Array out, Array const in, + std::vector const isSeq, std::vector const seqs, + std::vector< Array > const idxArrs) +{ + const af::dim4 iDims = in.dims(); + const af::dim4 dDims = in.getDataDims(); + const af::dim4 iOffs = toOffset(seqs, dDims); + const af::dim4 iStrds = toStride(seqs, dDims); + const af::dim4 oDims = out.dims(); + const af::dim4 oStrides = out.strides(); + const T *src = in.get(); + T *dst = out.get(); + const uint* ptr0 = idxArrs[0].get(); + const uint* ptr1 = idxArrs[1].get(); + const uint* ptr2 = idxArrs[2].get(); + const uint* ptr3 = idxArrs[3].get(); + + for (dim_t l=0; l +#include + +namespace cpu +{ +namespace kernel +{ + +template +void iota(Array output, const af::dim4 &sdims, const af::dim4 &tdims) +{ + const af::dim4 dims = output.dims(); + T* out = output.get(); + const af::dim4 strides = output.strides(); + + for(dim_t w = 0; w < dims[3]; w++) { + dim_t offW = w * strides[3]; + T valW = (w % sdims[3]) * sdims[0] * sdims[1] * sdims[2]; + for(dim_t z = 0; z < dims[2]; z++) { + dim_t offWZ = offW + z * strides[2]; + T valZ = valW + (z % sdims[2]) * sdims[0] * sdims[1]; + for(dim_t y = 0; y < dims[1]; y++) { + dim_t offWZY = offWZ + y * strides[1]; + T valY = valZ + (y % sdims[1]) * sdims[0]; + for(dim_t x = 0; x < dims[0]; x++) { + dim_t id = offWZY + x; + out[id] = valY + (x % sdims[0]); + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/ireduce.hpp b/src/backend/cpu/kernel/ireduce.hpp new file mode 100644 index 0000000000..848885515b --- /dev/null +++ b/src/backend/cpu/kernel/ireduce.hpp @@ -0,0 +1,108 @@ +/******************************************************* + * Copyright (c) 2015, 
ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include
+#include
+
+namespace cpu
+{
+namespace kernel
+{
+// cabs(): magnitude used as the comparison key below -- generic values are cast to double, char maps to 0/1, cfloat/cdouble go through abs(). NOTE(review): every '<...>' (template lists, #include targets) is missing in this copy.
+template double cabs(const T in) { return (double)in; }
+static double cabs(const char in) { return (double)(in > 0); }
+static double cabs(const cfloat &in) { return (double)abs(in); }
+static double cabs(const cdouble &in) { return (double)abs(in); }
+// MinMaxOp (this variant): running state holding the value with the smallest cabs() seen so far together with its index.
+template
+struct MinMaxOp
+{
+    T m_val;     // value currently selected
+    uint m_idx;  // index that came with m_val
+    MinMaxOp(T val, uint idx) :
+        m_val(val), m_idx(idx)
+    {
+    }
+    // Takes (val, idx) when its magnitude is strictly smaller, or equal with a larger index than the stored one.
+    void operator()(T val, uint idx)
+    {
+        if (cabs(val) < cabs(m_val) ||
+            (cabs(val) == cabs(m_val) &&
+             idx > m_idx)) {
+            m_val = val;
+            m_idx = idx;
+        }
+    }
+};
+// MinMaxOp (second variant): keeps the value with the largest cabs(). NOTE(review): same struct name as above, so this is presumably a specialization whose argument list was stripped -- confirm against the original patch.
+template
+struct MinMaxOp
+{
+    T m_val;     // value currently selected
+    uint m_idx;  // index that came with m_val
+    MinMaxOp(T val, uint idx) :
+        m_val(val), m_idx(idx)
+    {
+    }
+    // Takes (val, idx) when its magnitude is strictly larger, or equal with idx <= the stored index.
+    void operator()(T val, uint idx)
+    {
+        if (cabs(val) > cabs(m_val) ||
+            (cabs(val) == cabs(m_val) &&
+             idx <= m_idx)) {
+            m_val = val;
+            m_idx = idx;
+        }
+    }
+};
+// ireduce_dim (recursive step): walks dimension D-1 of the output, shifting the output and input offsets by i * stride, and hands each slice to another ireduce_dim call.
+template
+struct ireduce_dim
+{
+    void operator()(Array output, Array locArray, const dim_t outOffset,
+                    const Array input, const dim_t inOffset, const int dim)
+    {
+        const af::dim4 odims = output.dims();
+        const af::dim4 ostrides = output.strides();
+        const af::dim4 istrides = input.strides();
+        const int D1 = D - 1;
+        for (dim_t i = 0; i < odims[D1]; i++) {
+            ireduce_dim()(output, locArray, outOffset + i * ostrides[D1], // NOTE(review): template arguments of this call were stripped; presumably the D1 instance
+                          input, inOffset + i * istrides[D1], dim);
+        }
+    }
+};
+// ireduce_dim (terminal step): reduces the run of elements along `dim` that starts at inOffset and stores the winning value and index at outOffset. NOTE(review): presumably the specialization that ends the recursion above.
+template
+struct ireduce_dim
+{
+    void operator()(Array output, Array locArray, const dim_t outOffset,
+                    const Array input, const dim_t inOffset, const int dim)
+    {
+        const af::dim4 idims = input.dims();
+        const af::dim4 istrides = input.strides();
+
+        T const * const in = input.get();
+        T * out = output.get();
+        uint * loc = locArray.get();
+        // Seed with the first element (index 0); the loop below visits that element again before moving on.
+        dim_t stride = istrides[dim];
+        MinMaxOp Op(in[inOffset], 0);
+        for (dim_t i = 0; i < idims[dim]; i++) {
+            Op(in[inOffset + i * stride], i);
+        }
+
+        out[outOffset] = Op.m_val;
+        loc[outOffset] = Op.m_idx;
+    }
+};
+
+}
+}
diff --git a/src/backend/cpu/kernel/join.hpp b/src/backend/cpu/kernel/join.hpp
new file mode 100644
index 0000000000..b0d92c9978
--- /dev/null
+++ b/src/backend/cpu/kernel/join.hpp
@@ -0,0 +1,144 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include
+#include
+
+namespace cpu
+{
+namespace kernel
+{
+// calcOffset: returns a dim4 that is zero everywhere except position `dim`, which holds dims[dim]. NOTE(review): `dim` is not declared in the visible text; presumably a stripped template parameter.
+template
+af::dim4 calcOffset(const af::dim4 dims)
+{
+    af::dim4 offset;
+    offset[0] = (dim == 0) ? dims[0] : 0;
+    offset[1] = (dim == 1) ? dims[1] : 0;
+    offset[2] = (dim == 2) ? dims[2] : 0;
+    offset[3] = (dim == 3) ? dims[3] : 0;
+    return offset;
+}
+// join_append: copies every element of X (extent xdims, strides xst) into `out` at the same coordinate shifted by `offset` (strides ost). Dimension 0 is addressed without a stride; `odims` is not read.
+template
+void join_append(To *out, const Tx *X, const af::dim4 &offset,
+                 const af::dim4 &odims, const af::dim4 &xdims,
+                 const af::dim4 &ost, const af::dim4 &xst)
+{
+    for(dim_t ow = 0; ow < xdims[3]; ow++) {
+        const dim_t xW = ow * xst[3];
+        const dim_t oW = (ow + offset[3]) * ost[3];
+
+        for(dim_t oz = 0; oz < xdims[2]; oz++) {
+            const dim_t xZW = xW + oz * xst[2];
+            const dim_t oZW = oW + (oz + offset[2]) * ost[2];
+
+            for(dim_t oy = 0; oy < xdims[1]; oy++) {
+                const dim_t xYZW = xZW + oy * xst[1];
+                const dim_t oYZW = oZW + (oy + offset[1]) * ost[1];
+
+                for(dim_t ox = 0; ox < xdims[0]; ox++) {
+                    const dim_t iMem = xYZW + ox;
+                    const dim_t oMem = oYZW + (ox + offset[0]);
+                    out[oMem] = X[iMem];
+                }
+            }
+        }
+    }
+}
+// join (two inputs): writes `first` at the origin of `out`, then `second` shifted by the extent of `first` along `dim`. A `dim` outside 0..3 matches no case and writes nothing.
+template
+void join(Array out, const int dim, const Array first, const Array second)
+{
+    Tx* outPtr = out.get();
+    const Tx* fptr = first.get();
+    const Ty* sptr = second.get();
+
+    af::dim4 zero(0,0,0,0);
+    const af::dim4 odims = out.dims();
+    const af::dim4 fdims = first.dims();
+    const af::dim4 sdims = second.dims();
+    // The four cases differ only in the constant given to calcOffset.
+    switch(dim) {
+        case 0:
+            join_append(outPtr, fptr, zero,
+                        odims, fdims, out.strides(), first.strides());
+            join_append(outPtr, sptr, calcOffset<0>(fdims),
+                        odims, sdims, out.strides(), second.strides());
+            break;
+        case 1:
+            join_append(outPtr, fptr, zero,
+                        odims, fdims, out.strides(), first.strides());
+            join_append(outPtr, sptr, calcOffset<1>(fdims),
+                        odims, sdims, out.strides(), second.strides());
+            break;
+        case 2:
+            join_append(outPtr, fptr, zero,
+                        odims, fdims, out.strides(), first.strides());
+            join_append(outPtr, sptr, calcOffset<2>(fdims),
+                        odims, sdims, out.strides(), second.strides());
+            break;
+        case 3:
+            join_append(outPtr, fptr, zero,
+                        odims, fdims, out.strides(), first.strides());
+            join_append(outPtr, sptr, calcOffset<3>(fdims),
+                        odims, sdims, out.strides(), second.strides());
+            break;
+    }
+}
+// join (list of inputs): inputs[0] goes to the origin; `d` accumulates the dims of all earlier inputs so inputs[i] lands after them along `dim`. NOTE(review): `n_arrays` is not declared in the visible text; presumably a stripped template parameter.
+template
+void join(const int dim, Array out, const std::vector> inputs)
+{
+    af::dim4 zero(0,0,0,0);
+    af::dim4 d = zero;
+    switch(dim) {
+        case 0:
+            join_append(out.get(), inputs[0].get(), zero,
+                        out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
+            for(int i = 1; i < n_arrays; i++) {
+                d += inputs[i - 1].dims();
+                join_append(out.get(), inputs[i].get(), calcOffset<0>(d),
+                            out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
+            }
+            break;
+        case 1:
+            join_append(out.get(), inputs[0].get(), zero,
+                        out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
+            for(int i = 1; i < n_arrays; i++) {
+                d += inputs[i - 1].dims();
+                join_append(out.get(), inputs[i].get(), calcOffset<1>(d),
+                            out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
+            }
+            break;
+        case 2:
+            join_append(out.get(), inputs[0].get(), zero,
+                        out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
+            for(int i = 1; i < n_arrays; i++) {
+                d += inputs[i - 1].dims();
+                join_append(out.get(), inputs[i].get(), calcOffset<2>(d),
+                            out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
+            }
+            break;
+        case 3:
+            join_append(out.get(), inputs[0].get(), zero,
+                        out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides());
+            for(int i = 1; i < n_arrays; i++) {
+                d += inputs[i - 1].dims();
+                join_append(out.get(), inputs[i].get(), calcOffset<3>(d),
+                            out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides());
+            }
+            break;
+    }
+}
+
+}
+}
+
diff --git a/src/backend/cpu/kernel/lookup.hpp b/src/backend/cpu/kernel/lookup.hpp
new file mode 100644
index 0000000000..a290ef2fca
--- /dev/null
+++ b/src/backend/cpu/kernel/lookup.hpp
@@ -0,0 +1,62 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void lookup(Array out, Array const input, + Array const indices, unsigned const dim) +{ + const af::dim4 iDims = input.dims(); + const af::dim4 oDims = out.dims(); + const af::dim4 iStrides = input.strides(); + const af::dim4 oStrides = out.strides(); + const InT *inPtr = input.get(); + const IndexT *idxPtr = indices.get(); + + InT *outPtr = out.get(); + + for (dim_t l=0; l +#include + +namespace cpu +{ +namespace kernel +{ + +template +void lu_split(Array lower, Array upper, const Array in) +{ + T *l = lower.get(); + T *u = upper.get(); + const T *i = in.get(); + + af::dim4 ldm = lower.dims(); + af::dim4 udm = upper.dims(); + af::dim4 idm = in.dims(); + af::dim4 lst = lower.strides(); + af::dim4 ust = upper.strides(); + af::dim4 ist = in.strides(); + + for(dim_t ow = 0; ow < idm[3]; ow++) { + const dim_t lW = ow * lst[3]; + const dim_t uW = ow * ust[3]; + const dim_t iW = ow * ist[3]; + + for(dim_t oz = 0; oz < idm[2]; oz++) { + const dim_t lZW = lW + oz * lst[2]; + const dim_t uZW = uW + oz * ust[2]; + const dim_t iZW = iW + oz * ist[2]; + + for(dim_t oy = 0; oy < idm[1]; oy++) { + const dim_t lYZW = lZW + oy * lst[1]; + const dim_t uYZW = uZW + oy * ust[1]; + const dim_t iYZW = iZW + oy * ist[1]; + + for(dim_t ox = 0; ox < idm[0]; ox++) { + const dim_t lMem = lYZW + ox; + const dim_t uMem = uYZW + ox; + const dim_t iMem = iYZW + ox; + if(ox > oy) { + if(oy < ldm[1]) l[lMem] = i[iMem]; + if(ox < udm[0]) u[uMem] = scalar(0); + } else if (oy > ox) { + if(oy < ldm[1]) l[lMem] = scalar(0); + if(ox < udm[0]) u[uMem] = i[iMem]; + } else if(ox == oy) { + if(oy < ldm[1]) l[lMem] = scalar(1.0); + if(ox < udm[0]) u[uMem] = i[iMem]; + } + } + } + } + } +} + +void convertPivot(Array p, Array pivot) +{ + int 
*d_pi = pivot.get(); + int *d_po = p.get(); + dim_t d0 = pivot.dims()[0]; + for(int j = 0; j < (int)d0; j++) { + // 1 indexed in pivot + std::swap(d_po[j], d_po[d_pi[j] - 1]); + } +} + +} +} diff --git a/src/backend/cpu/kernel/match_template.hpp b/src/backend/cpu/kernel/match_template.hpp new file mode 100644 index 0000000000..ae41364018 --- /dev/null +++ b/src/backend/cpu/kernel/match_template.hpp @@ -0,0 +1,141 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void matchTemplate(Array out, const Array sImg, const Array tImg) +{ + const af::dim4 sDims = sImg.dims(); + const af::dim4 tDims = tImg.dims(); + const af::dim4 sStrides = sImg.strides(); + const af::dim4 tStrides = tImg.strides(); + + const dim_t tDim0 = tDims[0]; + const dim_t tDim1 = tDims[1]; + const dim_t sDim0 = sDims[0]; + const dim_t sDim1 = sDims[1]; + + const af::dim4 oStrides = out.strides(); + + OutT tImgMean = OutT(0); + dim_t winNumElements = tImg.elements(); + bool needMean = MatchT==AF_ZSAD || MatchT==AF_LSAD || + MatchT==AF_ZSSD || MatchT==AF_LSSD || + MatchT==AF_ZNCC; + const InT * tpl = tImg.get(); + + if (needMean) { + for(dim_t tj=0; tj +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void meanShift(Array out, const Array in, const float s_sigma, + const float c_sigma, const unsigned iter) +{ + const af::dim4 dims = in.dims(); + const af::dim4 istrides = in.strides(); + const af::dim4 ostrides = out.strides(); + + const dim_t bCount = (IsColor ? 1 : dims[2]); + const dim_t channels = (IsColor ? 
dims[2] : 1); + + // clamp spatical and chromatic sigma's + float space_ = std::min(11.5f, s_sigma); + const dim_t radius = std::max((int)(space_ * 1.5f), 1); + const float cvar = c_sigma*c_sigma; + + std::vector means(channels); + std::vector centers(channels); + std::vector tmpclrs(channels); + + T *outData = out.get(); + const T * inData = in.get(); + + for(dim_t b3=0; b31 + // i.e for color images where batch is along fourth dimension + centers[ch] = inData[j_in_off + i_in_off + ch*istrides[2]]; + } + + // scope of meanshift iterationd begin + for(unsigned it=0; it +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void medfilt(Array out, const Array in, dim_t w_len, dim_t w_wid) +{ + const af::dim4 dims = in.dims(); + const af::dim4 istrides = in.strides(); + const af::dim4 ostrides = out.strides(); + + std::vector wind_vals; + wind_vals.reserve(w_len*w_wid); + + T const * in_ptr = in.get(); + T * out_ptr = out.get(); + + for(int b3=0; b3<(int)dims[3]; b3++) { + + for(int b2=0; b2<(int)dims[2]; b2++) { + + for(int col=0; col<(int)dims[1]; col++) { + + int ocol_off = col*ostrides[1]; + + for(int row=0; row<(int)dims[0]; row++) { + + wind_vals.clear(); + + for(int wj=0; wj<(int)w_wid; ++wj) { + + bool isColOff = false; + + int im_col = col + wj-w_wid/2; + int im_coff; + switch(Pad) { + case AF_PAD_ZERO: + im_coff = im_col * istrides[1]; + if (im_col < 0 || im_col>=(int)dims[1]) + isColOff = true; + break; + case AF_PAD_SYM: + { + if (im_col < 0) { + im_col *= -1; + isColOff = true; + } + + if (im_col>=(int)dims[1]) { + im_col = 2*((int)dims[1]-1) - im_col; + isColOff = true; + } + + im_coff = im_col * istrides[1]; + } + break; + } + + for(int wi=0; wi<(int)w_len; ++wi) { + + bool isRowOff = false; + + int im_row = row + wi-w_len/2; + int im_roff; + switch(Pad) { + case AF_PAD_ZERO: + im_roff = im_row * istrides[0]; + if (im_row < 0 || im_row>=(int)dims[0]) + isRowOff = true; + break; + case AF_PAD_SYM: + { + if (im_row < 0) { + 
im_row *= -1; + isRowOff = true; + } + + if (im_row>=(int)dims[0]) { + im_row = 2*((int)dims[0]-1) - im_row; + isRowOff = true; + } + + im_roff = im_row * istrides[0]; + } + break; + } + + if(isRowOff || isColOff) { + switch(Pad) { + case AF_PAD_ZERO: + wind_vals.push_back(0); + break; + case AF_PAD_SYM: + wind_vals.push_back(in_ptr[im_coff+im_roff]); + break; + } + } else + wind_vals.push_back(in_ptr[im_coff+im_roff]); + } + } + + std::stable_sort(wind_vals.begin(),wind_vals.end()); + int off = wind_vals.size()/2; + if (wind_vals.size()%2==0) + out_ptr[ocol_off+row*ostrides[0]] = (wind_vals[off]+wind_vals[off-1])/2; + else { + out_ptr[ocol_off+row*ostrides[0]] = wind_vals[off]; + } + } + } + in_ptr += istrides[2]; + out_ptr += ostrides[2]; + } + } +} + + +} +} diff --git a/src/backend/cpu/kernel/morph.hpp b/src/backend/cpu/kernel/morph.hpp new file mode 100644 index 0000000000..af9b7e9373 --- /dev/null +++ b/src/backend/cpu/kernel/morph.hpp @@ -0,0 +1,140 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void morph(Array out, Array const in, Array const mask) +{ + const af::dim4 ostrides = out.strides(); + const af::dim4 istrides = in.strides(); + const af::dim4 fstrides = mask.strides(); + const af::dim4 dims = in.dims(); + const af::dim4 window = mask.dims(); + T* outData = out.get(); + const T* inData = in.get(); + const T* filter = mask.get(); + const dim_t R0 = window[0]/2; + const dim_t R1 = window[1]/2; + + for(dim_t b3=0; b3 (T)0) && offi>=0 && offj>=0 && offi +void morph3d(Array out, Array const in, Array const mask) +{ + const af::dim4 dims = in.dims(); + const af::dim4 window = mask.dims(); + const dim_t R0 = window[0]/2; + const dim_t R1 = window[1]/2; + const dim_t R2 = window[2]/2; + const af::dim4 istrides = in.strides(); + const af::dim4 fstrides = mask.strides(); + const dim_t bCount = dims[3]; + const af::dim4 ostrides = out.strides(); + T* outData = out.get(); + const T* inData = in.get(); + const T* filter = mask.get(); + + for(dim_t batchId=0; batchId (T)0) && offi>=0 && offj>=0 && offk>=0 && + offi +#include + +namespace cpu +{ +namespace kernel +{ + +#if defined(_WIN32) || defined(_MSC_VER) + +#include +#define __builtin_popcount __popcnt + +#endif + +template +struct dist_op +{ + To operator()(T v1, T v2) + { + return v1 - v2; // Garbage distance + } +}; + +template +struct dist_op +{ + To operator()(T v1, T v2) + { + return std::abs((double)v1 - (double)v2); + } +}; + +template +struct dist_op +{ + To operator()(T v1, T v2) + { + return (v1 - v2) * (v1 - v2); + } +}; + +template +struct dist_op +{ + To operator()(uint v1, uint v2) + { + return __builtin_popcount(v1 ^ v2); + } +}; + +template +struct dist_op +{ + To operator()(uintl v1, uintl v2) + { + return __builtin_popcount(v1 ^ 
v2); + } +}; + +template +struct dist_op +{ + To operator()(uchar v1, uchar v2) + { + return __builtin_popcount(v1 ^ v2); + } +}; + +template +struct dist_op +{ + To operator()(ushort v1, ushort v2) + { + return __builtin_popcount(v1 ^ v2); + } +}; + +template +void nearest_neighbour(Array idx, Array dist, + const Array query, const Array train, + const uint dist_dim, const uint n_dist) +{ + uint sample_dim = (dist_dim == 0) ? 1 : 0; + const dim4 qDims = query.dims(); + const dim4 tDims = train.dims(); + + const unsigned distLength = qDims[dist_dim]; + const unsigned nQuery = qDims[sample_dim]; + const unsigned nTrain = tDims[sample_dim]; + + const T* qPtr = query.get(); + const T* tPtr = train.get(); + uint* iPtr = idx.get(); + To* dPtr = dist.get(); + + dist_op op; + + for (unsigned i = 0; i < nQuery; i++) { + To best_dist = limit_max(); + unsigned best_idx = 0; + + for (unsigned j = 0; j < nTrain; j++) { + To local_dist = 0; + for (unsigned k = 0; k < distLength; k++) { + size_t qIdx, tIdx; + if (sample_dim == 0) { + qIdx = k * qDims[0] + i; + tIdx = k * tDims[0] + j; + } + else { + qIdx = i * qDims[0] + k; + tIdx = j * tDims[0] + k; + } + + local_dist += op(qPtr[qIdx], tPtr[tIdx]); + } + + if (local_dist < best_dist) { + best_dist = local_dist; + best_idx = j; + } + } + + size_t oIdx; + oIdx = i; + iPtr[oIdx] = best_idx; + dPtr[oIdx] = best_dist; + } +} + +} +} diff --git a/src/backend/cpu/kernel/orb.hpp b/src/backend/cpu/kernel/orb.hpp new file mode 100644 index 0000000000..acd508cb70 --- /dev/null +++ b/src/backend/cpu/kernel/orb.hpp @@ -0,0 +1,509 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +// Reference pattern, generated for a patch size of 31x31, as suggested by +// original ORB paper +#define REF_PAT_SIZE 31 +#define REF_PAT_SAMPLES 256 +#define REF_PAT_COORDS 4 +#define REF_PAT_LENGTH (REF_PAT_SAMPLES*REF_PAT_COORDS) + +// Current reference pattern was borrowed from OpenCV, to build a pattern with +// similar quality, a training process must be applied, as described in +// sections 4.2 and 4.3 of the original ORB paper. +const int ref_pat[REF_PAT_LENGTH] = { + 8,-3, 9,5, + 4,2, 7,-12, + -11,9, -8,2, + 7,-12, 12,-13, + 2,-13, 2,12, + 1,-7, 1,6, + -2,-10, -2,-4, + -13,-13, -11,-8, + -13,-3, -12,-9, + 10,4, 11,9, + -13,-8, -8,-9, + -11,7, -9,12, + 7,7, 12,6, + -4,-5, -3,0, + -13,2, -12,-3, + -9,0, -7,5, + 12,-6, 12,-1, + -3,6, -2,12, + -6,-13, -4,-8, + 11,-13, 12,-8, + 4,7, 5,1, + 5,-3, 10,-3, + 3,-7, 6,12, + -8,-7, -6,-2, + -2,11, -1,-10, + -13,12, -8,10, + -7,3, -5,-3, + -4,2, -3,7, + -10,-12, -6,11, + 5,-12, 6,-7, + 5,-6, 7,-1, + 1,0, 4,-5, + 9,11, 11,-13, + 4,7, 4,12, + 2,-1, 4,4, + -4,-12, -2,7, + -8,-5, -7,-10, + 4,11, 9,12, + 0,-8, 1,-13, + -13,-2, -8,2, + -3,-2, -2,3, + -6,9, -4,-9, + 8,12, 10,7, + 0,9, 1,3, + 7,-5, 11,-10, + -13,-6, -11,0, + 10,7, 12,1, + -6,-3, -6,12, + 10,-9, 12,-4, + -13,8, -8,-12, + -13,0, -8,-4, + 3,3, 7,8, + 5,7, 10,-7, + -1,7, 1,-12, + 3,-10, 5,6, + 2,-4, 3,-10, + -13,0, -13,5, + -13,-7, -12,12, + -13,3, -11,8, + -7,12, -4,7, + 6,-10, 12,8, + -9,-1, -7,-6, + -2,-5, 0,12, + -12,5, -7,5, + 3,-10, 8,-13, + -7,-7, -4,5, + -3,-2, -1,-7, + 2,9, 5,-11, + -11,-13, -5,-13, + -1,6, 0,-1, + 5,-3, 5,2, + -4,-13, -4,12, + -9,-6, -9,6, + -12,-10, -8,-4, + 10,2, 12,-3, + 7,12, 12,12, + -7,-13, -6,5, + -4,9, -3,4, + 7,-1, 12,2, + -7,6, -5,1, + -13,11, -12,5, + -3,7, -2,-6, + 7,-8, 
12,-7, + -13,-7, -11,-12, + 1,-3, 12,12, + 2,-6, 3,0, + -4,3, -2,-13, + -1,-13, 1,9, + 7,1, 8,-6, + 1,-1, 3,12, + 9,1, 12,6, + -1,-9, -1,3, + -13,-13, -10,5, + 7,7, 10,12, + 12,-5, 12,9, + 6,3, 7,11, + 5,-13, 6,10, + 2,-12, 2,3, + 3,8, 4,-6, + 2,6, 12,-13, + 9,-12, 10,3, + -8,4, -7,9, + -11,12, -4,-6, + 1,12, 2,-8, + 6,-9, 7,-4, + 2,3, 3,-2, + 6,3, 11,0, + 3,-3, 8,-8, + 7,8, 9,3, + -11,-5, -6,-4, + -10,11, -5,10, + -5,-8, -3,12, + -10,5, -9,0, + 8,-1, 12,-6, + 4,-6, 6,-11, + -10,12, -8,7, + 4,-2, 6,7, + -2,0, -2,12, + -5,-8, -5,2, + 7,-6, 10,12, + -9,-13, -8,-8, + -5,-13, -5,-2, + 8,-8, 9,-13, + -9,-11, -9,0, + 1,-8, 1,-2, + 7,-4, 9,1, + -2,1, -1,-4, + 11,-6, 12,-11, + -12,-9, -6,4, + 3,7, 7,12, + 5,5, 10,8, + 0,-4, 2,8, + -9,12, -5,-13, + 0,7, 2,12, + -1,2, 1,7, + 5,11, 7,-9, + 3,5, 6,-8, + -13,-4, -8,9, + -5,9, -3,-3, + -4,-7, -3,-12, + 6,5, 8,0, + -7,6, -6,12, + -13,6, -5,-2, + 1,-10, 3,10, + 4,1, 8,-4, + -2,-2, 2,-13, + 2,-12, 12,12, + -2,-13, 0,-6, + 4,1, 9,3, + -6,-10, -3,-5, + -3,-13, -1,1, + 7,5, 12,-11, + 4,-2, 5,-7, + -13,9, -9,-5, + 7,1, 8,6, + 7,-8, 7,6, + -7,-4, -7,1, + -8,11, -7,-8, + -13,6, -12,-8, + 2,4, 3,9, + 10,-5, 12,3, + -6,-5, -6,7, + 8,-3, 9,-8, + 2,-12, 2,8, + -11,-2, -10,3, + -12,-13, -7,-9, + -11,0, -10,-5, + 5,-3, 11,8, + -2,-13, -1,12, + -1,-8, 0,9, + -13,-11, -12,-5, + -10,-2, -10,11, + -3,9, -2,-13, + 2,-3, 3,2, + -9,-13, -4,0, + -4,6, -3,-10, + -4,12, -2,-7, + -6,-11, -4,9, + 6,-3, 6,11, + -13,11, -5,5, + 11,11, 12,6, + 7,-5, 12,-2, + -1,12, 0,7, + -4,-8, -3,-2, + -7,1, -6,7, + -13,-12, -8,-13, + -7,-2, -6,-8, + -8,5, -6,-9, + -5,-1, -4,5, + -13,7, -8,10, + 1,5, 5,-13, + 1,0, 10,-13, + 9,12, 10,-1, + 5,-8, 10,-9, + -1,11, 1,-13, + -9,-3, -6,2, + -1,-10, 1,12, + -13,1, -8,-10, + 8,-11, 10,-6, + 2,-13, 3,-6, + 7,-13, 12,-9, + -10,-10, -5,-7, + -10,-8, -8,-13, + 4,-6, 8,5, + 3,12, 8,-13, + -4,2, -3,-3, + 5,-13, 10,-12, + 4,-13, 5,-1, + -9,9, -4,3, + 0,3, 3,-9, + -12,1, -6,1, + 3,2, 4,-8, + -10,-10, -10,9, + 8,-13, 12,12, + -8,-12, -6,-5, 
+ 2,2, 3,7, + 10,6, 11,-8, + 6,8, 8,-12, + -7,10, -6,5, + -3,-9, -3,9, + -1,-13, -1,5, + -3,-7, -3,4, + -8,-2, -8,3, + 4,2, 12,12, + 2,-5, 3,11, + 6,-9, 11,-13, + 3,-1, 7,12, + 11,-1, 12,4, + -3,0, -3,6, + 4,-11, 4,12, + 2,-4, 2,1, + -10,-6, -8,1, + -13,7, -11,1, + -13,12, -11,-13, + 6,0, 11,-13, + 0,-1, 1,4, + -13,3, -9,-2, + -9,8, -6,-3, + -13,-6, -8,-2, + 5,-9, 8,10, + 2,7, 3,-9, + -1,-6, -1,-1, + 9,5, 11,-2, + 11,-3, 12,-8, + 3,0, 3,5, + -1,4, 0,10, + 3,-6, 4,5, + -13,0, -10,5, + 5,8, 12,11, + 8,9, 9,-6, + 7,-4, 8,-12, + -10,4, -10,9, + 7,3, 12,4, + 9,-7, 10,-2, + 7,0, 12,-2, + -1,-6, 0,-11, +}; + +template +void keep_features( + float* x_out, + float* y_out, + float* score_out, + float* size_out, + const float* x_in, + const float* y_in, + const float* score_in, + const unsigned* score_idx, + const float* size_in, + const unsigned n_feat) +{ + // Keep only the first n_feat features + for (unsigned f = 0; f < n_feat; f++) { + x_out[f] = x_in[score_idx[f]]; + y_out[f] = y_in[score_idx[f]]; + score_out[f] = score_in[f]; + if (size_in != nullptr && size_out != nullptr) + size_out[f] = size_in[score_idx[f]]; + } +} + +template +void harris_response( + float* x_out, + float* y_out, + float* score_out, + float* size_out, + const float* x_in, + const float* y_in, + const float* scl_in, + const unsigned total_feat, + unsigned* usable_feat, + const Array& image, + const unsigned block_size, + const float k_thr, + const unsigned patch_size) +{ + const af::dim4 idims = image.dims(); + const T* image_ptr = image.get(); + for (unsigned f = 0; f < total_feat; f++) { + unsigned x, y; + float scl = 1.f; + if (use_scl) { + // Update x and y coordinates according to scale + scl = scl_in[f]; + x = (unsigned)round(x_in[f] * scl); + y = (unsigned)round(y_in[f] * scl); + } + else { + x = (unsigned)round(x_in[f]); + y = (unsigned)round(y_in[f]); + } + + // Round feature size to nearest odd integer + float size = 2.f * floor((patch_size * scl) / 2.f) + 1.f; + + // Avoid keeping 
features that might be too wide and might not fit on + // the image, sqrt(2.f) is the radius when angle is 45 degrees and + // represents widest case possible + unsigned patch_r = ceil(size * sqrt(2.f) / 2.f); + if (x < patch_r || y < patch_r || x >= idims[1] - patch_r || y >= idims[0] - patch_r) + continue; + + unsigned r = block_size / 2; + + float ixx = 0.f, iyy = 0.f, ixy = 0.f; + unsigned block_size_sq = block_size * block_size; + for (unsigned k = 0; k < block_size_sq; k++) { + int i = k / block_size - r; + int j = k % block_size - r; + + // Calculate local x and y derivatives + float ix = image_ptr[(x+i+1) * idims[0] + y+j] - image_ptr[(x+i-1) * idims[0] + y+j]; + float iy = image_ptr[(x+i) * idims[0] + y+j+1] - image_ptr[(x+i) * idims[0] + y+j-1]; + + // Accumulate second order derivatives + ixx += ix*ix; + iyy += iy*iy; + ixy += ix*iy; + } + + unsigned idx = *usable_feat; + *usable_feat += 1; + float tr = ixx + iyy; + float det = ixx*iyy - ixy*ixy; + + // Calculate Harris responses + float resp = det - k_thr * (tr*tr); + + // Scale factor + // TODO: improve response scaling + float rscale = 0.001f; + rscale = rscale * rscale * rscale * rscale; + + x_out[idx] = x; + y_out[idx] = y; + score_out[idx] = resp * rscale; + if (use_scl) + size_out[idx] = size; + } +} + +template +void centroid_angle( + const float* x_in, + const float* y_in, + float* orientation_out, + const unsigned total_feat, + const Array& image, + const unsigned patch_size) +{ + const af::dim4 idims = image.dims(); + const T* image_ptr = image.get(); + for (unsigned f = 0; f < total_feat; f++) { + unsigned x = (unsigned)round(x_in[f]); + unsigned y = (unsigned)round(y_in[f]); + + unsigned r = patch_size / 2; + if (x < r || y < r || x > idims[1] - r || y > idims[0] - r) + continue; + + T m01 = (T)0, m10 = (T)0; + unsigned patch_size_sq = patch_size * patch_size; + for (unsigned k = 0; k < patch_size_sq; k++) { + int i = k / patch_size - r; + int j = k % patch_size - r; + + // Calculate first 
order moments + T p = image_ptr[(x+i) * idims[0] + y+j]; + m01 += j * p; + m10 += i * p; + } + + float angle = atan2(m01, m10); + orientation_out[f] = angle; + } +} + +template +inline T get_pixel( + unsigned x, + unsigned y, + const float ori, + const unsigned size, + const int dist_x, + const int dist_y, + const Array& image, + const unsigned patch_size) +{ + const af::dim4 idims = image.dims(); + const T* image_ptr = image.get(); + float ori_sin = sin(ori); + float ori_cos = cos(ori); + float patch_scl = (float)size / (float)patch_size; + + // Calculate point coordinates based on orientation and size + x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin); + y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos); + + return image_ptr[x * idims[0] + y]; +} + +template +void extract_orb( + unsigned* desc_out, + const unsigned n_feat, + float* x_in_out, + float* y_in_out, + const float* ori_in, + float* size_out, + const Array& image, + const float scl, + const unsigned patch_size) +{ + const af::dim4 idims = image.dims(); + for (unsigned f = 0; f < n_feat; f++) { + unsigned x = (unsigned)round(x_in_out[f]); + unsigned y = (unsigned)round(y_in_out[f]); + float ori = ori_in[f]; + unsigned size = patch_size; + + unsigned r = ceil(patch_size * sqrt(2.f) / 2.f); + if (x < r || y < r || x >= idims[1] - r || y >= idims[0] - r) + continue; + + // Descriptor fixed at 256 bits for now + // Storing descriptor as a vector of 8 x 32-bit unsigned numbers + for (unsigned i = 0; i < 8; i++) { + unsigned v = 0; + + // j < 32 for 256 bits descriptor + for (unsigned j = 0; j < 32; j++) { + // Get position from distribution pattern and values of points p1 and p2 + int dist_x = ref_pat[i*32*4 + j*4]; + int dist_y = ref_pat[i*32*4 + j*4+1]; + T p1 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size); + + dist_x = ref_pat[i*32*4 + j*4+2]; + dist_y = ref_pat[i*32*4 + j*4+3]; + T p2 = get_pixel(x, y, ori, size, dist_x, dist_y, image, 
patch_size); + + // Calculate bit based on p1 and p2 and shifts it to correct position + v |= (p1 < p2) << j; + } + + // Store 32 bits of descriptor + desc_out[f * 8 + i] += v; + } + + x_in_out[f] = round(x * scl); + y_in_out[f] = round(y * scl); + size_out[f] = patch_size * scl; + } +} + + + +} +} diff --git a/src/backend/cpu/kernel/random.hpp b/src/backend/cpu/kernel/random.hpp new file mode 100644 index 0000000000..9c59a64db9 --- /dev/null +++ b/src/backend/cpu/kernel/random.hpp @@ -0,0 +1,200 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +using namespace std; + +#if defined(_WIN32) + #define __THREAD_LOCAL static __declspec(thread) +#else + #define __THREAD_LOCAL static __thread +#endif + +template +using is_arithmetic_t = typename enable_if< is_arithmetic::value, function>::type; +template +using is_complex_t = typename enable_if< is_complex::value, function>::type; +template +using is_floating_point_t = typename enable_if< is_floating_point::value, function>::type; + +template +is_arithmetic_t +urand(GenType &generator) +{ + typedef typename conditional< is_floating_point::value, + uniform_real_distribution, +#if OS_WIN + uniform_int_distribution>::type dist; +#else + uniform_int_distribution> ::type dist; +#endif + return bind(dist(), generator); +} + +template +is_complex_t +urand(GenType &generator) +{ + auto func = urand(generator); + return [func] () { return T(func(), func());}; +} + +template +is_floating_point_t +nrand(GenType &generator) +{ + return bind(normal_distribution(), generator); +} + +template +is_complex_t 
+nrand(GenType &generator) +{ + auto func = nrand(generator); + return [func] () { return T(func(), func());}; +} + +mt19937& getGenerator() +{ + // FIXME: This abomination of a work around is brought to you + // by incomplete standards from Xcode and Visual Studio + // Should ideally be using thread_local on object instead of pointer + __THREAD_LOCAL mt19937 *generator = NULL; + if (generator == NULL) generator = new mt19937(); + return *generator; +} + +unsigned long long& getSeed() +{ + __THREAD_LOCAL unsigned long long gen_seed = 0; + return gen_seed; +} + +void getSeedPtr(unsigned long long *seed) +{ + *seed = getSeed(); +} + +bool& isFirst() +{ + __THREAD_LOCAL bool is_first = true; + return is_first; +} + +void setSeed(const uintl seed) +{ + getGenerator().seed(seed); + getSeed() = seed; + isFirst() = false; +} + +//FIXME: See if we can use functors instead of function pointer directly +template +struct RandomDistribution +{ + std::function func; + RandomDistribution(std::function dist_func) : func(dist_func) + { + } +}; + +template +void randn(Array out) +{ + __THREAD_LOCAL unsigned long long my_seed = 0; + if (isFirst()) { + my_seed = getSeed(); + setSeed(my_seed); + } + + // FIXME: This abomination of a work around is brought to you + // by incomplete standards from Xcode and Visual Studio + // Should ideally be using thread_local on object instead of pointer + __THREAD_LOCAL RandomDistribution *distPtr = NULL; + + if (!distPtr || my_seed != getSeed()) { + if (distPtr) delete distPtr; + distPtr = new RandomDistribution(nrand(getGenerator())); + my_seed = getSeed(); + } + + T *outPtr = out.get(); + for (int i = 0; i < (int)out.elements(); i++) { + outPtr[i] = distPtr->func(); + } +} + +template +void randu(Array out) +{ + __THREAD_LOCAL unsigned long long my_seed = 0; + if (isFirst()) { + my_seed = getSeed(); + setSeed(my_seed); + } + + // FIXME: This abomination of a work around is brought to you + // by incomplete standards from Xcode and Visual Studio + 
// Should ideally be using thread_local on object instead of pointer + __THREAD_LOCAL RandomDistribution *distPtr = NULL; + + if (!distPtr || my_seed != getSeed()) { + if (distPtr) delete distPtr; + distPtr = new RandomDistribution(urand(getGenerator())); + my_seed = getSeed(); + } + + T *outPtr = out.get(); + for (int i = 0; i < (int)out.elements(); i++) { + outPtr[i] = distPtr->func(); + } +} + +template<> +void randu(Array out) +{ + __THREAD_LOCAL unsigned long long my_seed = 0; + if (isFirst()) { + my_seed = getSeed(); + setSeed(my_seed); + } + + // FIXME: This abomination of a work around is brought to you + // by incomplete standards from Xcode and Visual Studio + // Should ideally be using thread_local on object instead of pointer + __THREAD_LOCAL RandomDistribution *distPtr = NULL; + + if (!distPtr || my_seed != getSeed()) { + if (distPtr) delete distPtr; + distPtr = new RandomDistribution(nrand(getGenerator())); + my_seed = getSeed(); + } + + char *outPtr = out.get(); + for (int i = 0; i < (int)out.elements(); i++) { + outPtr[i] = distPtr->func() > 0.5; + } +} + +} +} diff --git a/src/backend/cpu/kernel/range.hpp b/src/backend/cpu/kernel/range.hpp new file mode 100644 index 0000000000..b244a19c85 --- /dev/null +++ b/src/backend/cpu/kernel/range.hpp @@ -0,0 +1,52 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void range(Array output) +{ + T* out = output.get(); + + const dim4 dims = output.dims(); + const dim4 strides = output.strides(); + + for(dim_t w = 0; w < dims[3]; w++) { + dim_t offW = w * strides[3]; + for(dim_t z = 0; z < dims[2]; z++) { + dim_t offWZ = offW + z * strides[2]; + for(dim_t y = 0; y < dims[1]; y++) { + dim_t offWZY = offWZ + y * strides[1]; + for(dim_t x = 0; x < dims[0]; x++) { + dim_t id = offWZY + x; + if(dim == 0) { + out[id] = x; + } else if(dim == 1) { + out[id] = y; + } else if(dim == 2) { + out[id] = z; + } else if(dim == 3) { + out[id] = w; + } + } + } + } + } +} + +} +} + diff --git a/src/backend/cpu/kernel/reduce.hpp b/src/backend/cpu/kernel/reduce.hpp new file mode 100644 index 0000000000..85119dcee7 --- /dev/null +++ b/src/backend/cpu/kernel/reduce.hpp @@ -0,0 +1,71 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include
+#include
+
+namespace cpu
+{
+namespace kernel
+{
+
+// NOTE(review): the template parameter/argument lists and the #include
+// targets are missing from this text (stripped upstream). `D`, `Ti`, `To`,
+// `Transform` and `Binary` are used below without a visible declaration;
+// presumably they come from those lists. Confirm against the real header.
+
+// Recursive step: walks dimension D - 1 of the output and, for each index,
+// forwards to the next-level functor with the output and input offsets
+// advanced by that dimension's strides.
+template
+struct reduce_dim
+{
+    void operator()(Array out, const dim_t outOffset,
+                    const Array in, const dim_t inOffset,
+                    const int dim, bool change_nan, double nanval)
+    {
+        static const int D1 = D - 1;
+        // Function-local static instance of the next-level functor.
+        static reduce_dim reduce_dim_next;
+
+        const af::dim4 ostrides = out.strides();
+        const af::dim4 istrides = in.strides();
+        const af::dim4 odims = out.dims();
+
+        for (dim_t i = 0; i < odims[D1]; i++) {
+            reduce_dim_next(out, outOffset + i * ostrides[D1],
+                            in, inOffset + i * istrides[D1],
+                            dim, change_nan, nanval);
+        }
+    }
+};
+
+// Terminal step (presumably the specialisation that ends the recursion; its
+// argument list is not visible): folds every element along `dim`, starting at
+// `inOffset`, into the single output element at `outOffset`.
+template
+struct reduce_dim
+{
+
+    Transform transform;
+    Binary reduce;
+    void operator()(Array out, const dim_t outOffset,
+                    const Array in, const dim_t inOffset,
+                    const int dim, bool change_nan, double nanval)
+    {
+        const af::dim4 istrides = in.strides();
+        const af::dim4 idims = in.dims();
+
+        To * const outPtr = out.get() + outOffset;
+        Ti const * const inPtr = in.get() + inOffset;
+        dim_t stride = istrides[dim];
+
+        // Start from the reduction's initial value and fold in each element.
+        To out_val = reduce.init();
+        for (dim_t i = 0; i < idims[dim]; i++) {
+            To in_val = transform(inPtr[i * stride]);
+            // When requested, NaN inputs are replaced by `nanval` before folding.
+            if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
+            out_val = reduce(in_val, out_val);
+        }
+
+        *outPtr = out_val;
+    }
+};
+
+
+}
+}
diff --git a/src/backend/cpu/kernel/regions.hpp b/src/backend/cpu/kernel/regions.hpp
new file mode 100644
index 0000000000..863ebc5f48
--- /dev/null
+++ b/src/backend/cpu/kernel/regions.hpp
@@ -0,0 +1,194 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +class LabelNode +{ +private: + T label; + T minLabel; + unsigned rank; + LabelNode* parent; + +public: + LabelNode() : label(0), minLabel(0), rank(0), parent(this) { } + LabelNode(T label) : label(label), minLabel(label), rank(0), parent(this) { } + + T getLabel() + { + return label; + } + + T getMinLabel() + { + return minLabel; + } + + LabelNode* getParent() + { + return parent; + } + + unsigned getRank() + { + return rank; + } + + void setMinLabel(T l) + { + minLabel = l; + } + + void setParent(LabelNode* p) + { + parent = p; + } + + void setRank(unsigned r) + { + rank = r; + } +}; + +template +static LabelNode* find(LabelNode* x) +{ + if (x->getParent() != x) + x->setParent(find(x->getParent())); + return x->getParent(); +} + +template +static void setUnion(LabelNode* x, LabelNode* y) +{ + LabelNode* xRoot = find(x); + LabelNode* yRoot = find(y); + if (xRoot == yRoot) + return; + + T xMinLabel = xRoot->getMinLabel(); + T yMinLabel = yRoot->getMinLabel(); + xRoot->setMinLabel(min(xMinLabel, yMinLabel)); + yRoot->setMinLabel(min(xMinLabel, yMinLabel)); + + if (xRoot->getRank() < yRoot->getRank()) + xRoot->setParent(yRoot); + else if (xRoot->getRank() > yRoot->getRank()) + yRoot->setParent(xRoot); + else { + yRoot->setParent(xRoot); + xRoot->setRank(xRoot->getRank() + 1); + } +} + +template +void regions(Array out, const Array in, af_connectivity connectivity) +{ + const af::dim4 in_dims = in.dims(); + const char *in_ptr = in.get(); + T *out_ptr = out.get(); + + // Map labels + typedef typename std::map* > label_map_t; + typedef typename label_map_t::iterator label_map_iterator_t; + + label_map_t lmap; + + // Initial label + T label = (T)1; + + for (int j = 0; j < (int)in_dims[1]; j++) { + for (int i = 0; i < 
(int)in_dims[0]; i++) { + int idx = j * in_dims[0] + i; + if (in_ptr[idx] != 0) { + std::vector l; + + // Test neighbors + if (i > 0 && out_ptr[j * (int)in_dims[0] + i-1] > 0) + l.push_back(out_ptr[j * in_dims[0] + i-1]); + if (j > 0 && out_ptr[(j-1) * (int)in_dims[0] + i] > 0) + l.push_back(out_ptr[(j-1) * in_dims[0] + i]); + if (connectivity == AF_CONNECTIVITY_8 && i > 0 && + j > 0 && out_ptr[(j-1) * in_dims[0] + i-1] > 0) + l.push_back(out_ptr[(j-1) * in_dims[0] + i-1]); + if (connectivity == AF_CONNECTIVITY_8 && + i < (int)in_dims[0] - 1 && j > 0 && out_ptr[(j-1) * in_dims[0] + i+1] != 0) + l.push_back(out_ptr[(j-1) * in_dims[0] + i+1]); + + if (!l.empty()) { + T minl = l[0]; + for (size_t k = 0; k < l.size(); k++) { + minl = min(l[k], minl); + label_map_iterator_t cur_map = lmap.find(l[k]); + LabelNode *node = cur_map->second; + // Group labels of the same region under a disjoint set + for (size_t m = k+1; m < l.size(); m++) + setUnion(node, lmap.find(l[m])->second); + } + // Set label to smallest neighbor label + out_ptr[idx] = minl; + } + else { + // Insert new label in map + LabelNode *node = new LabelNode(label); + lmap.insert(std::pair* >(label, node)); + out_ptr[idx] = label++; + } + } + } + } + + std::set removed; + + for (int j = 0; j < (int)in_dims[1]; j++) { + for (int i = 0; i < (int)in_dims[0]; i++) { + int idx = j * (int)in_dims[0] + i; + if (in_ptr[idx] != 0) { + T l = out_ptr[idx]; + label_map_iterator_t cur_map = lmap.find(l); + + if (cur_map != lmap.end()) { + LabelNode* node = cur_map->second; + + LabelNode* node_root = find(node); + out_ptr[idx] = node_root->getMinLabel(); + + // Mark removed labels (those that are part of a region + // that contains a smaller label) + if (node->getMinLabel() < l || node_root->getMinLabel() < l) + removed.insert(l); + if (node->getLabel() > node->getMinLabel()) + removed.insert(node->getLabel()); + } + } + } + } + + // Calculate final neighbors (ensure final labels are sequential) + for (int j = 0; j < 
(int)in_dims[1]; j++) { + for (int i = 0; i < (int)in_dims[0]; i++) { + int idx = j * (int)in_dims[0] + i; + if (out_ptr[idx] > 0) { + out_ptr[idx] -= distance(removed.begin(), removed.lower_bound(out_ptr[idx])); + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/reorder.hpp b/src/backend/cpu/kernel/reorder.hpp new file mode 100644 index 0000000000..c10c96ef36 --- /dev/null +++ b/src/backend/cpu/kernel/reorder.hpp @@ -0,0 +1,55 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void reorder(Array out, const Array in, const af::dim4 oDims, const af::dim4 rdims) +{ + T* outPtr = out.get(); + const T* inPtr = in.get(); + + const af::dim4 ist = in.strides(); + const af::dim4 ost = out.strides(); + + + dim_t ids[4] = {0}; + for(dim_t ow = 0; ow < oDims[3]; ow++) { + const dim_t oW = ow * ost[3]; + ids[rdims[3]] = ow; + for(dim_t oz = 0; oz < oDims[2]; oz++) { + const dim_t oZW = oW + oz * ost[2]; + ids[rdims[2]] = oz; + for(dim_t oy = 0; oy < oDims[1]; oy++) { + const dim_t oYZW = oZW + oy * ost[1]; + ids[rdims[1]] = oy; + for(dim_t ox = 0; ox < oDims[0]; ox++) { + const dim_t oIdx = oYZW + ox; + + ids[rdims[0]] = ox; + const dim_t iIdx = ids[3] * ist[3] + ids[2] * ist[2] + + ids[1] * ist[1] + ids[0]; + + outPtr[oIdx] = inPtr[iIdx]; + } + } + } + } +} + +} +} + diff --git a/src/backend/cpu/kernel/resize.hpp b/src/backend/cpu/kernel/resize.hpp new file mode 100644 index 0000000000..19d7ec7cf1 --- /dev/null +++ b/src/backend/cpu/kernel/resize.hpp @@ -0,0 +1,177 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. 
+ * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +/** + * noop function for round to avoid compilation + * issues due to lack of this function in C90 based + * compilers, it is only present in C99 and C++11 + * + * This is not a full fledged implementation, this function + * is to be used only for positive numbers, i m using it here + * for calculating dimensions of arrays + */ +dim_t round2int(float value) +{ + return (dim_t)(value+0.5f); +} + +using std::conditional; +using std::is_same; + +template +using wtype_t = typename conditional::value, double, float>::type; + +template +using vtype_t = typename conditional::value, + T, wtype_t + >::type; + +template +struct resize_op +{ + void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const dim_t x, const dim_t y) + { + return; + } +}; + +template +struct resize_op +{ + void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const dim_t x, const dim_t y) + { + // Compute Indices + dim_t i_x = round2int((float)x / (odims[0] / (float)idims[0])); + dim_t i_y = round2int((float)y / (odims[1] / (float)idims[1])); + + if (i_x >= idims[0]) i_x = idims[0] - 1; + if (i_y >= idims[1]) i_y = idims[1] - 1; + + dim_t i_off = i_y * istrides[1] + i_x; + dim_t o_off = y * ostrides[1] + x; + // Copy values from all channels + for(dim_t w = 0; w < odims[3]; w++) { + dim_t wost = w * ostrides[3]; + dim_t wist = w * istrides[3]; + for(dim_t z = 0; z < odims[2]; z++) { + outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist]; + } + } + } +}; + +template +struct resize_op +{ + void 
operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const dim_t x, const dim_t y) + { + // Compute Indices + float f_x = (float)x / (odims[0] / (float)idims[0]); + float f_y = (float)y / (odims[1] / (float)idims[1]); + + dim_t i1_x = floor(f_x); + dim_t i1_y = floor(f_y); + + if (i1_x >= idims[0]) i1_x = idims[0] - 1; + if (i1_y >= idims[1]) i1_y = idims[1] - 1; + + float b = f_x - i1_x; + float a = f_y - i1_y; + + dim_t i2_x = (i1_x + 1 >= idims[0] ? idims[0] - 1 : i1_x + 1); + dim_t i2_y = (i1_y + 1 >= idims[1] ? idims[1] - 1 : i1_y + 1); + + typedef typename dtype_traits::base_type BT; + typedef wtype_t WT; + typedef vtype_t VT; + + dim_t o_off = y * ostrides[1] + x; + // Copy values from all channels + for(dim_t w = 0; w < odims[3]; w++) { + dim_t wst = w * istrides[3]; + for(dim_t z = 0; z < odims[2]; z++) { + dim_t zst = z * istrides[2]; + dim_t channel_off = zst + wst; + VT p1 = inPtr[i1_y * istrides[1] + i1_x + channel_off]; + VT p2 = inPtr[i2_y * istrides[1] + i1_x + channel_off]; + VT p3 = inPtr[i1_y * istrides[1] + i2_x + channel_off]; + VT p4 = inPtr[i2_y * istrides[1] + i2_x + channel_off]; + + outPtr[o_off + z * ostrides[2] + w * ostrides[3]] = + scalar((1.0f - a) * (1.0f - b)) * p1 + + scalar(( a ) * (1.0f - b)) * p2 + + scalar((1.0f - a) * ( b )) * p3 + + scalar(( a ) * ( b )) * p4; + } + } + } +}; + +template +struct resize_op +{ + void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const dim_t x, const dim_t y) + { + // Compute Indices + dim_t i_x = floor((float)x / (odims[0] / (float)idims[0])); + dim_t i_y = floor((float)y / (odims[1] / (float)idims[1])); + + if (i_x >= idims[0]) i_x = idims[0] - 1; + if (i_y >= idims[1]) i_y = idims[1] - 1; + + dim_t i_off = i_y * istrides[1] + i_x; + dim_t o_off = y * ostrides[1] + x; + // Copy values from all channels + for(dim_t w 
= 0; w < odims[3]; w++) { + dim_t wost = w * ostrides[3]; + dim_t wist = w * istrides[3]; + for(dim_t z = 0; z < odims[2]; z++) { + outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist]; + } + } + } +}; + +template +void resize(Array out, const Array in) +{ + af::dim4 idims = in.dims(); + af::dim4 odims = out.dims(); + const T *inPtr = in.get(); + T *outPtr = out.get(); + af::dim4 ostrides = out.strides(); + af::dim4 istrides = in.strides(); + + resize_op op; + for(dim_t y = 0; y < odims[1]; y++) { + for(dim_t x = 0; x < odims[0]; x++) { + op(outPtr, inPtr, odims, idims, ostrides, istrides, x, y); + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/rotate.hpp b/src/backend/cpu/kernel/rotate.hpp new file mode 100644 index 0000000000..395ea3f303 --- /dev/null +++ b/src/backend/cpu/kernel/rotate.hpp @@ -0,0 +1,84 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include
+#include
+#include
+#include
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Rotates `input` by `theta` into `output` by building a 2x3 affine matrix
+// and calling a per-pixel transform routine for every output position.
+//
+// NOTE(review): `T` and `method` are used but not declared in the visible
+// text; presumably template parameters whose list was stripped upstream.
+// NOTE(review): presumably `theta` is in radians, since it is passed straight
+// to cos()/sin() - confirm.
+template
+void rotate(Array output, const Array input, const float theta)
+{
+    const af::dim4 odims    = output.dims();
+    const af::dim4 idims    = input.dims();
+    const af::dim4 ostrides = output.strides();
+    const af::dim4 istrides = input.strides();
+
+    const T* in   = input.get();
+          T* out  = output.get();
+    // Only dimension 2 is passed on as the image count.
+    dim_t nimages = idims[2];
+
+    // Per-pixel transform routine, chosen below from the interpolation method.
+    void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
+                 const af::dim4 &, const af::dim4 &,
+                 const dim_t, const dim_t, const dim_t, const dim_t,
+                 const bool);
+
+    const float c = cos(-theta), s = sin(-theta);
+    float tx, ty;
+    {
+        // Translation that maps the centre of the output, (mx, my), onto the
+        // centre of the input, (nx, ny), after rotation.
+        const float nx = 0.5 * (idims[0] - 1);
+        const float ny = 0.5 * (idims[1] - 1);
+        const float mx = 0.5 * (odims[0] - 1);
+        const float my = 0.5 * (odims[1] - 1);
+        const float sx = (mx * c + my *-s);
+        const float sy = (mx * s + my * c);
+        tx = -(sx - nx);
+        ty = -(sy - ny);
+    }
+
+    // Affine matrix, each entry rounded to three decimal places.
+    const float tmat[6] = {std::round( c * 1000) / 1000.0f,
+                           std::round(-s * 1000) / 1000.0f,
+                           std::round(tx * 1000) / 1000.0f,
+                           std::round( s * 1000) / 1000.0f,
+                           std::round( c * 1000) / 1000.0f,
+                           std::round(ty * 1000) / 1000.0f,
+                          };
+
+    switch(method) {
+        case AF_INTERP_NEAREST:
+            t_fn = &transform_n;
+            break;
+        case AF_INTERP_BILINEAR:
+            t_fn = &transform_b;
+            break;
+        case AF_INTERP_LOWER:
+            t_fn = &transform_l;
+            break;
+        default:
+            // NOTE(review): t_fn stays uninitialised on this path; safe only
+            // if AF_ERROR does not return - confirm.
+            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
+            break;
+    }
+
+
+    // Do transform for image
+    for(int yy = 0; yy < (int)odims[1]; yy++) {
+        for(int xx = 0; xx < (int)odims[0]; xx++) {
+            t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy, false);
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/scan.hpp b/src/backend/cpu/kernel/scan.hpp
new file mode 100644
index 0000000000..0bcfe7df17
--- /dev/null
+++ b/src/backend/cpu/kernel/scan.hpp
@@ -0,0 +1,72 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include
+#include
+
+namespace cpu
+{
+namespace kernel
+{
+
+// NOTE(review): the template parameter/argument lists and the #include
+// targets are missing from this text (stripped upstream). `D`, `Ti`, `To`,
+// `Transform` and `Binary` are used below without a visible declaration;
+// presumably they come from those lists. Confirm against the real header.
+
+// Recursive step: for each index along dimension D - 1 of the output,
+// enqueues the next-level functor with both offsets advanced by that
+// dimension's strides.
+template
+struct scan_dim
+{
+    void operator()(Array out, dim_t outOffset,
+                    const Array in, dim_t inOffset,
+                    const int dim) const
+    {
+        const dim4 odims    = out.dims();
+        const dim4 ostrides = out.strides();
+        const dim4 istrides = in.strides();
+
+        const int D1 = D - 1;
+        for (dim_t i = 0; i < odims[D1]; i++) {
+            scan_dim func;
+            getQueue().enqueue(func,
+                    out, outOffset + i * ostrides[D1],
+                    in, inOffset + i * istrides[D1], dim);
+            // The scanned dimension is walked by the terminal step itself,
+            // so only one task is enqueued for it.
+            if (D1 == dim) break;
+        }
+    }
+};
+
+// Terminal step (presumably the specialisation that ends the recursion; its
+// argument list is not visible): writes the running result of the binary
+// operation along `dim` to the output, one element per input element.
+template
+struct scan_dim
+{
+    void operator()(Array output, dim_t outOffset,
+                    const Array input,  dim_t inOffset,
+                    const int dim) const
+    {
+        const Ti* in = input.get() + inOffset;
+              To* out= output.get()+ outOffset;
+
+        const dim4 ostrides = output.strides();
+        const dim4 istrides = input.strides();
+        const dim4 idims    = input.dims();
+
+        dim_t istride = istrides[dim];
+        dim_t ostride = ostrides[dim];
+
+        Transform transform;
+        // FIXME: Change the name to something better
+        Binary scan;
+
+        // Running value starts at the operation's initial value.
+        To out_val = scan.init();
+        for (dim_t i = 0; i < idims[dim]; i++) {
+            To in_val = transform(in[i * istride]);
+            out_val = scan(in_val, out_val);
+            out[i * ostride] = out_val;
+        }
+    }
+};
+
+}
+}
diff --git a/src/backend/cpu/kernel/select.hpp b/src/backend/cpu/kernel/select.hpp
new file mode 100644
index 0000000000..1099c7e437
--- /dev/null
+++ b/src/backend/cpu/kernel/select.hpp
@@ -0,0 +1,124 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void select(Array out, const Array cond, const Array a, const Array b) +{ + af::dim4 adims = a.dims(); + af::dim4 astrides = a.strides(); + af::dim4 bdims = b.dims(); + af::dim4 bstrides = b.strides(); + + af::dim4 cdims = cond.dims(); + af::dim4 cstrides = cond.strides(); + + af::dim4 odims = out.dims(); + af::dim4 ostrides = out.strides(); + + bool is_a_same[] = {adims[0] == odims[0], adims[1] == odims[1], + adims[2] == odims[2], adims[3] == odims[3]}; + + bool is_b_same[] = {bdims[0] == odims[0], bdims[1] == odims[1], + bdims[2] == odims[2], bdims[3] == odims[3]}; + + bool is_c_same[] = {cdims[0] == odims[0], cdims[1] == odims[1], + cdims[2] == odims[2], cdims[3] == odims[3]}; + + const T *aptr = a.get(); + const T *bptr = b.get(); + T *optr = out.get(); + const char *cptr = cond.get(); + + for (int l = 0; l < odims[3]; l++) { + + int o_off3 = ostrides[3] * l; + int a_off3 = astrides[3] * is_a_same[3] * l; + int b_off3 = bstrides[3] * is_b_same[3] * l; + int c_off3 = cstrides[3] * is_c_same[3] * l; + + for (int k = 0; k < odims[2]; k++) { + + int o_off2 = ostrides[2] * k + o_off3; + int a_off2 = astrides[2] * is_a_same[2] * k + a_off3; + int b_off2 = bstrides[2] * is_b_same[2] * k + b_off3; + int c_off2 = cstrides[2] * is_c_same[2] * k + c_off3; + + for (int j = 0; j < odims[1]; j++) { + + int o_off1 = ostrides[1] * j + o_off2; + int a_off1 = astrides[1] * is_a_same[1] * j + a_off2; + int b_off1 = bstrides[1] * is_b_same[1] * j + b_off2; + int c_off1 = cstrides[1] * is_c_same[1] * j + c_off2; + + for (int i = 0; i < odims[0]; i++) { + + bool cval = is_c_same[0] ? cptr[c_off1 + i] : cptr[c_off1]; + T aval = is_a_same[0] ? 
aptr[a_off1 + i] : aptr[a_off1]; + T bval = is_b_same[0] ? bptr[b_off1 + i] : bptr[b_off1]; + T oval = cval ? aval : bval; + optr[o_off1 + i] = oval; + } + } + } + } +} + +template +void select_scalar(Array out, const Array cond, const Array a, const double b) +{ + af::dim4 astrides = a.strides(); + af::dim4 cstrides = cond.strides(); + + af::dim4 odims = out.dims(); + af::dim4 ostrides = out.strides(); + + const T *aptr = a.get(); + T *optr = out.get(); + const char *cptr = cond.get(); + + for (int l = 0; l < odims[3]; l++) { + + int o_off3 = ostrides[3] * l; + int a_off3 = astrides[3] * l; + int c_off3 = cstrides[3] * l; + + for (int k = 0; k < odims[2]; k++) { + + int o_off2 = ostrides[2] * k + o_off3; + int a_off2 = astrides[2] * k + a_off3; + int c_off2 = cstrides[2] * k + c_off3; + + for (int j = 0; j < odims[1]; j++) { + + int o_off1 = ostrides[1] * j + o_off2; + int a_off1 = astrides[1] * j + a_off2; + int c_off1 = cstrides[1] * j + c_off2; + + for (int i = 0; i < odims[0]; i++) { + + optr[o_off1 + i] = (flip ^ cptr[c_off1 + i]) ? aptr[a_off1 + i] : b; + } + } + } + } +} + + + +} +} diff --git a/src/backend/cpu/kernel/shift.hpp b/src/backend/cpu/kernel/shift.hpp new file mode 100644 index 0000000000..8beb975486 --- /dev/null +++ b/src/backend/cpu/kernel/shift.hpp @@ -0,0 +1,69 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +static inline dim_t simple_mod(const dim_t i, const dim_t dim) +{ + return (i < dim) ? 
i : (i - dim); +} + +template +void shift(Array out, const Array in, const af::dim4 sdims) +{ + T* outPtr = out.get(); + const T* inPtr = in.get(); + + const af::dim4 oDims = out.dims(); + const af::dim4 ist = in.strides(); + const af::dim4 ost = out.strides(); + + int sdims_[4]; + // Need to do this because we are mapping output to input in the kernel + for(int i = 0; i < 4; i++) { + // sdims_[i] will always be positive and always [0, oDims[i]]. + // Negative shifts are converted to position by going the other way round + sdims_[i] = -(sdims[i] % (int)oDims[i]) + oDims[i] * (sdims[i] > 0); + assert(sdims_[i] >= 0 && sdims_[i] <= oDims[i]); + } + + for(dim_t ow = 0; ow < oDims[3]; ow++) { + const int oW = ow * ost[3]; + const int iw = simple_mod((ow + sdims_[3]), oDims[3]); + const int iW = iw * ist[3]; + for(dim_t oz = 0; oz < oDims[2]; oz++) { + const int oZW = oW + oz * ost[2]; + const int iz = simple_mod((oz + sdims_[2]), oDims[2]); + const int iZW = iW + iz * ist[2]; + for(dim_t oy = 0; oy < oDims[1]; oy++) { + const int oYZW = oZW + oy * ost[1]; + const int iy = simple_mod((oy + sdims_[1]), oDims[1]); + const int iYZW = iZW + iy * ist[1]; + for(dim_t ox = 0; ox < oDims[0]; ox++) { + const int oIdx = oYZW + ox; + const int ix = simple_mod((ox + sdims_[0]), oDims[0]); + const int iIdx = iYZW + ix; + + outPtr[oIdx] = inPtr[iIdx]; + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/sift_nonfree.hpp b/src/backend/cpu/kernel/sift_nonfree.hpp new file mode 100644 index 0000000000..e7ca19175c --- /dev/null +++ b/src/backend/cpu/kernel/sift_nonfree.hpp @@ -0,0 +1,1196 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +// The source code contained in this file is based on the original code by +// Rob Hess. Please note that SIFT is an algorithm patented and protected +// by US law, before using this code or any binary forms generated from it, +// verify that you have permission to do so. The original license by Rob Hess +// can be read below: +// +// Copyright (c) 2006-2012, Rob Hess +// All rights reserved. +// +// The following patent has been issued for methods embodied in this +// software: "Method and apparatus for identifying scale invariant features +// in an image and use of same for locating an object in an image," David +// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application +// filed March 8, 1999. Asignee: The University of British Columbia. For +// further details, contact David Lowe (lowe@cs.ubc.ca) or the +// University-Industry Liaison Office of the University of British +// Columbia. +// +// Note that restrictions imposed by this patent (and possibly others) +// exist independently of and may be in conflict with the freedoms granted +// in this license, which refers to copyright of the program, not patents +// for any methods that it implements. Both copyright and patent law must +// be obeyed to legally use and redistribute this program and it is not the +// purpose of this license to induce you to infringe any patents or other +// property right claims or to contest validity of any such claims. If you +// redistribute or use the program, then this license merely protects you +// from committing copyright infringement. It does not protect you from +// committing patent infringement. So, before you do anything with this +// program, make sure that you have permission to do so not merely in terms +// of copyright, but also in terms of patent law. 
+// +// Please note that this license is not to be understood as a guarantee +// either. If you use the program according to this license, but in +// conflict with patent law, it does not mean that the licensor will refund +// you for any losses that you incur if you are sued for your patent +// infringement. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// * Redistributions of source code must retain the above copyright and +// patent notices, this list of conditions and the following +// disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Oregon State University nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +using af::dim4; + +namespace cpu +{ + +static const float PI_VAL = 3.14159265358979323846f; + +// default width of descriptor histogram array +static const int DescrWidth = 4; + +// default number of bins per histogram in descriptor array +static const int DescrHistBins = 8; + +// assumed gaussian blur for input image +static const float InitSigma = 0.5f; + +// width of border in which to ignore keypoints +static const int ImgBorder = 5; + +// maximum steps of keypoint interpolation before failure +static const int MaxInterpSteps = 5; + +// default number of bins in histogram for orientation assignment +static const int OriHistBins = 36; + +// determines gaussian sigma for orientation assignment +static const float OriSigFctr = 1.5f; + +// determines the radius of the region used in orientation assignment */ +static const float OriRadius = 3.0f * OriSigFctr; + +// number of passes of orientation histogram smoothing +static const int SmoothOriPasses = 2; + +// orientation magnitude relative to max that results in new feature +static const float OriPeakRatio = 0.8f; + +// determines the size of a single descriptor orientation histogram +static const float DescrSclFctr = 3.f; + +// threshold on magnitude of elements of descriptor vector +static const float DescrMagThr = 0.2f; + +// factor used to convert floating-point descriptor to unsigned char +static const float IntDescrFctr = 512.f; + +// Number of GLOH bins in radial direction +static const unsigned GLOHRadialBins = 3; + +// Radiuses of GLOH descriptors +static const float GLOHRadii[GLOHRadialBins] = {6.f, 11.f, 15.f}; + +// Number of GLOH angular bins (excluding the inner-most radial section) +static const unsigned GLOHAngularBins = 8; + +// Number of GLOH bins per histogram in descriptor +static const unsigned GLOHHistBins = 16; + +typedef struct +{ + float f[4]; + unsigned l; +} feat_t; + +bool feat_cmp(feat_t i, feat_t j) +{ + for (int k = 0; k < 4; k++) + if (i.f[k] != j.f[k]) + return (i.f[k] < 
j.f[k]); + if (i.l != j.l) + return (i.l < j.l); + + return true; +} + +void array_to_feat(std::vector& feat, float *x, float *y, unsigned *layer, float *resp, float *size, unsigned nfeat) +{ + feat.resize(nfeat); + for (unsigned i = 0; i < feat.size(); i++) { + feat[i].f[0] = x[i]; + feat[i].f[1] = y[i]; + feat[i].f[2] = resp[i]; + feat[i].f[3] = size[i]; + feat[i].l = layer[i]; + } +} + +template +void gaussian1D(T* out, const int dim, double sigma=0.0) +{ + if(!(sigma>0)) sigma = 0.25*dim; + + T sum = (T)0; + for(int i=0;i +Array gauss_filter(float sigma) +{ + // Using 6-sigma rule + unsigned gauss_len = std::min((unsigned)round(sigma * 6 + 1) | 1, 31u); + + Array filter = createEmptyArray(gauss_len); + gaussian1D((T*)getDevicePtr(filter), gauss_len, sigma); + + return filter; +} + +template +void gaussianElimination(float* A, float* b, float* x) +{ + // forward elimination + for (int i = 0; i < N-1; i++) { + for (int j = i+1; j < N; j++) { + float s = A[j*N+i] / A[i*N+i]; + + for (int k = i; k < N; k++) + A[j*N+k] -= s * A[i*N+k]; + + b[j] -= s * b[i]; + } + } + + for (int i = 0; i < N; i++) + x[i] = 0; + + // backward substitution + float sum = 0; + for (int i = 0; i <= N-2; i++) { + sum = b[i]; + for (int j = i+1; j < N; j++) + sum -= A[i*N+j] * x[j]; + x[i] = sum / A[i*N+i]; + } +} + +template +void sub( + Array& out, + const Array& in1, + const Array& in2) +{ + size_t nel = in1.elements(); + T* out_ptr = out.get(); + const T* in1_ptr = in1.get(); + const T* in2_ptr = in2.get(); + + for (size_t i = 0; i < nel; i++) { + out_ptr[i] = in1_ptr[i] - in2_ptr[i]; + } +} + +#define CPTR(Y, X) (center_ptr[(Y) * idims[0] + (X)]) +#define PPTR(Y, X) (prev_ptr[(Y) * idims[0] + (X)]) +#define NPTR(Y, X) (next_ptr[(Y) * idims[0] + (X)]) + +// Determines whether a pixel is a scale-space extremum by comparing it to its +// 3x3x3 pixel neighborhood. 
+template +void detectExtrema( + float* x_out, + float* y_out, + unsigned* layer_out, + unsigned* counter, + const Array& prev, + const Array& center, + const Array& next, + const unsigned layer, + const unsigned max_feat, + const float threshold) +{ + const af::dim4 idims = center.dims(); + const T* prev_ptr = prev.get(); + const T* center_ptr = center.get(); + const T* next_ptr = next.get(); + + for (int y = ImgBorder; y < idims[1]-ImgBorder; y++) { + for (int x = ImgBorder; x < idims[0]-ImgBorder; x++) { + float p = center_ptr[y*idims[0] + x]; + + // Find extrema + if (abs((float)p) > threshold && + ((p > 0 && p > CPTR(y-1, x-1) && p > CPTR(y-1, x) && + p > CPTR(y-1, x+1) && p > CPTR(y, x-1) && p > CPTR(y, x+1) && + p > CPTR(y+1, x-1) && p > CPTR(y+1, x) && p > CPTR(y+1, x+1) && + p > PPTR(y-1, x-1) && p > PPTR(y-1, x) && p > PPTR(y-1, x+1) && + p > PPTR(y, x-1) && p > PPTR(y , x) && p > PPTR(y, x+1) && + p > PPTR(y+1, x-1) && p > PPTR(y+1, x) && p > PPTR(y+1, x+1) && + p > NPTR(y-1, x-1) && p > NPTR(y-1, x) && p > NPTR(y-1, x+1) && + p > NPTR(y, x-1) && p > NPTR(y , x) && p > NPTR(y, x+1) && + p > NPTR(y+1, x-1) && p > NPTR(y+1, x) && p > NPTR(y+1, x+1)) || + (p < 0 && p < CPTR(y-1, x-1) && p < CPTR(y-1, x) && + p < CPTR(y-1, x+1) && p < CPTR(y, x-1) && p < CPTR(y, x+1) && + p < CPTR(y+1, x-1) && p < CPTR(y+1, x) && p < CPTR(y+1, x+1) && + p < PPTR(y-1, x-1) && p < PPTR(y-1, x) && p < PPTR(y-1, x+1) && + p < PPTR(y, x-1) && p < PPTR(y , x) && p < PPTR(y, x+1) && + p < PPTR(y+1, x-1) && p < PPTR(y+1, x) && p < PPTR(y+1, x+1) && + p < NPTR(y-1, x-1) && p < NPTR(y-1, x) && p < NPTR(y-1, x+1) && + p < NPTR(y, x-1) && p < NPTR(y , x) && p < NPTR(y, x+1) && + p < NPTR(y+1, x-1) && p < NPTR(y+1, x) && p < NPTR(y+1, x+1)))) { + + if (*counter < max_feat) + { + x_out[*counter] = (float)y; + y_out[*counter] = (float)x; + layer_out[*counter] = layer; + (*counter)++; + } + } + } + } +} + +// Interpolates a scale-space extremum's location and scale to subpixel +// accuracy 
to form an image feature. Rejects features with low contrast. +// Based on Section 4 of Lowe's paper. +template +void interpolateExtrema( + float* x_out, + float* y_out, + unsigned* layer_out, + float* response_out, + float* size_out, + unsigned* counter, + const float* x_in, + const float* y_in, + const unsigned* layer_in, + const unsigned extrema_feat, + std::vector< Array >& dog_pyr, + const unsigned max_feat, + const unsigned octave, + const unsigned n_layers, + const float contrast_thr, + const float edge_thr, + const float sigma, + const float img_scale) +{ + for (int f = 0; f < (int)extrema_feat; f++) { + const float first_deriv_scale = img_scale*0.5f; + const float second_deriv_scale = img_scale; + const float cross_deriv_scale = img_scale*0.25f; + + float xl = 0, xy = 0, xx = 0, contr = 0; + int i = 0; + + unsigned x = x_in[f]; + unsigned y = y_in[f]; + unsigned layer = layer_in[f]; + + const T* prev_ptr = dog_pyr[octave*(n_layers+2) + layer-1].get(); + const T* center_ptr = dog_pyr[octave*(n_layers+2) + layer].get(); + const T* next_ptr = dog_pyr[octave*(n_layers+2) + layer+1].get(); + + af::dim4 idims = dog_pyr[octave*(n_layers+2)].dims(); + + bool converges = true; + + for (i = 0; i < MaxInterpSteps; i++) { + float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale, + (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale, + (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale}; + + float d2 = CPTR(x, y) * 2.f; + float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale; + float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale; + float dss = (NPTR(x, y ) + PPTR(x, y ) - d2) * second_deriv_scale; + float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) - + CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale; + float dxs = (NPTR(x+1, y) - NPTR(x-1, y) - + PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale; + float dys = (NPTR(x, y+1) - NPTR(x-1, y-1) - + PPTR(x, y-1) + PPTR(x-1, y-1)) * cross_deriv_scale; + + float H[9] = 
{dxx, dxy, dxs, + dxy, dyy, dys, + dxs, dys, dss}; + + float X[3]; + gaussianElimination<3>(H, dD, X); + + xl = -X[2]; + xy = -X[1]; + xx = -X[0]; + + if (fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f) + break; + + x += round(xx); + y += round(xy); + layer += round(xl); + + if (layer < 1 || layer > n_layers || + x < ImgBorder || x >= idims[1] - ImgBorder || + y < ImgBorder || y >= idims[0] - ImgBorder) { + converges = false; + break; + } + } + + // ensure convergence of interpolation + if (i >= MaxInterpSteps || !converges) + continue; + + float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale, + (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale, + (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale}; + float X[3] = {xx, xy, xl}; + + float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2]; + + contr = center_ptr[x*idims[0]+y]*img_scale + P * 0.5f; + if(abs(contr) < (contrast_thr / n_layers)) + continue; + + // principal curvatures are computed using the trace and det of Hessian + float d2 = CPTR(x, y) * 2.f; + float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale; + float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale; + float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) - + CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale; + + float tr = dxx + dyy; + float det = dxx * dyy - dxy * dxy; + + // add FLT_EPSILON for double-precision compatibility + if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON) + continue; + + if (*counter < max_feat) + { + x_out[*counter] = (x + xx) * (1 << octave); + y_out[*counter] = (y + xy) * (1 << octave); + layer_out[*counter] = layer; + response_out[*counter] = abs(contr); + size_out[*counter] = sigma*pow(2.f, octave + (layer + xl) / n_layers) * 2.f; + (*counter)++; + } + } +} + +#undef CPTR +#undef PPTR +#undef NPTR + +// Remove duplicate keypoints +void removeDuplicates( + float* x_out, + float* y_out, + unsigned* layer_out, + float* response_out, + float* 
size_out, + unsigned* counter, + const std::vector& sorted_feat) +{ + size_t nfeat = sorted_feat.size(); + + for (size_t f = 0; f < nfeat; f++) { + float prec_fctr = 1e4f; + + if (f < nfeat-1) { + if (round(sorted_feat[f].f[0]*prec_fctr) == round(sorted_feat[f+1].f[0]*prec_fctr) && + round(sorted_feat[f].f[1]*prec_fctr) == round(sorted_feat[f+1].f[1]*prec_fctr) && + round(sorted_feat[f].f[2]*prec_fctr) == round(sorted_feat[f+1].f[2]*prec_fctr) && + round(sorted_feat[f].f[3]*prec_fctr) == round(sorted_feat[f+1].f[3]*prec_fctr) && + sorted_feat[f].l == sorted_feat[f+1].l) + continue; + } + + x_out[*counter] = sorted_feat[f].f[0]; + y_out[*counter] = sorted_feat[f].f[1]; + response_out[*counter] = sorted_feat[f].f[2]; + size_out[*counter] = sorted_feat[f].f[3]; + layer_out[*counter] = sorted_feat[f].l; + (*counter)++; + } +} + +#define IPTR(Y, X) (img_ptr[(Y) * idims[0] + (X)]) + +// Computes a canonical orientation for each image feature in an array. Based +// on Section 5 of Lowe's paper. This function adds features to the array when +// there is more than one dominant orientation at a given feature location. 
+template +void calcOrientation( + float* x_out, + float* y_out, + unsigned* layer_out, + float* response_out, + float* size_out, + float* ori_out, + unsigned* counter, + const float* x_in, + const float* y_in, + const unsigned* layer_in, + const float* response_in, + const float* size_in, + const unsigned total_feat, + const std::vector< Array >& gauss_pyr, + const unsigned max_feat, + const unsigned octave, + const unsigned n_layers, + const bool double_input) +{ + const int n = OriHistBins; + + float hist[OriHistBins]; + float temphist[OriHistBins]; + + for (unsigned f = 0; f < total_feat; f++) { + // Load keypoint information + const float real_x = x_in[f]; + const float real_y = y_in[f]; + const unsigned layer = layer_in[f]; + const float response = response_in[f]; + const float size = size_in[f]; + + const int pt_x = (int)round(real_x / (1 << octave)); + const int pt_y = (int)round(real_y / (1 << octave)); + + // Calculate auxiliary parameters + const float scl_octv = size*0.5f / (1 << octave); + const int radius = (int)round(OriRadius * scl_octv); + const float sigma = OriSigFctr * scl_octv; + const int len = (radius*2+1); + const float exp_denom = 2.f * sigma * sigma; + + // Points img to correct Gaussian pyramid layer + const Array img = gauss_pyr[octave*(n_layers+3) + layer]; + const T* img_ptr = img.get(); + + for (int i = 0; i < OriHistBins; i++) + hist[i] = 0.f; + + af::dim4 idims = img.dims(); + + // Calculate orientation histogram + for (int l = 0; l < len*len; l++) { + int i = l / len - radius; + int j = l % len - radius; + + int y = pt_y + i; + int x = pt_x + j; + if (y < 1 || y >= idims[0] - 1 || + x < 1 || x >= idims[1] - 1) + continue; + + float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y)); + float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1)); + + float mag = sqrt(dx*dx+dy*dy); + float ori = atan2(dy,dx); + float w = exp(-(i*i + j*j)/exp_denom); + + int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL)); + bin = bin < n ? 
bin : 0; + + hist[bin] += w*mag; + } + + for (int i = 0; i < SmoothOriPasses; i++) { + for (int j = 0; j < n; j++) { + temphist[j] = hist[j]; + } + for (int j = 0; j < n; j++) { + float prev = (j == 0) ? temphist[n-1] : temphist[j-1]; + float next = (j+1 == n) ? temphist[0] : temphist[j+1]; + hist[j] = 0.25f * prev + 0.5f * temphist[j] + 0.25f * next; + } + } + + float omax = hist[0]; + for (int i = 1; i < n; i++) + omax = max(omax, hist[i]); + + float mag_thr = (float)(omax * OriPeakRatio); + int l, r; + for (int j = 0; j < n; j++) { + l = (j == 0) ? n - 1 : j - 1; + r = (j + 1) % n; + if (hist[j] > hist[l] && + hist[j] > hist[r] && + hist[j] >= mag_thr) { + if (*counter < max_feat) { + float bin = j + 0.5f * (hist[l] - hist[r]) / + (hist[l] - 2.0f*hist[j] + hist[r]); + bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin; + float ori = 360.f - ((360.f/n) * bin); + + float new_real_x = real_x; + float new_real_y = real_y; + float new_size = size; + + if (double_input) { + float scale = 0.5f; + new_real_x *= scale; + new_real_y *= scale; + new_size *= scale; + } + + x_out[*counter] = new_real_x; + y_out[*counter] = new_real_y; + layer_out[*counter] = layer; + response_out[*counter] = response; + size_out[*counter] = new_size; + ori_out[*counter] = ori; + (*counter)++; + } + } + } + } +} + +void normalizeDesc( + float* desc, + const int histlen) +{ + float len_sq = 0.0f; + + for (int i = 0; i < histlen; i++) + len_sq += desc[i] * desc[i]; + + float len_inv = 1.0f / sqrt(len_sq); + + for (int i = 0; i < histlen; i++) { + desc[i] *= len_inv; + } +} + +// Computes feature descriptors for features in an array. Based on Section 6 +// of Lowe's paper. 
+template +void computeDescriptor( + float* desc_out, + const unsigned desc_len, + const float* x_in, + const float* y_in, + const unsigned* layer_in, + const float* response_in, + const float* size_in, + const float* ori_in, + const unsigned total_feat, + const std::vector< Array >& gauss_pyr, + const int d, + const int n, + const float scale, + const unsigned octave, + const unsigned n_layers) +{ + float desc[128]; + + for (unsigned f = 0; f < total_feat; f++) { + const unsigned layer = layer_in[f]; + float ori = (360.f - ori_in[f]) * PI_VAL / 180.f; + ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori; + const float size = size_in[f]; + const int fx = round(x_in[f] * scale); + const int fy = round(y_in[f] * scale); + + // Points img to correct Gaussian pyramid layer + Array img = gauss_pyr[octave*(n_layers+3) + layer]; + const T* img_ptr = img.get(); + af::dim4 idims = img.dims(); + + float cos_t = cos(ori); + float sin_t = sin(ori); + float bins_per_rad = n / (PI_VAL * 2.f); + float exp_denom = d * d * 0.5f; + float hist_width = DescrSclFctr * size * scale * 0.5f; + int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f; + + int len = radius*2+1; + + for (int i = 0; i < (int)desc_len; i++) + desc[i] = 0.f; + + // Calculate orientation histogram + for (int l = 0; l < len*len; l++) { + int i = l / len - radius; + int j = l % len - radius; + + int y = fy + i; + int x = fx + j; + + float x_rot = (j * cos_t - i * sin_t) / hist_width; + float y_rot = (j * sin_t + i * cos_t) / hist_width; + float xbin = x_rot + d/2 - 0.5f; + float ybin = y_rot + d/2 - 0.5f; + + if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d && + y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) { + float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y)); + float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1)); + + float grad_mag = sqrt(dx*dx + dy*dy); + float grad_ori = atan2(dy, dx) - ori; + while (grad_ori < 0.0f) + grad_ori += PI_VAL*2; + while (grad_ori >= PI_VAL*2) + grad_ori -= PI_VAL*2; + + 
float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom); + float obin = grad_ori * bins_per_rad; + float mag = grad_mag*w; + + int x0 = floor(xbin); + int y0 = floor(ybin); + int o0 = floor(obin); + xbin -= x0; + ybin -= y0; + obin -= o0; + + for (int yl = 0; yl <= 1; yl++) { + int yb = y0 + yl; + if (yb >= 0 && yb < d) { + float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin); + for (int xl = 0; xl <= 1; xl++) { + int xb = x0 + xl; + if (xb >= 0 && xb < d) { + float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin); + for (int ol = 0; ol <= 1; ol++) { + int ob = (o0 + ol) % n; + float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin); + desc[(yb*d + xb)*n + ob] += v_o; + } + } + } + } + } + } + } + + normalizeDesc(desc, desc_len); + + for (int i = 0; i < (int)desc_len; i++) + desc[i] = min(desc[i], DescrMagThr); + + normalizeDesc(desc, desc_len); + + // Calculate final descriptor values + for (int k = 0; k < (int)desc_len; k++) { + desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr)); + } + } +} + +// Computes GLOH feature descriptors for features in an array. Based on Section III-B +// of Mikolajczyk and Schmid paper. +template +void computeGLOHDescriptor( + float* desc_out, + const unsigned desc_len, + const float* x_in, + const float* y_in, + const unsigned* layer_in, + const float* response_in, + const float* size_in, + const float* ori_in, + const unsigned total_feat, + const std::vector< Array >& gauss_pyr, + const int d, + const unsigned rb, + const unsigned ab, + const unsigned hb, + const float scale, + const unsigned octave, + const unsigned n_layers) +{ + float desc[272]; + + for (unsigned f = 0; f < total_feat; f++) { + const unsigned layer = layer_in[f]; + float ori = (360.f - ori_in[f]) * PI_VAL / 180.f; + ori = (ori > PI_VAL) ? 
ori - PI_VAL*2 : ori; + const float size = size_in[f]; + const int fx = round(x_in[f] * scale); + const int fy = round(y_in[f] * scale); + + // Points img to correct Gaussian pyramid layer + Array img = gauss_pyr[octave*(n_layers+3) + layer]; + const T* img_ptr = img.get(); + af::dim4 idims = img.dims(); + + float cos_t = cos(ori); + float sin_t = sin(ori); + float hist_bins_per_rad = hb / (PI_VAL * 2.f); + float polar_bins_per_rad = ab / (PI_VAL * 2.f); + float exp_denom = GLOHRadii[rb-1] * 0.5f; + + float hist_width = DescrSclFctr * size * scale * 0.5f; + + // Keep same descriptor radius used for SIFT + int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f; + + // Alternative radius size calculation, changing the radius weight + // (rw) in the range of 0.25f-0.75f gives different results, + // increasing it tends to show a better recall rate but with a + // smaller amount of correct matches + //float rw = 0.5f; + //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f; + + int len = radius*2+1; + + for (int i = 0; i < (int)desc_len; i++) + desc[i] = 0.f; + + // Calculate orientation histogram + for (int l = 0; l < len*len; l++) { + int i = l / len - radius; + int j = l % len - radius; + + int y = fy + i; + int x = fx + j; + + float x_rot = (j * cos_t - i * sin_t); + float y_rot = (j * sin_t + i * cos_t); + + float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1]; + float theta = atan2(y_rot, x_rot); + while (theta < 0.0f) + theta += PI_VAL*2; + while (theta >= PI_VAL*2) + theta -= PI_VAL*2; + + float tbin = theta * polar_bins_per_rad; + float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] : + ((r < GLOHRadii[1]) ? 
1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) : + min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON)); + + if (r <= GLOHRadii[rb-1] && + y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) { + float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y)); + float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1)); + + float grad_mag = sqrt(dx*dx + dy*dy); + float grad_ori = atan2(dy, dx) - ori; + while (grad_ori < 0.0f) + grad_ori += PI_VAL*2; + while (grad_ori >= PI_VAL*2) + grad_ori -= PI_VAL*2; + + float w = exp(-r / exp_denom); + float obin = grad_ori * hist_bins_per_rad; + float mag = grad_mag*w; + + int t0 = floor(tbin); + int r0 = floor(rbin); + int o0 = floor(obin); + tbin -= t0; + rbin -= r0; + obin -= o0; + + for (int rl = 0; rl <= 1; rl++) { + int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl); + float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin); + if (rb >= 0 && rb <= 2) { + for (int tl = 0; tl <= 1; tl++) { + int tb = (t0 + tl) % ab; + float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin); + for (int ol = 0; ol <= 1; ol++) { + int ob = (o0 + ol) % hb; + float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin); + unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob; + desc[idx] += v_o; + } + } + } + } + } + } + + normalizeDesc(desc, desc_len); + + for (int i = 0; i < (int)desc_len; i++) + desc[i] = min(desc[i], DescrMagThr); + + normalizeDesc(desc, desc_len); + + // Calculate final descriptor values + for (int k = 0; k < (int)desc_len; k++) { + desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr)); + } + } +} + +#undef IPTR + +template +Array createInitialImage( + const Array& img, + const float init_sigma, + const bool double_input) +{ + af::dim4 idims = img.dims(); + + Array init_img = createEmptyArray(af::dim4()); + + float s = (double_input) ? 
std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4), 0.1f) + : std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma), 0.1f); + + Array filter = gauss_filter(s); + + if (double_input) { + Array double_img = resize(img, idims[0] * 2, idims[1] * 2, AF_INTERP_BILINEAR); + init_img = convolve2(double_img, filter, filter); + } + else { + init_img = convolve2(img, filter, filter); + } + + return init_img; +} + +template +std::vector< Array > buildGaussPyr( + const Array& init_img, + const unsigned n_octaves, + const unsigned n_layers, + const float init_sigma) +{ + // Precompute Gaussian sigmas using the following formula: + // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2 + std::vector sig_layers(n_layers + 3); + sig_layers[0] = init_sigma; + float k = std::pow(2.0f, 1.0f / n_layers); + for (unsigned i = 1; i < n_layers + 3; i++) { + float sig_prev = std::pow(k, i-1) * init_sigma; + float sig_total = sig_prev * k; + sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev); + } + + // Gaussian Pyramid + std::vector< Array > gauss_pyr(n_octaves * (n_layers+3), createEmptyArray(af::dim4())); + for (unsigned o = 0; o < n_octaves; o++) { + for (unsigned l = 0; l < n_layers+3; l++) { + unsigned src_idx = (l == 0) ? 
(o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1; + unsigned idx = o*(n_layers+3) + l; + + if (o == 0 && l == 0) { + gauss_pyr[idx] = init_img; + } + else if (l == 0) { + af::dim4 sdims = gauss_pyr[src_idx].dims(); + gauss_pyr[idx] = resize(gauss_pyr[src_idx], sdims[0] / 2, sdims[1] / 2, AF_INTERP_BILINEAR); + } + else { + Array filter = gauss_filter(sig_layers[l]); + + gauss_pyr[idx] = convolve2(gauss_pyr[src_idx], filter, filter); + } + } + } + + return gauss_pyr; +} + +template +std::vector< Array > buildDoGPyr( + std::vector< Array >& gauss_pyr, + const unsigned n_octaves, + const unsigned n_layers) +{ + // DoG Pyramid + std::vector< Array > dog_pyr(n_octaves * (n_layers+2), createEmptyArray(af::dim4())); + for (unsigned o = 0; o < n_octaves; o++) { + for (unsigned l = 0; l < n_layers+2; l++) { + unsigned idx = o*(n_layers+2) + l; + unsigned bottom = o*(n_layers+3) + l; + unsigned top = o*(n_layers+3) + l+1; + + dog_pyr[idx] = createEmptyArray(gauss_pyr[bottom].dims()); + + sub(dog_pyr[idx], gauss_pyr[top], gauss_pyr[bottom]); + } + } + + return dog_pyr; +} + + +template +unsigned sift_impl(Array& x, Array& y, Array& score, + Array& ori, Array& size, Array& desc, + const Array& in, const unsigned n_layers, + const float contrast_thr, const float edge_thr, + const float init_sigma, const bool double_input, + const float img_scale, const float feature_ratio, + const bool compute_GLOH) +{ + in.eval(); + getQueue().sync(); + af::dim4 idims = in.dims(); + + const unsigned min_dim = (double_input) ? 
min(idims[0]*2, idims[1]*2) + : min(idims[0], idims[1]); + const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2; + + Array init_img = createInitialImage(in, init_sigma, double_input); + + std::vector< Array > gauss_pyr = buildGaussPyr(init_img, n_octaves, n_layers, init_sigma); + + std::vector< Array > dog_pyr = buildDoGPyr(gauss_pyr, n_octaves, n_layers); + + std::vector x_pyr(n_octaves, NULL); + std::vector y_pyr(n_octaves, NULL); + std::vector response_pyr(n_octaves, NULL); + std::vector size_pyr(n_octaves, NULL); + std::vector ori_pyr(n_octaves, NULL); + std::vector desc_pyr(n_octaves, NULL); + std::vector feat_pyr(n_octaves, 0); + unsigned total_feat = 0; + + const unsigned d = DescrWidth; + const unsigned n = DescrHistBins; + const unsigned rb = GLOHRadialBins; + const unsigned ab = GLOHAngularBins; + const unsigned hb = GLOHHistBins; + const unsigned desc_len = (compute_GLOH) ? (1 + (rb-1) * ab) * hb : d*d*n; + + for (unsigned i = 0; i < n_octaves; i++) { + af::dim4 ddims = dog_pyr[i*(n_layers+2)].dims(); + if (ddims[0]-2*ImgBorder < 1 || + ddims[1]-2*ImgBorder < 1) + continue; + + const unsigned imel = ddims[0] * ddims[1]; + const unsigned max_feat = ceil(imel * feature_ratio); + + float* extrema_x = memAlloc(max_feat); + float* extrema_y = memAlloc(max_feat); + unsigned* extrema_layer = memAlloc(max_feat); + unsigned extrema_feat = 0; + + for (unsigned j = 1; j <= n_layers; j++) { + unsigned prev = i*(n_layers+2) + j-1; + unsigned center = i*(n_layers+2) + j; + unsigned next = i*(n_layers+2) + j+1; + + unsigned layer = j; + + float extrema_thr = 0.5f * contrast_thr / n_layers; + detectExtrema(extrema_x, extrema_y, extrema_layer, &extrema_feat, + dog_pyr[prev], dog_pyr[center], dog_pyr[next], + layer, max_feat, extrema_thr); + } + + extrema_feat = min(extrema_feat, max_feat); + + if (extrema_feat == 0) { + memFree(extrema_x); + memFree(extrema_y); + memFree(extrema_layer); + + continue; + } + + unsigned interp_feat = 0; + + float* interp_x = 
memAlloc(extrema_feat); + float* interp_y = memAlloc(extrema_feat); + unsigned* interp_layer = memAlloc(extrema_feat); + float* interp_response = memAlloc(extrema_feat); + float* interp_size = memAlloc(extrema_feat); + + interpolateExtrema(interp_x, interp_y, interp_layer, + interp_response, interp_size, &interp_feat, + extrema_x, extrema_y, extrema_layer, extrema_feat, + dog_pyr, max_feat, i, n_layers, + contrast_thr, edge_thr, init_sigma, img_scale); + + interp_feat = min(interp_feat, max_feat); + + if (interp_feat == 0) { + memFree(interp_x); + memFree(interp_y); + memFree(interp_layer); + memFree(interp_response); + memFree(interp_size); + + continue; + } + + std::vector sorted_feat; + array_to_feat(sorted_feat, interp_x, interp_y, interp_layer, interp_response, interp_size, interp_feat); + std::stable_sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp); + + memFree(interp_x); + memFree(interp_y); + memFree(interp_layer); + memFree(interp_response); + memFree(interp_size); + + unsigned nodup_feat = 0; + + float* nodup_x = memAlloc(interp_feat); + float* nodup_y = memAlloc(interp_feat); + unsigned* nodup_layer = memAlloc(interp_feat); + float* nodup_response = memAlloc(interp_feat); + float* nodup_size = memAlloc(interp_feat); + + removeDuplicates(nodup_x, nodup_y, nodup_layer, + nodup_response, nodup_size, &nodup_feat, + sorted_feat); + + const unsigned max_oriented_feat = nodup_feat * 3; + + float* oriented_x = memAlloc(max_oriented_feat); + float* oriented_y = memAlloc(max_oriented_feat); + unsigned* oriented_layer = memAlloc(max_oriented_feat); + float* oriented_response = memAlloc(max_oriented_feat); + float* oriented_size = memAlloc(max_oriented_feat); + float* oriented_ori = memAlloc(max_oriented_feat); + + unsigned oriented_feat = 0; + + calcOrientation(oriented_x, oriented_y, oriented_layer, + oriented_response, oriented_size, oriented_ori, &oriented_feat, + nodup_x, nodup_y, nodup_layer, + nodup_response, nodup_size, nodup_feat, + gauss_pyr, 
max_oriented_feat, i, n_layers, double_input); + + memFree(nodup_x); + memFree(nodup_y); + memFree(nodup_layer); + memFree(nodup_response); + memFree(nodup_size); + + if (oriented_feat == 0) { + memFree(oriented_x); + memFree(oriented_y); + memFree(oriented_layer); + memFree(oriented_response); + memFree(oriented_size); + memFree(oriented_ori); + + continue; + } + + float* desc = memAlloc(oriented_feat * desc_len); + + float scale = 1.f/(1 << i); + if (double_input) scale *= 2.f; + + if (compute_GLOH) + computeGLOHDescriptor(desc, desc_len, + oriented_x, oriented_y, oriented_layer, + oriented_response, oriented_size, oriented_ori, + oriented_feat, gauss_pyr, d, rb, ab, hb, + scale, i, n_layers); + else + computeDescriptor(desc, desc_len, + oriented_x, oriented_y, oriented_layer, + oriented_response, oriented_size, oriented_ori, + oriented_feat, gauss_pyr, d, n, scale, i, n_layers); + + total_feat += oriented_feat; + feat_pyr[i] = oriented_feat; + + if (oriented_feat > 0) { + x_pyr[i] = oriented_x; + y_pyr[i] = oriented_y; + response_pyr[i] = oriented_response; + ori_pyr[i] = oriented_ori; + size_pyr[i] = oriented_size; + desc_pyr[i] = desc; + } + } + + if (total_feat > 0) { + const af::dim4 total_feat_dims(total_feat); + const af::dim4 desc_dims(desc_len, total_feat); + + // Allocate output memory + x = createEmptyArray(total_feat_dims); + y = createEmptyArray(total_feat_dims); + score = createEmptyArray(total_feat_dims); + ori = createEmptyArray(total_feat_dims); + size = createEmptyArray(total_feat_dims); + desc = createEmptyArray(desc_dims); + + float* x_ptr = x.get(); + float* y_ptr = y.get(); + float* score_ptr = score.get(); + float* ori_ptr = ori.get(); + float* size_ptr = size.get(); + float* desc_ptr = desc.get(); + + unsigned offset = 0; + for (unsigned i = 0; i < n_octaves; i++) { + if (feat_pyr[i] == 0) + continue; + + memcpy(x_ptr+offset, x_pyr[i], feat_pyr[i] * sizeof(float)); + memcpy(y_ptr+offset, y_pyr[i], feat_pyr[i] * sizeof(float)); + 
memcpy(score_ptr+offset, response_pyr[i], feat_pyr[i] * sizeof(float)); + memcpy(ori_ptr+offset, ori_pyr[i], feat_pyr[i] * sizeof(float)); + memcpy(size_ptr+offset, size_pyr[i], feat_pyr[i] * sizeof(float)); + + memcpy(desc_ptr+(offset*desc_len), desc_pyr[i], feat_pyr[i] * desc_len * sizeof(float)); + + memFree(x_pyr[i]); + memFree(y_pyr[i]); + memFree(response_pyr[i]); + memFree(ori_pyr[i]); + memFree(size_pyr[i]); + memFree(desc_pyr[i]); + + offset += feat_pyr[i]; + } + } + + return total_feat; +} + +} diff --git a/src/backend/cpu/kernel/sobel.hpp b/src/backend/cpu/kernel/sobel.hpp new file mode 100644 index 0000000000..49d33cdbb4 --- /dev/null +++ b/src/backend/cpu/kernel/sobel.hpp @@ -0,0 +1,86 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void derivative(Array output, const Array input) +{ + const af::dim4 dims = input.dims(); + const af::dim4 strides = input.strides(); + To* optr = output.get(); + const Ti* iptr = input.get(); + + for(dim_t b3=0; b3=0 && _joff>=0) ? + iptr[_joff*strides[1]+_ioff*strides[0]] : 0; + To SW = (ioff_<(int)dims[0] && _joff>=0) ? + iptr[_joff*strides[1]+ioff_*strides[0]] : 0; + To NE = (_ioff>=0 && joff_<(int)dims[1]) ? + iptr[joff_*strides[1]+_ioff*strides[0]] : 0; + To SE = (ioff_<(int)dims[0] && joff_<(int)dims[1]) ? + iptr[joff_*strides[1]+ioff_*strides[0]] : 0; + + if (isDX) { + To W = _joff>=0 ? + iptr[_joff*strides[1]+ioff*strides[0]] : 0; + + To E = joff_<(int)dims[1] ? + iptr[joff_*strides[1]+ioff*strides[0]] : 0; + + accum = NW+SW - (NE+SE) + 2*(W-E); + } else { + To N = _ioff>=0 ? 
+ iptr[joff*strides[1]+_ioff*strides[0]] : 0; + + To S = ioff_<(int)dims[0] ? + iptr[joff*strides[1]+ioff_*strides[0]] : 0; + + accum = NW+NE - (SW+SE) + 2*(N-S); + } + + optr[joffset+i*strides[0]] = accum; + } + } + + optr += strides[2]; + iptr += strides[2]; + } + optr += strides[3]; + iptr += strides[3]; + } +} + +} +} diff --git a/src/backend/cpu/kernel/sort.hpp b/src/backend/cpu/kernel/sort.hpp new file mode 100644 index 0000000000..292c6383dc --- /dev/null +++ b/src/backend/cpu/kernel/sort.hpp @@ -0,0 +1,52 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +// Based off of http://stackoverflow.com/a/12399290 +template +void sort0(Array val) +{ + // initialize original index locations + T *val_ptr = val.get(); + + function op = std::greater(); + if(isAscending) { op = std::less(); } + + T *comp_ptr = nullptr; + for(dim_t w = 0; w < val.dims()[3]; w++) { + dim_t valW = w * val.strides()[3]; + for(dim_t z = 0; z < val.dims()[2]; z++) { + dim_t valWZ = valW + z * val.strides()[2]; + for(dim_t y = 0; y < val.dims()[1]; y++) { + + dim_t valOffset = valWZ + y * val.strides()[1]; + + comp_ptr = val_ptr + valOffset; + std::sort(comp_ptr, comp_ptr + val.dims()[0], op); + } + } + } + return; +} + +} +} diff --git a/src/backend/cpu/kernel/sort_by_key.hpp b/src/backend/cpu/kernel/sort_by_key.hpp new file mode 100644 index 0000000000..f9d391dc46 --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key.hpp @@ -0,0 +1,86 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. 
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cpu
+{
+namespace kernel
+{
+// Stable-sorts each dim-0 run of ikey; writes sorted keys to okey, matching values to oval, the permutation to oidx.
+template
+void sort0_by_key(Array okey, Array oval, Array oidx,
+                  const Array ikey, const Array ival)
+{
+    function op = std::greater();
+    if(isAscending) { op = std::less(); }
+    // op orders keys descending unless isAscending is set
+    // Raw pointers into the three outputs and the two inputs
+    uint *oidx_ptr = oidx.get();
+    Tk *okey_ptr = okey.get();
+    Tv *oval_ptr = oval.get();
+    const Tk *ikey_ptr = ikey.get();
+    const Tv *ival_ptr = ival.get();
+    // seq_vec = 0, 1, ..., dims()[0]-1: the identity permutation copied into every run before sorting
+    std::vector seq_vec(oidx.dims()[0]);
+    std::iota(seq_vec.begin(), seq_vec.end(), 0);
+    // The comparator ranks two indices by the keys they refer to; comp_ptr (captured by reference) selects the run
+    const Tk *comp_ptr = nullptr;
+    auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
+    // Walk every run; each of the five arrays is addressed through its own strides
+    for(dim_t w = 0; w < ikey.dims()[3]; w++) {
+        dim_t okeyW = w * okey.strides()[3];
+        dim_t ovalW = w * oval.strides()[3];
+        dim_t oidxW = w * oidx.strides()[3];
+        dim_t ikeyW = w * ikey.strides()[3];
+        dim_t ivalW = w * ival.strides()[3];
+
+        for(dim_t z = 0; z < ikey.dims()[2]; z++) {
+            dim_t okeyWZ = okeyW + z * okey.strides()[2];
+            dim_t ovalWZ = ovalW + z * oval.strides()[2];
+            dim_t oidxWZ = oidxW + z * oidx.strides()[2];
+            dim_t ikeyWZ = ikeyW + z * ikey.strides()[2];
+            dim_t ivalWZ = ivalW + z * ival.strides()[2];
+
+            for(dim_t y = 0; y < ikey.dims()[1]; y++) {
+
+                dim_t okeyOffset = okeyWZ + y * okey.strides()[1];
+                dim_t ovalOffset = ovalWZ + y * oval.strides()[1];
+                dim_t oidxOffset = oidxWZ + y * oidx.strides()[1];
+                dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1];
+                dim_t ivalOffset = ivalWZ + y * ival.strides()[1];
+                // Seed this run of oidx with 0..dims()[0]-1
+                uint *ptr = oidx_ptr + oidxOffset;
+                std::copy(seq_vec.begin(), seq_vec.end(), ptr);
+                // Sort the indices (not the keys); stable_sort keeps equal keys in input order
+                comp_ptr = ikey_ptr + ikeyOffset;
+                std::stable_sort(ptr, ptr + ikey.dims()[0], comparator);
+                // Gather keys and values through the sorted indices
+                for (dim_t i = 0; i < oval.dims()[0]; ++i){
+                    uint sortIdx = oidx_ptr[oidxOffset + i];
+                    okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx];
+                    oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx];
+                }
+            }
+        }
+    }
+
+    return;
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/sort_index.hpp b/src/backend/cpu/kernel/sort_index.hpp
new file mode 100644
index 0000000000..b71cc47071
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_index.hpp
@@ -0,0 +1,71 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cpu
+{
+namespace kernel
+{
+// Stable-sorts each dim-0 run of `in`; writes the sorted values to `val` and the source positions to `idx`.
+template
+void sort0_index(Array val, Array idx, const Array in)
+{
+    // Raw pointers into both outputs and the input
+    uint *idx_ptr = idx.get();
+    T *val_ptr = val.get();
+    const T *in_ptr = in.get();
+    function op = std::greater();
+    if(isAscending) { op = std::less(); }
+    // seq_vec = 0, 1, ..., dims()[0]-1: the identity permutation copied into every run before sorting
+    std::vector seq_vec(idx.dims()[0]);
+    std::iota(seq_vec.begin(), seq_vec.end(), 0);
+    // The comparator ranks two indices by the input values they refer to; comp_ptr is captured by reference
+    const T *comp_ptr = nullptr;
+    auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
+
+    for(dim_t w = 0; w < in.dims()[3]; w++) {
+        dim_t valW = w * val.strides()[3];
+        dim_t idxW = w * idx.strides()[3];
+        dim_t inW = w * in.strides()[3];
+        for(dim_t z = 0; z < in.dims()[2]; z++) {
+            dim_t valWZ = valW + z * val.strides()[2];
+            dim_t idxWZ = idxW + z * idx.strides()[2];
+            dim_t inWZ = inW + z * in.strides()[2];
+            for(dim_t y = 0; y < in.dims()[1]; y++) {
+
+                dim_t valOffset = valWZ + y * val.strides()[1];
+                dim_t idxOffset = idxWZ + y * idx.strides()[1];
+                dim_t inOffset = inWZ + y * in.strides()[1];
+                // Seed this run of idx with 0..dims()[0]-1
+                uint *ptr = idx_ptr + idxOffset;
+                std::copy(seq_vec.begin(), seq_vec.end(), ptr);
+                // Sort the indices by the values they point at; ties keep their input order
+                comp_ptr = in_ptr + inOffset;
+                std::stable_sort(ptr, ptr + in.dims()[0], comparator);
+                // Gather the values through the sorted indices
+                for (dim_t i = 0; i < val.dims()[0]; ++i){
+                    val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]];
+                }
+            }
+        }
+    }
+
+    return;
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/susan.hpp b/src/backend/cpu/kernel/susan.hpp
new file mode 100644
index 0000000000..f543967799
--- /dev/null
+++ b/src/backend/cpu/kernel/susan.hpp
@@ -0,0 +1,99 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include
+#include
+
+namespace cpu
+{
+namespace kernel
+{
+// Per-pixel response: for each pixel at least border_len from every edge, writes g - nM when nM < g, else 0.
+template
+void susan_responses(Array output, const Array input,
+                     const unsigned idim0, const unsigned idim1,
+                     const int radius, const float t, const float g,
+                     const unsigned border_len)
+{
+    T* resp_out = output.get();
+    const T* in = input.get();
+    // idim0 is used as the row pitch: pixel (x, y) lives at in[y * idim0 + x]
+    const unsigned r = border_len;
+    const int rSqrd = radius*radius;
+    // Pixels closer than r to an edge are skipped; their entries in output are not written here
+    for (unsigned y = r; y < idim1 - r; ++y) { // NOTE(review): unsigned bound wraps if idim1 < r — confirm callers guarantee idim1 >= r
+        for (unsigned x = r; x < idim0 - r; ++x) {
+            const unsigned idx = y * idim0 + x;
+            T m_0 = in[idx];
+            float nM = 0.0f;
+            // nM accumulates exp(-((m - m_0)/t)^6) over offsets with i*i + j*j < radius*radius
+            for (int i=-radius; i<=radius; ++i) {
+                for (int j=-radius; j<=radius; ++j) {
+                    if (i*i + j*j < rSqrd) {
+                        int p = x + i; // assumes border_len is large enough that p and q stay inside the image — TODO confirm
+                        int q = y + j;
+                        T m = in[p + idim0 * q];
+                        float exp_pow = std::pow((m - m_0)/t, 6.0);
+                        float cM = std::exp(-exp_pow);
+                        nM += cM;
+                    }
+                }
+            }
+            // Keep the margin below the threshold g; sums at or above g produce a zero response
+            resp_out[idx] = nM < g ? g - nM : T(0);
+        }
+    }
+}
+// Non-maximal suppression: appends (x, y, response) for pixels whose response strictly exceeds all 8 neighbours.
+template
+void non_maximal(Array xcoords, Array ycoords, Array response,
+                 shared_ptr counter, const unsigned idim0, const unsigned idim1,
+                 const Array input, const unsigned border_len, const unsigned max_corners)
+{
+    float* x_out = xcoords.get();
+    float* y_out = ycoords.get();
+    float* resp_out = response.get();
+    unsigned* count = counter.get();
+    const T* resp_in= input.get();
+    // *count is incremented for every maximum found, even past max_corners; only the stores are capped
+    // Responses on the border don't have 8-neighbors to compare, discard them
+    const unsigned r = border_len + 1;
+
+    for (unsigned y = r; y < idim1 - r; y++) {
+        for (unsigned x = r; x < idim0 - r; x++) {
+            const T v = resp_in[y * idim0 + x];
+
+            // Find maximum neighborhood response
+            T max_v;
+            max_v = max(resp_in[(y-1) * idim0 + x-1], resp_in[y * idim0 + x-1]);
+            max_v = max(max_v, resp_in[(y+1) * idim0 + x-1]);
+            max_v = max(max_v, resp_in[(y-1) * idim0 + x  ]);
+            max_v = max(max_v, resp_in[(y+1) * idim0 + x  ]);
+            max_v = max(max_v, resp_in[(y-1) * idim0 + x+1]);
+            max_v = max(max_v, resp_in[(y) * idim0 + x+1]);
+            max_v = max(max_v, resp_in[(y+1) * idim0 + x+1]);
+
+            // Stores the corner to {x,y,resp}_out if its response is strictly greater
+            // than every response in its 8-neighborhood (ties are not kept)
+            if (v > max_v) {
+                const unsigned idx = *count;
+                *count += 1;
+                if (idx < max_corners) {
+                    x_out[idx] = (float)x;
+                    y_out[idx] = (float)y;
+                    resp_out[idx] = (float)v;
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/tile.hpp b/src/backend/cpu/kernel/tile.hpp new file mode 100644 index 0000000000..3ad3009041 --- /dev/null +++ b/src/backend/cpu/kernel/tile.hpp @@ -0,0 +1,55 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void tile(Array out, const Array in) +{ + + T* outPtr = out.get(); + const T* inPtr = in.get(); + + const af::dim4 iDims = in.dims(); + const af::dim4 oDims = out.dims(); + const af::dim4 ist = in.strides(); + const af::dim4 ost = out.strides(); + + for(dim_t ow = 0; ow < oDims[3]; ow++) { + const dim_t iw = ow % iDims[3]; + const dim_t iW = iw * ist[3]; + const dim_t oW = ow * ost[3]; + for(dim_t oz = 0; oz < oDims[2]; oz++) { + const dim_t iz = oz % iDims[2]; + const dim_t iZW = iW + iz * ist[2]; + const dim_t oZW = oW + oz * ost[2]; + for(dim_t oy = 0; oy < oDims[1]; oy++) { + const dim_t iy = oy % iDims[1]; + const dim_t iYZW = iZW + iy * ist[1]; + const dim_t oYZW = oZW + oy * ost[1]; + for(dim_t ox = 0; ox < oDims[0]; ox++) { + const dim_t ix = ox % iDims[0]; + const dim_t iMem = iYZW + ix; + const dim_t oMem = oYZW + ox; + outPtr[oMem] = inPtr[iMem]; + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/transform.hpp b/src/backend/cpu/kernel/transform.hpp new file mode 100644 index 0000000000..2311e4efaa --- /dev/null +++ b/src/backend/cpu/kernel/transform.hpp @@ -0,0 +1,131 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void calc_transform_inverse(T *txo, const T *txi, const bool perspective) +{ + if (perspective) { + txo[0] = txi[4]*txi[8] - txi[5]*txi[7]; + txo[1] = -(txi[1]*txi[8] - txi[2]*txi[7]); + txo[2] = txi[1]*txi[5] - txi[2]*txi[4]; + + txo[3] = -(txi[3]*txi[8] - txi[5]*txi[6]); + txo[4] = txi[0]*txi[8] - txi[2]*txi[6]; + txo[5] = -(txi[0]*txi[5] - txi[2]*txi[3]); + + txo[6] = txi[3]*txi[7] - txi[4]*txi[6]; + txo[7] = -(txi[0]*txi[7] - txi[1]*txi[6]); + txo[8] = txi[0]*txi[4] - txi[1]*txi[3]; + + T det = txi[0]*txo[0] + txi[1]*txo[3] + txi[2]*txo[6]; + + txo[0] /= det; txo[1] /= det; txo[2] /= det; + txo[3] /= det; txo[4] /= det; txo[5] /= det; + txo[6] /= det; txo[7] /= det; txo[8] /= det; + } + else { + T det = txi[0]*txi[4] - txi[1]*txi[3]; + + txo[0] = txi[4] / det; + txo[1] = txi[3] / det; + txo[3] = txi[1] / det; + txo[4] = txi[0] / det; + + txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; + txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; + } +} + +template +void calc_transform_inverse(T *tmat, const T *tmat_ptr, const bool inverse, + const bool perspective, const unsigned transf_len) +{ + // The way kernel is structured, it expects an inverse + // transform matrix by default. 
+ // If it is an forward transform, then we need its inverse + if(inverse) { + for(int i = 0; i < (int)transf_len; i++) + tmat[i] = tmat_ptr[i]; + } else { + calc_transform_inverse(tmat, tmat_ptr, perspective); + } +} + +template +void transform(Array output, const Array input, + const Array transform, const bool inverse, + const bool perspective) +{ + const af::dim4 idims = input.dims(); + const af::dim4 odims = output.dims(); + const af::dim4 istrides = input.strides(); + const af::dim4 ostrides = output.strides(); + + T * out = output.get(); + const T * in = input.get(); + const float* tf = transform.get(); + + dim_t nimages = idims[2]; + // Multiplied in src/backend/transform.cpp + dim_t ntransforms = odims[2] / idims[2]; + + void (*t_fn)(T *, const T *, const float *, const af::dim4 &, + const af::dim4 &, const af::dim4 &, + const dim_t, const dim_t, const dim_t, const dim_t, + const bool); + + switch(method) { + case AF_INTERP_NEAREST: + t_fn = &transform_n; + break; + case AF_INTERP_BILINEAR: + t_fn = &transform_b; + break; + case AF_INTERP_LOWER: + t_fn = &transform_l; + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; + } + + const int transf_len = (perspective) ? 
9 : 6; + + // For each transform channel + for(int t_idx = 0; t_idx < (int)ntransforms; t_idx++) { + // Compute inverse if required + const float *tmat_ptr = tf + t_idx * transf_len; + float* tmat = new float[transf_len]; + calc_transform_inverse(tmat, tmat_ptr, inverse, perspective, transf_len); + + // Offset for output pointer + dim_t o_offset = t_idx * nimages * ostrides[2]; + + // Do transform for image + for(int yy = 0; yy < (int)odims[1]; yy++) { + for(int xx = 0; xx < (int)odims[0]; xx++) { + t_fn(out, in, tmat, idims, ostrides, istrides, nimages, o_offset, xx, yy, perspective); + } + } + delete[] tmat; + } +} + +} +} diff --git a/src/backend/cpu/kernel/transpose.hpp b/src/backend/cpu/kernel/transpose.hpp new file mode 100644 index 0000000000..576de873ed --- /dev/null +++ b/src/backend/cpu/kernel/transpose.hpp @@ -0,0 +1,122 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +T getConjugate(const T &in) +{ + // For non-complex types return same + return in; +} + +template<> +cfloat getConjugate(const cfloat &in) +{ + return std::conj(in); +} + +template<> +cdouble getConjugate(const cdouble &in) +{ + return std::conj(in); +} + +template +void transpose(Array output, const Array input) +{ + const dim4 odims = output.dims(); + const dim4 ostrides = output.strides(); + const dim4 istrides = input.strides(); + + T * out = output.get(); + T const * const in = input.get(); + + for (dim_t l = 0; l < odims[3]; ++l) { + for (dim_t k = 0; k < odims[2]; ++k) { + // Outermost loop handles batch mode + // if input has no data along third dimension + // this loop runs only once + for (dim_t j = 0; j < odims[1]; ++j) { + for (dim_t i = 0; i < odims[0]; ++i) { + // calculate array indices based on offsets and strides + // the helper getIdx takes care of indices + const dim_t inIdx = getIdx(istrides,j,i,k,l); + const dim_t outIdx = getIdx(ostrides,i,j,k,l); + if(conjugate) + out[outIdx] = getConjugate(in[inIdx]); + else + out[outIdx] = in[inIdx]; + } + } + // outData and inData pointers doesn't need to be + // offset as the getIdx function is taking care + // of the batch parameter + } + } +} + +template +void transpose(Array out, const Array in, const bool conjugate) +{ + return (conjugate ? 
transpose(out, in) : transpose(out, in)); +} + +template +void transpose_inplace(Array input) +{ + const dim4 idims = input.dims(); + const dim4 istrides = input.strides(); + + T * in = input.get(); + + for (dim_t l = 0; l < idims[3]; ++l) { + for (dim_t k = 0; k < idims[2]; ++k) { + // Outermost loop handles batch mode + // if input has no data along third dimension + // this loop runs only once + // + // Run only bottom triangle. std::swap swaps with upper triangle + for (dim_t j = 0; j < idims[1]; ++j) { + for (dim_t i = j + 1; i < idims[0]; ++i) { + // calculate array indices based on offsets and strides + // the helper getIdx takes care of indices + const dim_t iIdx = getIdx(istrides,j,i,k,l); + const dim_t oIdx = getIdx(istrides,i,j,k,l); + if(conjugate) { + in[iIdx] = getConjugate(in[iIdx]); + in[oIdx] = getConjugate(in[oIdx]); + std::swap(in[iIdx], in[oIdx]); + } + else { + std::swap(in[iIdx], in[oIdx]); + } + } + } + } + } +} + +template +void transpose_inplace(Array in, const bool conjugate) +{ + return (conjugate ? transpose_inplace(in) : transpose_inplace(in)); +} + +} +} diff --git a/src/backend/cpu/kernel/triangle.hpp b/src/backend/cpu/kernel/triangle.hpp new file mode 100644 index 0000000000..7059de5981 --- /dev/null +++ b/src/backend/cpu/kernel/triangle.hpp @@ -0,0 +1,61 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void triangle(Array out, const Array in) +{ + T *o = out.get(); + const T *i = in.get(); + + af::dim4 odm = out.dims(); + + af::dim4 ost = out.strides(); + af::dim4 ist = in.strides(); + + for(dim_t ow = 0; ow < odm[3]; ow++) { + const dim_t oW = ow * ost[3]; + const dim_t iW = ow * ist[3]; + + for(dim_t oz = 0; oz < odm[2]; oz++) { + const dim_t oZW = oW + oz * ost[2]; + const dim_t iZW = iW + oz * ist[2]; + + for(dim_t oy = 0; oy < odm[1]; oy++) { + const dim_t oYZW = oZW + oy * ost[1]; + const dim_t iYZW = iZW + oy * ist[1]; + + for(dim_t ox = 0; ox < odm[0]; ox++) { + const dim_t oMem = oYZW + ox; + const dim_t iMem = iYZW + ox; + + bool cond = is_upper ? (oy >= ox) : (oy <= ox); + bool do_unit_diag = (is_unit_diag && ox == oy); + if(cond) { + o[oMem] = do_unit_diag ? scalar(1) : i[iMem]; + } else { + o[oMem] = scalar(0); + } + + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/unwrap.hpp b/src/backend/cpu/kernel/unwrap.hpp new file mode 100644 index 0000000000..1d996ff1f3 --- /dev/null +++ b/src/backend/cpu/kernel/unwrap.hpp @@ -0,0 +1,81 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include
+#include
+#include
+
+namespace cpu
+{
+namespace kernel
+{
+// Copies each wx-by-wy window of `in` (steps sx/sy, zero padding px/py) into one line of `out` along dimension d.
+template
+void unwrap_dim(Array out, const Array in, const dim_t wx, const dim_t wy,
+                const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+{
+    const T *inPtr = in.get();
+    T *outPtr = out.get();
+
+    af::dim4 idims    = in.dims();
+    af::dim4 odims    = out.dims();
+    af::dim4 istrides = in.strides();
+    af::dim4 ostrides = out.strides();
+    // nx = number of window positions along dimension 0 of the padded input
+    dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
+
+    for(dim_t w = 0; w < odims[3]; w++) {
+        for(dim_t z = 0; z < odims[2]; z++) {
+            // Offsets of slice (z, w) in the output and the input
+            dim_t cOut = w * ostrides[3] + z * ostrides[2];
+            dim_t cIn  = w * istrides[3] + z * istrides[2];
+            const T* iptr = inPtr  + cIn;
+            T* optr_= outPtr + cOut;
+            // One iteration per window; `col` is the window's linear index
+            for(dim_t col = 0; col < odims[d]; col++) {
+                // Offset output ptr
+                T* optr = optr_ + col * ostrides[d];
+
+                // Calculate input window index
+                dim_t winy = (col / nx);
+                dim_t winx = (col % nx);
+                // Top-left corner of the window in padded coordinates
+                dim_t startx = winx * sx;
+                dim_t starty = winy * sy;
+                // Shifted by the padding into input coordinates; may be negative
+                dim_t spx = startx - px;
+                dim_t spy = starty - py;
+
+                // Short cut condition ensuring all values within input dimensions
+                bool cond = (spx >= 0 && spx + wx < idims[0] && spy >= 0 && spy + wy < idims[1]);
+
+                for(dim_t y = 0; y < wy; y++) {
+                    for(dim_t x = 0; x < wx; x++) {
+                        dim_t xpad = spx + x;
+                        dim_t ypad = spy + y;
+                        // Position of (x, y) inside the flattened window; spread by ostrides[1] when d == 0
+                        dim_t oloc = (y * wx + x);
+                        if (d == 0) oloc *= ostrides[1];
+                        // Taps that fall outside the input read as zero (the padding)
+                        if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) {
+                            dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]);
+                            optr[oloc] = iptr[iloc];
+                        } else {
+                            optr[oloc] = scalar(0.0);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/wrap.hpp b/src/backend/cpu/kernel/wrap.hpp new file mode 100644 index 0000000000..70be3ad652 --- /dev/null +++ b/src/backend/cpu/kernel/wrap.hpp @@ -0,0 +1,80 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include
+#include
+#include
+
+namespace cpu
+{
+namespace kernel
+{
+// Adds each line of `in` along dimension d back into its wx-by-wy window of `out`; overlapping windows accumulate.
+template
+void wrap_dim(Array out, const Array in, const dim_t wx, const dim_t wy,
+              const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
+{
+    const T *inPtr = in.get();
+    T *outPtr = out.get();
+
+    af::dim4 idims    = in.dims();
+    af::dim4 odims    = out.dims();
+    af::dim4 istrides = in.strides();
+    af::dim4 ostrides = out.strides();
+    // nx = number of window positions along dimension 0 of the padded output
+    dim_t nx = (odims[0] + 2 * px - wx) / sx + 1;
+
+    for(dim_t w = 0; w < idims[3]; w++) {
+        for(dim_t z = 0; z < idims[2]; z++) {
+            // Offsets of slice (z, w) in the input and the output
+            dim_t cIn  = w * istrides[3] + z * istrides[2];
+            dim_t cOut = w * ostrides[3] + z * ostrides[2];
+            const T* iptr_ = inPtr  + cIn;
+            T* optr= outPtr + cOut;
+            // One iteration per window; `col` is the window's linear index
+            for(dim_t col = 0; col < idims[d]; col++) {
+                // Offset input ptr
+                const T* iptr = iptr_ + col * istrides[d];
+
+                // Calculate output window index
+                dim_t winy = (col / nx);
+                dim_t winx = (col % nx);
+                // Top-left corner of the window in padded coordinates
+                dim_t startx = winx * sx;
+                dim_t starty = winy * sy;
+                // Shifted by the padding into output coordinates; may be negative
+                dim_t spx = startx - px;
+                dim_t spy = starty - py;
+
+                // Short cut condition ensuring all values within output dimensions
+                bool cond = (spx >= 0 && spx + wx < odims[0] && spy >= 0 && spy + wy < odims[1]);
+
+                for(dim_t y = 0; y < wy; y++) {
+                    for(dim_t x = 0; x < wx; x++) {
+                        dim_t xpad = spx + x;
+                        dim_t ypad = spy + y;
+                        // Position of (x, y) inside the flattened window; spread by istrides[1] when d == 0
+                        dim_t iloc = (y * wx + x);
+                        if (d == 0) iloc *= istrides[1];
+                        // Taps that land in the padding are dropped; presumably `out` is zeroed by the caller — TODO confirm
+                        if(cond || (xpad >= 0 && xpad < odims[0] && ypad >= 0 && ypad < odims[1])) {
+                            dim_t oloc = (ypad * ostrides[1] + xpad * ostrides[0]);
+                            // FIXME: When using threads, atomize this
+                            optr[oloc] += iptr[iloc];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git
a/src/backend/cpu/lapack_helper.hpp b/src/backend/cpu/lapack_helper.hpp index f978ecb92b..c5ed4fa83f 100644 --- a/src/backend/cpu/lapack_helper.hpp +++ b/src/backend/cpu/lapack_helper.hpp @@ -17,17 +17,17 @@ #define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR #define LAPACK_NAME(fn) LAPACKE_##fn -#ifdef __APPLE__ -#include -#include -#undef AF_LAPACK_COL_MAJOR -#define AF_LAPACK_COL_MAJOR 0 -#else #ifdef USE_MKL -#include -#else // NETLIB LAPACKE -#include -#endif + #include +#else + #ifdef __APPLE__ + #include + #include + #undef AF_LAPACK_COL_MAJOR + #define AF_LAPACK_COL_MAJOR 0 + #else // NETLIB LAPACKE + #include + #endif #endif #endif diff --git a/src/backend/cpu/lookup.cpp b/src/backend/cpu/lookup.cpp index 128cc02823..1e09f4dd48 100644 --- a/src/backend/cpu/lookup.cpp +++ b/src/backend/cpu/lookup.cpp @@ -8,33 +8,21 @@ ********************************************************/ #include -#include #include +#include +#include +#include namespace cpu { -static inline -dim_t trimIndex(int idx, const dim_t &len) -{ - int ret_val = idx; - int offset = abs(ret_val)%len; - if (ret_val<0) { - ret_val = offset-1; - } else if (ret_val>=len) { - ret_val = len-offset-1; - } - return ret_val; -} - template Array lookup(const Array &input, const Array &indices, const unsigned dim) { - const dim4 iDims = input.dims(); - const dim4 iStrides = input.strides(); + input.eval(); + indices.eval(); - const in_t *inPtr = input.get(); - const idx_t *idxPtr = indices.get(); + const dim4 iDims = input.dims(); dim4 oDims(1); for (int d=0; d<4; ++d) @@ -42,35 +30,7 @@ Array lookup(const Array &input, const Array &indices, const Array out = createEmptyArray(oDims); - dim4 oStrides = out.strides(); - - in_t *outPtr = out.get(); - - for (dim_t l=0; l, out, input, indices, dim); return out; } diff --git a/src/backend/cpu/lu.cpp b/src/backend/cpu/lu.cpp index 0eefb16816..24ca4acd78 100644 --- a/src/backend/cpu/lu.cpp +++ b/src/backend/cpu/lu.cpp @@ -11,23 +11,21 @@ #include #if 
defined(WITH_CPU_LINEAR_ALGEBRA) - #include #include #include #include -#include - #include #include +#include +#include +#include namespace cpu { template -using getrf_func_def = int (*)(ORDER_TYPE, int, int, - T*, int, - int*); +using getrf_func_def = int (*)(ORDER_TYPE, int, int, T*, int, int*); #define LU_FUNC_DEF( FUNC ) \ template FUNC##_func_def FUNC##_func(); @@ -43,78 +41,14 @@ LU_FUNC(getrf , double , d) LU_FUNC(getrf , cfloat , c) LU_FUNC(getrf , cdouble, z) -template -void lu_split(Array &lower, Array &upper, const Array &in) -{ - T *l = lower.get(); - T *u = upper.get(); - const T *i = in.get(); - - dim4 ldm = lower.dims(); - dim4 udm = upper.dims(); - dim4 idm = in.dims(); - - dim4 lst = lower.strides(); - dim4 ust = upper.strides(); - dim4 ist = in.strides(); - - for(dim_t ow = 0; ow < idm[3]; ow++) { - const dim_t lW = ow * lst[3]; - const dim_t uW = ow * ust[3]; - const dim_t iW = ow * ist[3]; - - for(dim_t oz = 0; oz < idm[2]; oz++) { - const dim_t lZW = lW + oz * lst[2]; - const dim_t uZW = uW + oz * ust[2]; - const dim_t iZW = iW + oz * ist[2]; - - for(dim_t oy = 0; oy < idm[1]; oy++) { - const dim_t lYZW = lZW + oy * lst[1]; - const dim_t uYZW = uZW + oy * ust[1]; - const dim_t iYZW = iZW + oy * ist[1]; - - for(dim_t ox = 0; ox < idm[0]; ox++) { - const dim_t lMem = lYZW + ox; - const dim_t uMem = uYZW + ox; - const dim_t iMem = iYZW + ox; - if(ox > oy) { - if(oy < ldm[1]) - l[lMem] = i[iMem]; - if(ox < udm[0]) - u[uMem] = scalar(0); - } else if (oy > ox) { - if(oy < ldm[1]) - l[lMem] = scalar(0); - if(ox < udm[0]) - u[uMem] = i[iMem]; - } else if(ox == oy) { - if(oy < ldm[1]) - l[lMem] = scalar(1.0); - if(ox < udm[0]) - u[uMem] = i[iMem]; - } - } - } - } - } -} - -void convertPivot(Array &pivot, int out_sz) -{ - Array p = range(dim4(out_sz), 0); - int *d_pi = pivot.get(); - int *d_po = p.get(); - dim_t d0 = pivot.dims()[0]; - for(int j = 0; j < (int)d0; j++) { - // 1 indexed in pivot - std::swap(d_po[j], d_po[d_pi[j] - 1]); - } - pivot = p; -} 
- template void lu(Array &lower, Array &upper, Array &pivot, const Array &in) { + lower.eval(); + upper.eval(); + pivot.eval(); + in.eval(); + dim4 iDims = in.dims(); int M = iDims[0]; int N = iDims[1]; @@ -128,35 +62,36 @@ void lu(Array &lower, Array &upper, Array &pivot, const Array &in) lower = createEmptyArray(ldims); upper = createEmptyArray(udims); - lu_split(lower, upper, in_copy); + getQueue().enqueue(kernel::lu_split, lower, upper, in_copy); } template Array lu_inplace(Array &in, const bool convert_pivot) { - dim4 iDims = in.dims(); - int M = iDims[0]; - int N = iDims[1]; - - Array pivot = createEmptyArray(af::dim4(min(M, N), 1, 1, 1)); + in.eval(); - getrf_func()(AF_LAPACK_COL_MAJOR, M, N, - in.get(), in.strides()[1], - pivot.get()); - - if(convert_pivot) convertPivot(pivot, M); - - return pivot; + dim4 iDims = in.dims(); + Array pivot = createEmptyArray(af::dim4(min(iDims[0], iDims[1]), 1, 1, 1)); + + auto func = [=] (Array in, Array pivot) { + dim4 iDims = in.dims(); + getrf_func()(AF_LAPACK_COL_MAJOR, iDims[0], iDims[1], in.get(), in.strides()[1], pivot.get()); + }; + getQueue().enqueue(func, in, pivot); + + if(convert_pivot) { + Array p = range(dim4(iDims[0]), 0); + getQueue().enqueue(kernel::convertPivot, p, pivot); + return p; + } else { + return pivot; + } } -#define INSTANTIATE_LU(T) \ - template Array lu_inplace(Array &in, const bool convert_pivot); \ - template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); - -INSTANTIATE_LU(float) -INSTANTIATE_LU(cfloat) -INSTANTIATE_LU(double) -INSTANTIATE_LU(cdouble) +bool isLAPACKAvailable() +{ + return true; +} } @@ -177,6 +112,18 @@ Array lu_inplace(Array &in, const bool convert_pivot) AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED); } +bool isLAPACKAvailable() +{ + return false; +} + +} + +#endif + +namespace cpu +{ + #define INSTANTIATE_LU(T) \ template Array lu_inplace(Array &in, const bool convert_pivot); \ template void lu(Array &lower, Array &upper, Array 
&pivot, const Array &in); @@ -187,5 +134,3 @@ INSTANTIATE_LU(double) INSTANTIATE_LU(cdouble) } - -#endif diff --git a/src/backend/cpu/lu.hpp b/src/backend/cpu/lu.hpp index c25dcaaa16..3fef461067 100644 --- a/src/backend/cpu/lu.hpp +++ b/src/backend/cpu/lu.hpp @@ -17,4 +17,6 @@ namespace cpu template Array lu_inplace(Array &in, const bool convert_pivot = true); + + bool isLAPACKAvailable(); } diff --git a/src/backend/cpu/match_template.cpp b/src/backend/cpu/match_template.cpp index 4d930145d5..58091a1f49 100644 --- a/src/backend/cpu/match_template.cpp +++ b/src/backend/cpu/match_template.cpp @@ -12,132 +12,24 @@ #include #include #include -#include +#include +#include +#include using af::dim4; namespace cpu { -template -Array match_template(const Array &sImg, const Array &tImg) +template +Array match_template(const Array &sImg, const Array &tImg) { - const dim4 sDims = sImg.dims(); - const dim4 tDims = tImg.dims(); - const dim4 sStrides = sImg.strides(); - const dim4 tStrides = tImg.strides(); + sImg.eval(); + tImg.eval(); - const dim_t tDim0 = tDims[0]; - const dim_t tDim1 = tDims[1]; - const dim_t sDim0 = sDims[0]; - const dim_t sDim1 = sDims[1]; + Array out = createEmptyArray(sImg.dims()); - Array out = createEmptyArray(sDims); - const dim4 oStrides = out.strides(); - - outType tImgMean = outType(0); - dim_t winNumElements = tImg.elements(); - bool needMean = mType==AF_ZSAD || mType==AF_LSAD || - mType==AF_ZSSD || mType==AF_LSSD || - mType==AF_ZNCC; - const inType * tpl = tImg.get(); - - if (needMean) { - for(dim_t tj=0; tj, out, sImg, tImg); return out; } diff --git a/src/backend/cpu/math.cpp b/src/backend/cpu/math.cpp index 5a6bcbc67e..e00fd78fcd 100644 --- a/src/backend/cpu/math.cpp +++ b/src/backend/cpu/math.cpp @@ -11,39 +11,41 @@ namespace cpu { - uint abs(uint val) { return val; } - uchar abs(uchar val) { return val; } - uintl abs(uintl val) { return val; } - - cfloat scalar(float val) - { - cfloat cval = {(float)val, 0}; - return cval; - } - - cdouble 
scalar(double val) - { - cdouble cval = {val, 0}; - return cval; - } - - cfloat min(cfloat lhs, cfloat rhs) - { - return abs(lhs) < abs(rhs) ? lhs : rhs; - } - - cdouble min(cdouble lhs, cdouble rhs) - { - return abs(lhs) < abs(rhs) ? lhs : rhs; - } - - cfloat max(cfloat lhs, cfloat rhs) - { - return abs(lhs) > abs(rhs) ? lhs : rhs; - } - - cdouble max(cdouble lhs, cdouble rhs) - { - return abs(lhs) > abs(rhs) ? lhs : rhs; - } + +uint abs(uint val) { return val; } +uchar abs(uchar val) { return val; } +uintl abs(uintl val) { return val; } + +cfloat scalar(float val) +{ + cfloat cval = {(float)val, 0}; + return cval; +} + +cdouble scalar(double val) +{ + cdouble cval = {val, 0}; + return cval; +} + +cfloat min(cfloat lhs, cfloat rhs) +{ + return abs(lhs) < abs(rhs) ? lhs : rhs; +} + +cdouble min(cdouble lhs, cdouble rhs) +{ + return abs(lhs) < abs(rhs) ? lhs : rhs; +} + +cfloat max(cfloat lhs, cfloat rhs) +{ + return abs(lhs) > abs(rhs) ? lhs : rhs; +} + +cdouble max(cdouble lhs, cdouble rhs) +{ + return abs(lhs) > abs(rhs) ? lhs : rhs; +} + } diff --git a/src/backend/cpu/meanshift.cpp b/src/backend/cpu/meanshift.cpp index b52eaf9387..b5bbf758a1 100644 --- a/src/backend/cpu/meanshift.cpp +++ b/src/backend/cpu/meanshift.cpp @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include using af::dim4; using std::vector; @@ -23,125 +26,15 @@ using std::vector; namespace cpu { -inline dim_t clamp(dim_t a, dim_t mn, dim_t mx) -{ - return (amx ? mx : a)); -} - template Array meanshift(const Array &in, const float &s_sigma, const float &c_sigma, const unsigned iter) { - const dim4 dims = in.dims(); - const dim4 istrides = in.strides(); - Array out = createEmptyArray(dims); - const dim4 ostrides = out.strides(); - - const dim_t bCount = (is_color ? 1 : dims[2]); - const dim_t channels = (is_color ? 
dims[2] : 1); - - // clamp spatical and chromatic sigma's - float space_ = std::min(11.5f, s_sigma); - const dim_t radius = std::max((int)(space_ * 1.5f), 1); - const float cvar = c_sigma*c_sigma; - - vector means; - vector centers; - vector tmpclrs; - means.reserve(channels); - centers.reserve(channels); - tmpclrs.reserve(channels); - - T *outData = out.get(); - const T * inData = in.get(); - - for(dim_t b3=0; b31 - // i.e for color images where batch is along fourth dimension - centers[ch] = inData[j_in_off + i_in_off + ch*istrides[2]]; - } - - // scope of meanshift iterationd begin - for(unsigned it=0; it out = createEmptyArray(in.dims()); - for(dim_t ch=0; ch, out, in, s_sigma, c_sigma, iter); - } - } - outData += ostrides[2]; - inData += istrides[2]; - } - } return out; } diff --git a/src/backend/cpu/medfilt.cpp b/src/backend/cpu/medfilt.cpp index 3ded3c045a..8ae4e33921 100644 --- a/src/backend/cpu/medfilt.cpp +++ b/src/backend/cpu/medfilt.cpp @@ -12,8 +12,9 @@ #include #include #include -#include -#include +#include +#include +#include using af::dim4; @@ -23,114 +24,11 @@ namespace cpu template Array medfilt(const Array &in, dim_t w_len, dim_t w_wid) { - const dim4 dims = in.dims(); - const dim4 istrides = in.strides(); - Array out = createEmptyArray(dims); - const dim4 ostrides = out.strides(); + in.eval(); - std::vector wind_vals; - wind_vals.reserve(w_len*w_wid); + Array out = createEmptyArray(in.dims()); - T const * in_ptr = in.get(); - T * out_ptr = out.get(); - - for(int b3=0; b3<(int)dims[3]; b3++) { - - for(int b2=0; b2<(int)dims[2]; b2++) { - - for(int col=0; col<(int)dims[1]; col++) { - - int ocol_off = col*ostrides[1]; - - for(int row=0; row<(int)dims[0]; row++) { - - wind_vals.clear(); - - for(int wj=0; wj<(int)w_wid; ++wj) { - - bool isColOff = false; - - int im_col = col + wj-w_wid/2; - int im_coff; - switch(pad) { - case AF_PAD_ZERO: - im_coff = im_col * istrides[1]; - if (im_col < 0 || im_col>=(int)dims[1]) - isColOff = true; - break; - case 
AF_PAD_SYM: - { - if (im_col < 0) { - im_col *= -1; - isColOff = true; - } - - if (im_col>=(int)dims[1]) { - im_col = 2*((int)dims[1]-1) - im_col; - isColOff = true; - } - - im_coff = im_col * istrides[1]; - } - break; - } - - for(int wi=0; wi<(int)w_len; ++wi) { - - bool isRowOff = false; - - int im_row = row + wi-w_len/2; - int im_roff; - switch(pad) { - case AF_PAD_ZERO: - im_roff = im_row * istrides[0]; - if (im_row < 0 || im_row>=(int)dims[0]) - isRowOff = true; - break; - case AF_PAD_SYM: - { - if (im_row < 0) { - im_row *= -1; - isRowOff = true; - } - - if (im_row>=(int)dims[0]) { - im_row = 2*((int)dims[0]-1) - im_row; - isRowOff = true; - } - - im_roff = im_row * istrides[0]; - } - break; - } - - if(isRowOff || isColOff) { - switch(pad) { - case AF_PAD_ZERO: - wind_vals.push_back(0); - break; - case AF_PAD_SYM: - wind_vals.push_back(in_ptr[im_coff+im_roff]); - break; - } - } else - wind_vals.push_back(in_ptr[im_coff+im_roff]); - } - } - - std::stable_sort(wind_vals.begin(),wind_vals.end()); - int off = wind_vals.size()/2; - if (wind_vals.size()%2==0) - out_ptr[ocol_off+row*ostrides[0]] = (wind_vals[off]+wind_vals[off-1])/2; - else { - out_ptr[ocol_off+row*ostrides[0]] = wind_vals[off]; - } - } - } - in_ptr += istrides[2]; - out_ptr += ostrides[2]; - } - } + getQueue().enqueue(kernel::medfilt, out, in, w_len, w_wid); return out; } diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index ac10643c9b..b4b1b450d9 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -10,237 +10,195 @@ #include #include #include -#include -#include -#include -#include +#include +#include +#include +#include -namespace cpu -{ - - static size_t memory_resolution = 1024; //1KB +#ifndef AF_MEM_DEBUG +#define AF_MEM_DEBUG 0 +#endif - void setMemStepSize(size_t step_bytes) - { - memory_resolution = step_bytes; - } +#ifndef AF_CPU_MEM_DEBUG +#define AF_CPU_MEM_DEBUG 0 +#endif - size_t getMemStepSize(void) - { - return memory_resolution; - } 
+namespace cpu +{ - class Manager +class MemoryManager : public common::MemoryManager +{ + int getActiveDeviceId(); + size_t getMaxMemorySize(int id); +public: + MemoryManager(); + void *nativeAlloc(const size_t bytes); + void nativeFree(void *ptr); + ~MemoryManager() { - public: - static bool initialized; - Manager() - { - initialized = true; - } - - ~Manager() - { - garbageCollect(); + common::lock_guard_t lock(this->memory_mutex); + for (int n = 0; n < getDeviceCount(); n++) { + cpu::setDevice(n); + this->garbageCollect(); } - }; - - bool Manager::initialized = false; - - static void managerInit() - { - if(Manager::initialized == false) - static Manager pm = Manager(); } +}; - typedef struct - { - bool is_free; - bool is_unlinked; - size_t bytes; - } mem_info; - - static size_t used_bytes = 0; - static size_t used_buffers = 0; - static size_t total_bytes = 0; - typedef std::map mem_t; - typedef mem_t::iterator mem_iter; - - mem_t memory_map; - std::mutex memory_map_mutex; - - template - void freeWrapper(T *ptr) - { - free((void *)ptr); - } - - void garbageCollect() - { - for(mem_iter iter = memory_map.begin(); - iter != memory_map.end(); ++iter) { - - if ((iter->second).is_free) { - - if (!(iter->second).is_unlinked) { - freeWrapper(iter->first); - total_bytes -= iter->second.bytes; - } - } - } - - mem_iter memory_curr = memory_map.begin(); - mem_iter memory_end = memory_map.end(); +int MemoryManager::getActiveDeviceId() +{ + return cpu::getActiveDeviceId(); +} - while(memory_curr != memory_end) { - if (memory_curr->second.is_free && !memory_curr->second.is_unlinked) { - memory_map.erase(memory_curr++); - } else { - ++memory_curr; - } - } - } +size_t MemoryManager::getMaxMemorySize(int id) +{ + return cpu::getDeviceMemorySize(id); +} - template - T* memAlloc(const size_t &elements) - { - managerInit(); +MemoryManager::MemoryManager() : + common::MemoryManager(getDeviceCount(), common::MAX_BUFFERS, AF_MEM_DEBUG || AF_CPU_MEM_DEBUG) +{ + this->setMaxMemorySize(); 
+} - T* ptr = NULL; - size_t alloc_bytes = divup(sizeof(T) * elements, memory_resolution) * memory_resolution; - if (elements > 0) { - std::lock_guard lock(memory_map_mutex); +void *MemoryManager::nativeAlloc(const size_t bytes) +{ + void *ptr = malloc(bytes); + if (!ptr) AF_ERROR("Unable to allocate memory", AF_ERR_NO_MEM); + return ptr; +} - // FIXME: Add better checks for garbage collection - // Perhaps look at total memory available as a metric - if (memory_map.size() > MAX_BUFFERS || - used_bytes >= MAX_BYTES) { +void MemoryManager::nativeFree(void *ptr) +{ + return free((void *)ptr); +} - garbageCollect(); - } +static MemoryManager &getMemoryManager() +{ + static MemoryManager instance; + return instance; +} - for(mem_iter iter = memory_map.begin(); - iter != memory_map.end(); ++iter) { +void setMemStepSize(size_t step_bytes) +{ + getMemoryManager().setMemStepSize(step_bytes); +} - mem_info info = iter->second; +size_t getMemStepSize(void) +{ + return getMemoryManager().getMemStepSize(); +} - if ( info.is_free && - !info.is_unlinked && - info.bytes == alloc_bytes) { +size_t getMaxBytes() +{ + return getMemoryManager().getMaxBytes(); +} - iter->second.is_free = false; - used_bytes += alloc_bytes; - used_buffers++; - return (T *)iter->first; - } - } +unsigned getMaxBuffers() +{ + return getMemoryManager().getMaxBuffers(); +} - // Perform garbage collection if memory can not be allocated - ptr = (T *)malloc(alloc_bytes); +void garbageCollect() +{ + getMemoryManager().garbageCollect(); +} - if (ptr == NULL) { - AF_ERROR("Can not allocate memory", AF_ERR_NO_MEM); - } +void printMemInfo(const char *msg, const int device) +{ + getMemoryManager().printInfo(msg, device); +} - mem_info info = {false, false, alloc_bytes}; - memory_map[ptr] = info; +template +T* memAlloc(const size_t &elements) +{ + T *ptr = nullptr; - used_bytes += alloc_bytes; - used_buffers++; - total_bytes += alloc_bytes; - } - return ptr; + try { + ptr = (T *)getMemoryManager().alloc(elements * 
sizeof(T), false); + } catch(...) { + getQueue().sync(); + ptr = (T *)getMemoryManager().alloc(elements * sizeof(T), false); } + return ptr; +} - template - void memFree(T *ptr) - { - std::lock_guard lock(memory_map_mutex); - - mem_iter iter = memory_map.find((void *)ptr); - - if (iter != memory_map.end()) { - - iter->second.is_free = true; - if ((iter->second).is_unlinked) return; - - used_bytes -= iter->second.bytes; - used_buffers--; +void* memAllocUser(const size_t &bytes) +{ + void *ptr = nullptr; - } else { - freeWrapper(ptr); // Free it because we are not sure what the size is - } + try { + ptr = getMemoryManager().alloc(bytes, true); + } catch(...) { + getQueue().sync(); + ptr = getMemoryManager().alloc(bytes, true); } + return ptr; +} - template - void memPop(const T *ptr) - { - std::lock_guard lock(memory_map_mutex); +template +void memFree(T *ptr) +{ + return getMemoryManager().unlock((void *)ptr, false); +} - mem_iter iter = memory_map.find((void *)ptr); +void memFreeUser(void *ptr) +{ + getMemoryManager().unlock((void *)ptr, true); +} - if (iter != memory_map.end()) { - iter->second.is_unlinked = true; - } else { - mem_info info = { false, - true, - 100 }; //This number is not relevant +void memLock(const void *ptr) +{ + getMemoryManager().userLock((void *)ptr); +} - memory_map[(void *)ptr] = info; - } - } +void memUnlock(const void *ptr) +{ + getMemoryManager().userUnlock((void *)ptr); +} - template - void memPush(const T *ptr) - { - std::lock_guard lock(memory_map_mutex); - mem_iter iter = memory_map.find((void *)ptr); - if (iter != memory_map.end()) { - iter->second.is_unlinked = false; - } - } +void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, + size_t *lock_bytes, size_t *lock_buffers) +{ + getQueue().sync(); + getMemoryManager().bufferInfo(alloc_bytes, alloc_buffers, + lock_bytes, lock_buffers); +} +template +T* pinnedAlloc(const size_t &elements) +{ + return (T *)getMemoryManager().alloc(elements * sizeof(T), false); +} - void 
deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, - size_t *lock_bytes, size_t *lock_buffers) - { - if (alloc_bytes ) *alloc_bytes = total_bytes; - if (alloc_buffers ) *alloc_buffers = memory_map.size(); - if (lock_bytes ) *lock_bytes = used_bytes; - if (lock_buffers ) *lock_buffers = used_buffers; - } +template +void pinnedFree(T* ptr) +{ + return getMemoryManager().unlock((void *)ptr, false); +} - template - T* pinnedAlloc(const size_t &elements) - { - return memAlloc(elements); - } +bool checkMemoryLimit() +{ + return getMemoryManager().checkMemoryLimit(); +} - template - void pinnedFree(T* ptr) - { - memFree(ptr); - } +#define INSTANTIATE(T) \ + template T* memAlloc(const size_t &elements); \ + template void memFree(T* ptr); \ + template T* pinnedAlloc(const size_t &elements); \ + template void pinnedFree(T* ptr); \ + +INSTANTIATE(float) +INSTANTIATE(cfloat) +INSTANTIATE(double) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(char) +INSTANTIATE(uchar) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(ushort) +INSTANTIATE(short ) -#define INSTANTIATE(T) \ - template T* memAlloc(const size_t &elements); \ - template void memFree(T* ptr); \ - template void memPop(const T* ptr); \ - template void memPush(const T* ptr); \ - template T* pinnedAlloc(const size_t &elements); \ - template void pinnedFree(T* ptr); \ - - INSTANTIATE(float) - INSTANTIATE(cfloat) - INSTANTIATE(double) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(char) - INSTANTIATE(uchar) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(ushort) - INSTANTIATE(short ) } diff --git a/src/backend/cpu/memory.hpp b/src/backend/cpu/memory.hpp index 0b1c960ed4..91116fbcfc 100644 --- a/src/backend/cpu/memory.hpp +++ b/src/backend/cpu/memory.hpp @@ -9,24 +9,35 @@ #pragma once #include + namespace cpu { template T* memAlloc(const size_t &elements); + void *memAllocUser(const size_t &bytes); + + // Need these as 2 separate function and not a 
default argument + // This is because it is used as the deleter in shared pointer + // which cannot support default arguments template void memFree(T* ptr); - template void memPop(const T *ptr); - template void memPush(const T *ptr); + void memFreeUser(void* ptr); + + void memLock(const void *ptr); + void memUnlock(const void *ptr); template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); - static const unsigned MAX_BUFFERS = 100; - static const unsigned MAX_BYTES = 100 * (1 << 20); + size_t getMaxBytes(); + unsigned getMaxBuffers(); void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); void garbageCollect(); void pinnedGarbageCollect(); + void printMemInfo(const char *msg, const int device); + void setMemStepSize(size_t step_bytes); size_t getMemStepSize(void); + bool checkMemoryLimit(); } diff --git a/src/backend/cpu/morph.cpp b/src/backend/cpu/morph.cpp index eb2e1de339..1ae4680b9d 100644 --- a/src/backend/cpu/morph.cpp +++ b/src/backend/cpu/morph.cpp @@ -13,78 +13,24 @@ #include #include #include +#include +#include +#include using af::dim4; namespace cpu { -static inline unsigned getIdx(const dim4 &strides, - int i, int j = 0, int k = 0, int l = 0) -{ - return (l * strides[3] + - k * strides[2] + - j * strides[1] + - i * strides[0]); -} - template Array morph(const Array &in, const Array &mask) { - const dim4 dims = in.dims(); - const dim4 window = mask.dims(); - const dim_t R0 = window[0]/2; - const dim_t R1 = window[1]/2; - const dim4 istrides = in.strides(); - const dim4 fstrides = mask.strides(); - - Array out = createEmptyArray(dims); - const dim4 ostrides = out.strides(); - - T* outData = out.get(); - const T* inData = in.get(); - const T* filter = mask.get(); - - for(dim_t b3=0; b3 (T)0) && offi>=0 && offj>=0 && offi out = createEmptyArray(in.dims()); - } // window 1st dimension loop ends here - } // filter window loop ends here - - outData[ getIdx(ostrides, i, j) ] = 
filterResult; - } //1st dimension loop ends here - } // 2nd dimension loop ends here - - // next iteration will be next batch if any - outData += ostrides[2]; - inData += istrides[2]; - } - } + getQueue().enqueue(kernel::morph, out, in, mask); return out; } @@ -92,66 +38,12 @@ Array morph(const Array &in, const Array &mask) template Array morph3d(const Array &in, const Array &mask) { - const dim4 dims = in.dims(); - const dim4 window = mask.dims(); - const dim_t R0 = window[0]/2; - const dim_t R1 = window[1]/2; - const dim_t R2 = window[2]/2; - const dim4 istrides = in.strides(); - const dim4 fstrides = mask.strides(); - const dim_t bCount = dims[3]; - - Array out = createEmptyArray(dims); - const dim4 ostrides = out.strides(); - - T* outData = out.get(); - const T* inData = in.get(); - const T* filter = mask.get(); - - for(dim_t batchId=0; batchId (T)0) && offi>=0 && offj>=0 && offk>=0 && - offi out = createEmptyArray(in.dims()); - outData[ getIdx(ostrides, i, j, k) ] = filterResult; - } //1st dimension loop ends here - } // 2nd dimension loop ends here - } // 3rd dimension loop ends here - // next iteration will be next batch if any - outData += ostrides[3]; - inData += istrides[3]; - } + getQueue().enqueue(kernel::morph3d, out, in, mask); return out; } diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp index 79d41516e3..17e892f492 100644 --- a/src/backend/cpu/nearest_neighbour.cpp +++ b/src/backend/cpu/nearest_neighbour.cpp @@ -11,157 +11,50 @@ #include #include #include -#include #include +#include +#include +#include using af::dim4; namespace cpu { -#if defined(_WIN32) || defined(_MSC_VER) - -#include -#define __builtin_popcount __popcnt - -#endif - -template -struct dist_op -{ - To operator()(T v1, T v2) - { - return v1 - v2; // Garbage distance - } -}; - template -struct dist_op -{ - To operator()(T v1, T v2) - { - return std::abs((double)v1 - (double)v2); - } -}; - -template -struct dist_op -{ - To operator()(T v1, T 
v2) - { - return (v1 - v2) * (v1 - v2); - } -}; - -template -struct dist_op -{ - To operator()(uint v1, uint v2) - { - return __builtin_popcount(v1 ^ v2); - } -}; - -template -struct dist_op -{ - To operator()(uintl v1, uintl v2) - { - return __builtin_popcount(v1 ^ v2); - } -}; - -template -struct dist_op -{ - To operator()(uchar v1, uchar v2) - { - return __builtin_popcount(v1 ^ v2); - } -}; - -template -struct dist_op -{ - To operator()(ushort v1, ushort v2) - { - return __builtin_popcount(v1 ^ v2); - } -}; - -template -void nearest_neighbour_(Array& idx, Array& dist, - const Array& query, const Array& train, - const uint dist_dim, const uint n_dist) +void nearest_neighbour(Array& idx, Array& dist, + const Array& query, const Array& train, + const uint dist_dim, const uint n_dist, + const af_match_type dist_type) { - uint sample_dim = (dist_dim == 0) ? 1 : 0; - const dim4 qDims = query.dims(); - const dim4 tDims = train.dims(); - if (n_dist > 1) { CPU_NOT_SUPPORTED(); } - const unsigned distLength = qDims[dist_dim]; - const unsigned nQuery = qDims[sample_dim]; - const unsigned nTrain = tDims[sample_dim]; + idx.eval(); + dist.eval(); + query.eval(); + train.eval(); - const dim4 outDims(n_dist, nQuery); + uint sample_dim = (dist_dim == 0) ? 
1 : 0; + const dim4 qDims = query.dims(); + const dim4 outDims(n_dist, qDims[sample_dim]); idx = createEmptyArray(outDims); dist = createEmptyArray(outDims); - const T* qPtr = query.get(); - const T* tPtr = train.get(); - uint* iPtr = idx.get(); - To* dPtr = dist.get(); - - dist_op op; - - for (unsigned i = 0; i < nQuery; i++) { - To best_dist = limit_max(); - unsigned best_idx = 0; - - for (unsigned j = 0; j < nTrain; j++) { - To local_dist = 0; - for (unsigned k = 0; k < distLength; k++) { - size_t qIdx, tIdx; - if (sample_dim == 0) { - qIdx = k * qDims[0] + i; - tIdx = k * tDims[0] + j; - } - else { - qIdx = i * qDims[0] + k; - tIdx = j * tDims[0] + k; - } - - local_dist += op(qPtr[qIdx], tPtr[tIdx]); - } - - if (local_dist < best_dist) { - best_dist = local_dist; - best_idx = j; - } - } - - size_t oIdx; - oIdx = i; - iPtr[oIdx] = best_idx; - dPtr[oIdx] = best_dist; - } -} - -template -void nearest_neighbour(Array& idx, Array& dist, - const Array& query, const Array& train, - const uint dist_dim, const uint n_dist, - const af_match_type dist_type) -{ switch(dist_type) { - case AF_SAD: nearest_neighbour_(idx, dist, query, train, dist_dim, n_dist); break; - case AF_SSD: nearest_neighbour_(idx, dist, query, train, dist_dim, n_dist); break; - case AF_SHD: nearest_neighbour_(idx, dist, query, train, dist_dim, n_dist); break; - default: AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED); + case AF_SAD: + getQueue().enqueue(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); + break; + case AF_SSD: + getQueue().enqueue(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); + break; + case AF_SHD: + getQueue().enqueue(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); + break; + default: + AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED); } } diff --git a/src/backend/cpu/orb.cpp b/src/backend/cpu/orb.cpp index d279ba514f..8bbfd41932 100644 --- a/src/backend/cpu/orb.cpp +++ b/src/backend/cpu/orb.cpp @@ 
-11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -19,520 +18,15 @@ #include #include #include +#include +#include +#include using af::dim4; namespace cpu { -static const float PI_VAL = 3.14159265358979323846f; - -// Reference pattern, generated for a patch size of 31x31, as suggested by -// original ORB paper -#define REF_PAT_SIZE 31 -#define REF_PAT_SAMPLES 256 -#define REF_PAT_COORDS 4 -#define REF_PAT_LENGTH (REF_PAT_SAMPLES*REF_PAT_COORDS) - -// Current reference pattern was borrowed from OpenCV, to build a pattern with -// similar quality, a training process must be applied, as described in -// sections 4.2 and 4.3 of the original ORB paper. -const int ref_pat[REF_PAT_LENGTH] = { - 8,-3, 9,5, - 4,2, 7,-12, - -11,9, -8,2, - 7,-12, 12,-13, - 2,-13, 2,12, - 1,-7, 1,6, - -2,-10, -2,-4, - -13,-13, -11,-8, - -13,-3, -12,-9, - 10,4, 11,9, - -13,-8, -8,-9, - -11,7, -9,12, - 7,7, 12,6, - -4,-5, -3,0, - -13,2, -12,-3, - -9,0, -7,5, - 12,-6, 12,-1, - -3,6, -2,12, - -6,-13, -4,-8, - 11,-13, 12,-8, - 4,7, 5,1, - 5,-3, 10,-3, - 3,-7, 6,12, - -8,-7, -6,-2, - -2,11, -1,-10, - -13,12, -8,10, - -7,3, -5,-3, - -4,2, -3,7, - -10,-12, -6,11, - 5,-12, 6,-7, - 5,-6, 7,-1, - 1,0, 4,-5, - 9,11, 11,-13, - 4,7, 4,12, - 2,-1, 4,4, - -4,-12, -2,7, - -8,-5, -7,-10, - 4,11, 9,12, - 0,-8, 1,-13, - -13,-2, -8,2, - -3,-2, -2,3, - -6,9, -4,-9, - 8,12, 10,7, - 0,9, 1,3, - 7,-5, 11,-10, - -13,-6, -11,0, - 10,7, 12,1, - -6,-3, -6,12, - 10,-9, 12,-4, - -13,8, -8,-12, - -13,0, -8,-4, - 3,3, 7,8, - 5,7, 10,-7, - -1,7, 1,-12, - 3,-10, 5,6, - 2,-4, 3,-10, - -13,0, -13,5, - -13,-7, -12,12, - -13,3, -11,8, - -7,12, -4,7, - 6,-10, 12,8, - -9,-1, -7,-6, - -2,-5, 0,12, - -12,5, -7,5, - 3,-10, 8,-13, - -7,-7, -4,5, - -3,-2, -1,-7, - 2,9, 5,-11, - -11,-13, -5,-13, - -1,6, 0,-1, - 5,-3, 5,2, - -4,-13, -4,12, - -9,-6, -9,6, - -12,-10, -8,-4, - 10,2, 12,-3, - 7,12, 12,12, - -7,-13, -6,5, - -4,9, -3,4, - 7,-1, 12,2, - -7,6, -5,1, - -13,11, -12,5, - -3,7, -2,-6, - 7,-8, 12,-7, - 
-13,-7, -11,-12, - 1,-3, 12,12, - 2,-6, 3,0, - -4,3, -2,-13, - -1,-13, 1,9, - 7,1, 8,-6, - 1,-1, 3,12, - 9,1, 12,6, - -1,-9, -1,3, - -13,-13, -10,5, - 7,7, 10,12, - 12,-5, 12,9, - 6,3, 7,11, - 5,-13, 6,10, - 2,-12, 2,3, - 3,8, 4,-6, - 2,6, 12,-13, - 9,-12, 10,3, - -8,4, -7,9, - -11,12, -4,-6, - 1,12, 2,-8, - 6,-9, 7,-4, - 2,3, 3,-2, - 6,3, 11,0, - 3,-3, 8,-8, - 7,8, 9,3, - -11,-5, -6,-4, - -10,11, -5,10, - -5,-8, -3,12, - -10,5, -9,0, - 8,-1, 12,-6, - 4,-6, 6,-11, - -10,12, -8,7, - 4,-2, 6,7, - -2,0, -2,12, - -5,-8, -5,2, - 7,-6, 10,12, - -9,-13, -8,-8, - -5,-13, -5,-2, - 8,-8, 9,-13, - -9,-11, -9,0, - 1,-8, 1,-2, - 7,-4, 9,1, - -2,1, -1,-4, - 11,-6, 12,-11, - -12,-9, -6,4, - 3,7, 7,12, - 5,5, 10,8, - 0,-4, 2,8, - -9,12, -5,-13, - 0,7, 2,12, - -1,2, 1,7, - 5,11, 7,-9, - 3,5, 6,-8, - -13,-4, -8,9, - -5,9, -3,-3, - -4,-7, -3,-12, - 6,5, 8,0, - -7,6, -6,12, - -13,6, -5,-2, - 1,-10, 3,10, - 4,1, 8,-4, - -2,-2, 2,-13, - 2,-12, 12,12, - -2,-13, 0,-6, - 4,1, 9,3, - -6,-10, -3,-5, - -3,-13, -1,1, - 7,5, 12,-11, - 4,-2, 5,-7, - -13,9, -9,-5, - 7,1, 8,6, - 7,-8, 7,6, - -7,-4, -7,1, - -8,11, -7,-8, - -13,6, -12,-8, - 2,4, 3,9, - 10,-5, 12,3, - -6,-5, -6,7, - 8,-3, 9,-8, - 2,-12, 2,8, - -11,-2, -10,3, - -12,-13, -7,-9, - -11,0, -10,-5, - 5,-3, 11,8, - -2,-13, -1,12, - -1,-8, 0,9, - -13,-11, -12,-5, - -10,-2, -10,11, - -3,9, -2,-13, - 2,-3, 3,2, - -9,-13, -4,0, - -4,6, -3,-10, - -4,12, -2,-7, - -6,-11, -4,9, - 6,-3, 6,11, - -13,11, -5,5, - 11,11, 12,6, - 7,-5, 12,-2, - -1,12, 0,7, - -4,-8, -3,-2, - -7,1, -6,7, - -13,-12, -8,-13, - -7,-2, -6,-8, - -8,5, -6,-9, - -5,-1, -4,5, - -13,7, -8,10, - 1,5, 5,-13, - 1,0, 10,-13, - 9,12, 10,-1, - 5,-8, 10,-9, - -1,11, 1,-13, - -9,-3, -6,2, - -1,-10, 1,12, - -13,1, -8,-10, - 8,-11, 10,-6, - 2,-13, 3,-6, - 7,-13, 12,-9, - -10,-10, -5,-7, - -10,-8, -8,-13, - 4,-6, 8,5, - 3,12, 8,-13, - -4,2, -3,-3, - 5,-13, 10,-12, - 4,-13, 5,-1, - -9,9, -4,3, - 0,3, 3,-9, - -12,1, -6,1, - 3,2, 4,-8, - -10,-10, -10,9, - 8,-13, 12,12, - -8,-12, -6,-5, - 2,2, 
3,7, - 10,6, 11,-8, - 6,8, 8,-12, - -7,10, -6,5, - -3,-9, -3,9, - -1,-13, -1,5, - -3,-7, -3,4, - -8,-2, -8,3, - 4,2, 12,12, - 2,-5, 3,11, - 6,-9, 11,-13, - 3,-1, 7,12, - 11,-1, 12,4, - -3,0, -3,6, - 4,-11, 4,12, - 2,-4, 2,1, - -10,-6, -8,1, - -13,7, -11,1, - -13,12, -11,-13, - 6,0, 11,-13, - 0,-1, 1,4, - -13,3, -9,-2, - -9,8, -6,-3, - -13,-6, -8,-2, - 5,-9, 8,10, - 2,7, 3,-9, - -1,-6, -1,-1, - 9,5, 11,-2, - 11,-3, 12,-8, - 3,0, 3,5, - -1,4, 0,10, - 3,-6, 4,5, - -13,0, -10,5, - 5,8, 12,11, - 8,9, 9,-6, - 7,-4, 8,-12, - -10,4, -10,9, - 7,3, 12,4, - 9,-7, 10,-2, - 7,0, 12,-2, - -1,-6, 0,-11, -}; - -template -void gaussian1D(T* out, const int dim, double sigma=0.0) -{ - if(!(sigma>0)) sigma = 0.25*dim; - - T sum = (T)0; - for(int i=0;i -void keep_features( - float* x_out, - float* y_out, - float* score_out, - float* size_out, - const float* x_in, - const float* y_in, - const float* score_in, - const unsigned* score_idx, - const float* size_in, - const unsigned n_feat) -{ - // Keep only the first n_feat features - for (unsigned f = 0; f < n_feat; f++) { - x_out[f] = x_in[score_idx[f]]; - y_out[f] = y_in[score_idx[f]]; - score_out[f] = score_in[f]; - if (size_in != nullptr && size_out != nullptr) - size_out[f] = size_in[score_idx[f]]; - } -} - -template -void harris_response( - float* x_out, - float* y_out, - float* score_out, - float* size_out, - const float* x_in, - const float* y_in, - const float* scl_in, - const unsigned total_feat, - unsigned* usable_feat, - const Array& image, - const unsigned block_size, - const float k_thr, - const unsigned patch_size) -{ - const af::dim4 idims = image.dims(); - const T* image_ptr = image.get(); - for (unsigned f = 0; f < total_feat; f++) { - unsigned x, y; - float scl = 1.f; - if (use_scl) { - // Update x and y coordinates according to scale - scl = scl_in[f]; - x = (unsigned)round(x_in[f] * scl); - y = (unsigned)round(y_in[f] * scl); - } - else { - x = (unsigned)round(x_in[f]); - y = (unsigned)round(y_in[f]); - } - - // Round 
feature size to nearest odd integer - float size = 2.f * floor((patch_size * scl) / 2.f) + 1.f; - - // Avoid keeping features that might be too wide and might not fit on - // the image, sqrt(2.f) is the radius when angle is 45 degrees and - // represents widest case possible - unsigned patch_r = ceil(size * sqrt(2.f) / 2.f); - if (x < patch_r || y < patch_r || x >= idims[1] - patch_r || y >= idims[0] - patch_r) - continue; - - unsigned r = block_size / 2; - - float ixx = 0.f, iyy = 0.f, ixy = 0.f; - unsigned block_size_sq = block_size * block_size; - for (unsigned k = 0; k < block_size_sq; k++) { - int i = k / block_size - r; - int j = k % block_size - r; - - // Calculate local x and y derivatives - float ix = image_ptr[(x+i+1) * idims[0] + y+j] - image_ptr[(x+i-1) * idims[0] + y+j]; - float iy = image_ptr[(x+i) * idims[0] + y+j+1] - image_ptr[(x+i) * idims[0] + y+j-1]; - - // Accumulate second order derivatives - ixx += ix*ix; - iyy += iy*iy; - ixy += ix*iy; - } - - unsigned idx = *usable_feat; - *usable_feat += 1; - float tr = ixx + iyy; - float det = ixx*iyy - ixy*ixy; - - // Calculate Harris responses - float resp = det - k_thr * (tr*tr); - - // Scale factor - // TODO: improve response scaling - float rscale = 0.001f; - rscale = rscale * rscale * rscale * rscale; - - x_out[idx] = x; - y_out[idx] = y; - score_out[idx] = resp * rscale; - if (use_scl) - size_out[idx] = size; - } -} - -template -void centroid_angle( - const float* x_in, - const float* y_in, - float* orientation_out, - const unsigned total_feat, - const Array& image, - const unsigned patch_size) -{ - const af::dim4 idims = image.dims(); - const T* image_ptr = image.get(); - for (unsigned f = 0; f < total_feat; f++) { - unsigned x = (unsigned)round(x_in[f]); - unsigned y = (unsigned)round(y_in[f]); - - unsigned r = patch_size / 2; - if (x < r || y < r || x > idims[1] - r || y > idims[0] - r) - continue; - - T m01 = (T)0, m10 = (T)0; - unsigned patch_size_sq = patch_size * patch_size; - for (unsigned 
k = 0; k < patch_size_sq; k++) { - int i = k / patch_size - r; - int j = k % patch_size - r; - - // Calculate first order moments - T p = image_ptr[(x+i) * idims[0] + y+j]; - m01 += j * p; - m10 += i * p; - } - - float angle = atan2(m01, m10); - orientation_out[f] = angle; - } -} - -template -inline T get_pixel( - unsigned x, - unsigned y, - const float ori, - const unsigned size, - const int dist_x, - const int dist_y, - const Array& image, - const unsigned patch_size) -{ - const af::dim4 idims = image.dims(); - const T* image_ptr = image.get(); - float ori_sin = sin(ori); - float ori_cos = cos(ori); - float patch_scl = (float)size / (float)patch_size; - - // Calculate point coordinates based on orientation and size - x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin); - y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos); - - return image_ptr[x * idims[0] + y]; -} - -template -void extract_orb( - unsigned* desc_out, - const unsigned n_feat, - float* x_in_out, - float* y_in_out, - const float* ori_in, - float* size_out, - const Array& image, - const float scl, - const unsigned patch_size) -{ - const af::dim4 idims = image.dims(); - for (unsigned f = 0; f < n_feat; f++) { - unsigned x = (unsigned)round(x_in_out[f]); - unsigned y = (unsigned)round(y_in_out[f]); - float ori = ori_in[f]; - unsigned size = patch_size; - - unsigned r = ceil(patch_size * sqrt(2.f) / 2.f); - if (x < r || y < r || x >= idims[1] - r || y >= idims[0] - r) - continue; - - // Descriptor fixed at 256 bits for now - // Storing descriptor as a vector of 8 x 32-bit unsigned numbers - for (unsigned i = 0; i < 8; i++) { - unsigned v = 0; - - // j < 32 for 256 bits descriptor - for (unsigned j = 0; j < 32; j++) { - // Get position from distribution pattern and values of points p1 and p2 - int dist_x = ref_pat[i*32*4 + j*4]; - int dist_y = ref_pat[i*32*4 + j*4+1]; - T p1 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size); - - dist_x = 
ref_pat[i*32*4 + j*4+2]; - dist_y = ref_pat[i*32*4 + j*4+3]; - T p2 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size); - - // Calculate bit based on p1 and p2 and shifts it to correct position - v |= (p1 < p2) << j; - } - - // Store 32 bits of descriptor - desc_out[f * 8 + i] += v; - } - - x_in_out[f] = round(x * scl); - y_in_out[f] = round(y * scl); - size_out[f] = patch_size * scl; - } -} - - - template unsigned orb(Array &x, Array &y, Array &score, Array &ori, @@ -542,6 +36,8 @@ unsigned orb(Array &x, Array &y, const float scl_fctr, const unsigned levels, const bool blur_img) { + image.eval(); + getQueue().sync(); unsigned patch_size = REF_PAT_SIZE; @@ -611,6 +107,9 @@ unsigned orb(Array &x, Array &y, prev_img = lvl_img; prev_ldims = lvl_img.dims(); } + prev_img.eval(); + lvl_img.eval(); + getQueue().sync(); Array x_feat = createEmptyArray(dim4()); @@ -628,7 +127,6 @@ unsigned orb(Array &x, Array &y, unsigned lvl_feat = fast(x_feat, y_feat, score_feat, lvl_img, fast_thr, 9, 1, 0.15f, edge); - if (lvl_feat == 0) { continue; } @@ -643,7 +141,7 @@ unsigned orb(Array &x, Array &y, // Calculate Harris responses // Good block_size >= 7 (must be an odd number) unsigned usable_feat = 0; - harris_response(h_x_harris, h_y_harris, h_score_harris, nullptr, + kernel::harris_response(h_x_harris, h_y_harris, h_score_harris, nullptr, h_x_feat, h_y_feat, nullptr, lvl_feat, &usable_feat, lvl_img, @@ -653,7 +151,6 @@ unsigned orb(Array &x, Array &y, memFree(h_x_harris); memFree(h_y_harris); memFree(h_score_harris); - continue; } @@ -664,13 +161,13 @@ unsigned orb(Array &x, Array &y, Array harris_idx = createEmptyArray(af::dim4()); sort_index(harris_sorted, harris_idx, score_harris, 0); + getQueue().sync(); usable_feat = std::min(usable_feat, lvl_best[i]); if (usable_feat == 0) { memFree(h_x_harris); memFree(h_y_harris); - continue; } @@ -679,7 +176,7 @@ unsigned orb(Array &x, Array &y, float* h_score_lvl = memAlloc(usable_feat); // Keep only features with higher 
Harris responses - keep_features(h_x_lvl, h_y_lvl, h_score_lvl, nullptr, + kernel::keep_features(h_x_lvl, h_y_lvl, h_score_lvl, nullptr, h_x_harris, h_y_harris, harris_sorted.get(), harris_idx.get(), nullptr, usable_feat); @@ -690,7 +187,7 @@ unsigned orb(Array &x, Array &y, float* h_size_lvl = memAlloc(usable_feat); // Compute orientation of features - centroid_angle(h_x_lvl, h_y_lvl, h_ori_lvl, usable_feat, + kernel::centroid_angle(h_x_lvl, h_y_lvl, h_ori_lvl, usable_feat, lvl_img, patch_size); Array lvl_filt = createEmptyArray(dim4()); @@ -701,21 +198,24 @@ unsigned orb(Array &x, Array &y, h_gauss = memAlloc(gauss_dims[0]); gaussian1D(h_gauss, gauss_dims[0], 2.f); gauss_filter = createDeviceDataArray(gauss_dims, h_gauss); + gauss_filter.eval(); } // Filter level image with Gaussian kernel to reduce noise sensitivity lvl_filt = convolve2(lvl_img, gauss_filter, gauss_filter); } + lvl_filt.eval(); + getQueue().sync(); // Compute ORB descriptors unsigned* h_desc_lvl = memAlloc(usable_feat * 8); memset(h_desc_lvl, 0, usable_feat * 8 * sizeof(unsigned)); if (blur_img) - extract_orb(h_desc_lvl, usable_feat, + kernel::extract_orb(h_desc_lvl, usable_feat, h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl, lvl_filt, lvl_scl, patch_size); else - extract_orb(h_desc_lvl, usable_feat, + kernel::extract_orb(h_desc_lvl, usable_feat, h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl, lvl_img, lvl_scl, patch_size); diff --git a/src/backend/cpu/platform.cpp b/src/backend/cpu/platform.cpp index 4d96a37fbb..9474c792f3 100644 --- a/src/backend/cpu/platform.cpp +++ b/src/backend/cpu/platform.cpp @@ -11,11 +11,16 @@ #include #include #include +#include +#include #include #include #include #include #include +#include +#include +#include #ifdef _WIN32 #include @@ -175,6 +180,22 @@ CPUInfo::CPUInfo() namespace cpu { +unsigned getMaxJitSize() +{ + const int MAX_JIT_LEN = 20; + + static int length = 0; + if (length == 0) { + std::string env_var = getEnvVar("AF_CPU_MAX_JIT_LEN"); + if (!env_var.empty()) { + 
length = std::stoi(env_var); + } else { + length = MAX_JIT_LEN; + } + } + return length; +} + int getBackend() { return AF_BACKEND_CPU; @@ -194,14 +215,29 @@ static const std::string get_system(void) #endif } -std::string getInfo() +// http://stackoverflow.com/questions/216823/whats-the-best-way-to-trim-stdstring/217605#217605 +// trim from start +static inline std::string <rim(std::string &s) +{ + s.erase(s.begin(), std::find_if(s.begin(), s.end(), + std::not1(std::ptr_fun(std::isspace)))); + return s; +} + +std::string getDeviceInfo() { std::ostringstream info; static CPUInfo cinfo; info << "ArrayFire v" << AF_VERSION << " (CPU, " << get_system() << ", build " << AF_REVISION << ")" << std::endl; - info << string("[0] ") << cinfo.vendor() <<": " << cinfo.model() << " "; + std::string model = cinfo.model(); + size_t memMB = getDeviceMemorySize(getActiveDeviceId()) / 1048576; + info << string("[0] ") << cinfo.vendor() <<": " << ltrim(model); + + if(memMB) info << ", " << memMB << " MB, "; + else info << ", Unknown MB, "; + info << "Max threads("<< cinfo.threads()<<") "; #ifndef NDEBUG info << AF_COMPILER_STR; @@ -234,11 +270,11 @@ int getDeviceCount() int setDevice(int device) { static bool flag; - if(!flag) { - printf("WARNING: af_set_device not supported for CPU\n"); + if(!flag && device != 0) { + printf("WARNING af_set_device(device): device can only be 0 for CPU\n"); flag = 1; } - return 1; + return 0; } int getActiveDeviceId() @@ -246,9 +282,27 @@ int getActiveDeviceId() return 0; } +size_t getDeviceMemorySize(int device) +{ + return common::getHostMemorySize(); +} + +size_t getHostMemorySize() +{ + return common::getHostMemorySize(); +} + +static const int MAX_QUEUES = 1; + + +queue& getQueue(int idx) { + static std::array queues; + return queues[idx]; +} + void sync(int device) { - // Nothing here + getQueue().sync(); } } diff --git a/src/backend/cpu/platform.hpp b/src/backend/cpu/platform.hpp index 2e52cd13a6..7caddccc72 100644 --- 
a/src/backend/cpu/platform.hpp +++ b/src/backend/cpu/platform.hpp @@ -7,12 +7,16 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once + #include namespace cpu { + class queue; + int getBackend(); - std::string getInfo(); + std::string getDeviceInfo(); bool isDoubleSupported(int device); @@ -24,5 +28,13 @@ namespace cpu { int getActiveDeviceId(); + size_t getDeviceMemorySize(int device); + + size_t getHostMemorySize(); + void sync(int device); + + queue& getQueue(int idx = 0); + + unsigned getMaxJitSize(); } diff --git a/src/backend/cpu/plot.cpp b/src/backend/cpu/plot.cpp index 9de1993f2d..2ab69643c8 100644 --- a/src/backend/cpu/plot.cpp +++ b/src/backend/cpu/plot.cpp @@ -12,37 +12,40 @@ #include #include #include -#include #include -#include -#include +#include +#include using af::dim4; namespace cpu { - template - void copy_plot(const Array &P, fg::Plot* plot) - { - CheckGL("Before CopyArrayToVBO"); - - glBindBuffer(GL_ARRAY_BUFFER, plot->vbo()); - glBufferSubData(GL_ARRAY_BUFFER, 0, plot->size(), P.get()); - glBindBuffer(GL_ARRAY_BUFFER, 0); - - CheckGL("In CopyArrayToVBO"); - } - - #define INSTANTIATE(T) \ - template void copy_plot(const Array &P, fg::Plot* plot); - - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) + +template +void copy_plot(const Array &P, fg::Plot* plot) +{ + P.eval(); + getQueue().sync(); + CheckGL("Before CopyArrayToVBO"); + + glBindBuffer(GL_ARRAY_BUFFER, plot->vbo()); + glBufferSubData(GL_ARRAY_BUFFER, 0, plot->size(), P.get()); + glBindBuffer(GL_ARRAY_BUFFER, 0); + + CheckGL("In CopyArrayToVBO"); +} + +#define INSTANTIATE(T) \ + template void copy_plot(const Array &P, fg::Plot* plot); + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) + } #endif // WITH_GRAPHICS diff --git 
a/src/backend/cpu/plot3.cpp b/src/backend/cpu/plot3.cpp index c0e26aaa34..515fe0336c 100644 --- a/src/backend/cpu/plot3.cpp +++ b/src/backend/cpu/plot3.cpp @@ -12,37 +12,40 @@ #include #include #include -#include #include -#include -#include +#include +#include using af::dim4; namespace cpu { - template - void copy_plot3(const Array &P, fg::Plot3* plot3) - { - CheckGL("Before CopyArrayToVBO"); - - glBindBuffer(GL_ARRAY_BUFFER, plot3->vbo()); - glBufferSubData(GL_ARRAY_BUFFER, 0, plot3->size(), P.get()); - glBindBuffer(GL_ARRAY_BUFFER, 0); - - CheckGL("In CopyArrayToVBO"); - } - - #define INSTANTIATE(T) \ - template void copy_plot3(const Array &P, fg::Plot3* plot3); - - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) + +template +void copy_plot3(const Array &P, fg::Plot3* plot3) +{ + P.eval(); + getQueue().sync(); + CheckGL("Before CopyArrayToVBO"); + + glBindBuffer(GL_ARRAY_BUFFER, plot3->vbo()); + glBufferSubData(GL_ARRAY_BUFFER, 0, plot3->size(), P.get()); + glBindBuffer(GL_ARRAY_BUFFER, 0); + + CheckGL("In CopyArrayToVBO"); +} + +#define INSTANTIATE(T) \ + template void copy_plot3(const Array &P, fg::Plot3* plot3); + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) + } #endif // WITH_GRAPHICS diff --git a/src/backend/cpu/qr.cpp b/src/backend/cpu/qr.cpp index d1c3e233af..f8dbfa2013 100644 --- a/src/backend/cpu/qr.cpp +++ b/src/backend/cpu/qr.cpp @@ -11,28 +11,23 @@ #include #if defined(WITH_CPU_LINEAR_ALGEBRA) - #include #include -#include #include #include #include - #include +#include +#include namespace cpu { template -using geqrf_func_def = int (*)(ORDER_TYPE, int, int, - T*, int, - T*); +using geqrf_func_def = int (*)(ORDER_TYPE, int, int, T*, int, T*); template -using gqr_func_def = int (*)(ORDER_TYPE, int, int, int, - T*, int, - const T*); +using gqr_func_def = int 
(*)(ORDER_TYPE, int, int, int, T*, int, const T*); #define QR_FUNC_DEF( FUNC ) \ template FUNC##_func_def FUNC##_func(); @@ -64,9 +59,14 @@ GQR_FUNC(gqr , cdouble, zungqr) template void qr(Array &q, Array &r, Array &t, const Array &in) { + q.eval(); + r.eval(); + t.eval(); + in.eval(); + dim4 iDims = in.dims(); - int M = iDims[0]; - int N = iDims[1]; + int M = iDims[0]; + int N = iDims[1]; q = padArray(in, dim4(M, max(M, N))); q.resetDims(iDims); @@ -78,39 +78,31 @@ void qr(Array &q, Array &r, Array &t, const Array &in) triangle(r, q); - gqr_func()(AF_LAPACK_COL_MAJOR, - M, M, min(M, N), - q.get(), q.strides()[1], - t.get()); - + auto func = [=] (Array q, Array t, int M, int N) { + gqr_func()(AF_LAPACK_COL_MAJOR, M, M, min(M, N), q.get(), q.strides()[1], t.get()); + }; q.resetDims(dim4(M, M)); + getQueue().enqueue(func, q, t, M, N); } template Array qr_inplace(Array &in) { - dim4 iDims = in.dims(); - int M = iDims[0]; - int N = iDims[1]; + in.eval(); + dim4 iDims = in.dims(); + int M = iDims[0]; + int N = iDims[1]; Array t = createEmptyArray(af::dim4(min(M, N), 1, 1, 1)); - geqrf_func()(AF_LAPACK_COL_MAJOR, M, N, - in.get(), in.strides()[1], - t.get()); + auto func = [=] (Array in, Array t, int M, int N) { + geqrf_func()(AF_LAPACK_COL_MAJOR, M, N, in.get(), in.strides()[1], t.get()); + }; + getQueue().enqueue(func, in, t, M, N); return t; } -#define INSTANTIATE_QR(T) \ - template Array qr_inplace(Array &in); \ - template void qr(Array &q, Array &r, Array &t, const Array &in); - -INSTANTIATE_QR(float) -INSTANTIATE_QR(cfloat) -INSTANTIATE_QR(double) -INSTANTIATE_QR(cdouble) - } #else @@ -130,6 +122,13 @@ Array qr_inplace(Array &in) AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED); } +} + +#endif + +namespace cpu +{ + #define INSTANTIATE_QR(T) \ template Array qr_inplace(Array &in); \ template void qr(Array &q, Array &r, Array &t, const Array &in); @@ -140,5 +139,3 @@ INSTANTIATE_QR(double) INSTANTIATE_QR(cdouble) } - -#endif diff --git 
a/src/backend/cpu/queue.hpp b/src/backend/cpu/queue.hpp new file mode 100644 index 0000000000..2f32b4d852 --- /dev/null +++ b/src/backend/cpu/queue.hpp @@ -0,0 +1,93 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include + +//FIXME: Is there a better way to check for std::future not being supported ? +#if defined(AF_DISABLE_CPU_ASYNC) || (defined(__GNUC__) && (__GCC_ATOMIC_INT_LOCK_FREE < 2 || __GCC_ATOMIC_POINTER_LOCK_FREE < 2)) + +#include +using std::function; +#include +#define __SYNCHRONOUS_ARCH 1 +class queue_impl +{ +public: + template + void enqueue(const F func, Args... args) const { + AF_ERROR("Incorrectly configured", AF_ERR_INTERNAL); + } + + void sync() const { + AF_ERROR("Incorrectly configured", AF_ERR_INTERNAL); + } + + bool is_worker() const { + AF_ERROR("Incorrectly configured", AF_ERR_INTERNAL); + return false; + } + +}; + +#else + +#include +#define __SYNCHRONOUS_ARCH 0 +typedef async_queue queue_impl; + +#endif + +#pragma once + +namespace cpu { + +/// Wraps the async_queue class +class queue +{ +public: + queue() + : + count(0), + sync_calls( __SYNCHRONOUS_ARCH == 1 || getEnvVar("AF_SYNCHRONOUS_CALLS") == "1") + {} + + template + void enqueue(const F func, Args... args) + { + count++; + if(sync_calls) { func( args... ); } + else { aQueue.enqueue( func, args... ); } +#ifndef NDEBUG + sync(); +#else + if (checkMemoryLimit() || count >= 25) { + sync(); + } +#endif + } + + void sync() + { + count = 0; + if(!sync_calls) aQueue.sync(); + } + + bool is_worker() const + { + return (!sync_calls) ? 
aQueue.is_worker() : false; + } + + private: + int count; + const bool sync_calls; + queue_impl aQueue; +}; + +} diff --git a/src/backend/cpu/random.cpp b/src/backend/cpu/random.cpp index e93fdf9b8f..06cbca34d7 100644 --- a/src/backend/cpu/random.cpp +++ b/src/backend/cpu/random.cpp @@ -7,117 +7,23 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include -#include -#include -#include -#include -#include #include #include #include #include #include +#include +#include +#include namespace cpu { -using namespace std; - -template -using is_arithmetic_t = typename enable_if< is_arithmetic::value, function>::type; -template -using is_complex_t = typename enable_if< is_complex::value, function>::type; -template -using is_floating_point_t = typename enable_if< is_floating_point::value, function>::type; - -template -is_arithmetic_t -urand(GenType &generator) -{ - typedef typename conditional< is_floating_point::value, - uniform_real_distribution, -#if OS_WIN - uniform_int_distribution>::type dist; -#else - uniform_int_distribution> ::type dist; -#endif - return bind(dist(), generator); -} - -template -is_complex_t -urand(GenType &generator) -{ - auto func = urand(generator); - return [func] () { return T(func(), func());}; -} - -template -is_floating_point_t -nrand(GenType &generator) -{ - return bind(normal_distribution(), generator); -} - -template -is_complex_t -nrand(GenType &generator) -{ - auto func = nrand(generator); - return [func] () { return T(func(), func());}; -} - -static mt19937 generator; -static unsigned long long gen_seed = 0; -static bool is_first = true; -#define GLOBAL 1 - -template -Array randn(const af::dim4 &dims) -{ - static unsigned long long my_seed = 0; - if (is_first) { - setSeed(gen_seed); - my_seed = gen_seed; - } - - static auto gen = nrand(generator); - - if (my_seed != gen_seed) { - gen = nrand(generator); - my_seed = gen_seed; - } - - Array outArray = createEmptyArray(dims); - T 
*outPtr = outArray.get(); - for (int i = 0; i < (int)outArray.elements(); i++) { - outPtr[i] = gen(); - } - return outArray; -} - template Array randu(const af::dim4 &dims) { - static unsigned long long my_seed = 0; - if (is_first) { - setSeed(gen_seed); - my_seed = gen_seed; - } - - static auto gen = urand(generator); - - if (my_seed != gen_seed) { - gen = urand(generator); - my_seed = gen_seed; - } - Array outArray = createEmptyArray(dims); - T *outPtr = outArray.get(); - for (int i = 0; i < (int)outArray.elements(); i++) { - outPtr[i] = gen(); - } + getQueue().enqueue(kernel::randu, outArray); return outArray; } @@ -133,9 +39,18 @@ INSTANTIATE_UNIFORM(uint) INSTANTIATE_UNIFORM(intl) INSTANTIATE_UNIFORM(uintl) INSTANTIATE_UNIFORM(uchar) +INSTANTIATE_UNIFORM(char) INSTANTIATE_UNIFORM(short) INSTANTIATE_UNIFORM(ushort) +template +Array randn(const af::dim4 &dims) +{ + Array outArray = createEmptyArray(dims); + getQueue().enqueue(kernel::randn, outArray); + return outArray; +} + #define INSTANTIATE_NORMAL(T) \ template Array randn(const af::dim4 &dims); @@ -144,41 +59,17 @@ INSTANTIATE_NORMAL(double) INSTANTIATE_NORMAL(cfloat) INSTANTIATE_NORMAL(cdouble) - -template<> -Array randu(const af::dim4 &dims) -{ - static unsigned long long my_seed = 0; - if (is_first) { - setSeed(gen_seed); - my_seed = gen_seed; - } - - static auto gen = urand(generator); - - if (my_seed != gen_seed) { - gen = urand(generator); - my_seed = gen_seed; - } - - Array outArray = createEmptyArray(dims); - char *outPtr = outArray.get(); - for (int i = 0; i < (int)outArray.elements(); i++) { - outPtr[i] = gen() > 0.5; - } - return outArray; -} - void setSeed(const uintl seed) { - generator.seed(seed); - is_first = false; - gen_seed = seed; + getQueue().enqueue(kernel::setSeed, seed); } uintl getSeed() { - return gen_seed; + uintl seed = 0; + getQueue().enqueue(kernel::getSeedPtr, &seed); + getQueue().sync(); + return seed; } } diff --git a/src/backend/cpu/range.cpp b/src/backend/cpu/range.cpp 
index eabf3a1ee1..e91ba1e241 100644 --- a/src/backend/cpu/range.cpp +++ b/src/backend/cpu/range.cpp @@ -14,74 +14,46 @@ #include #include #include +#include +#include +#include namespace cpu { - /////////////////////////////////////////////////////////////////////////// - // Kernel Functions - /////////////////////////////////////////////////////////////////////////// - template - void range(T *out, const dim4 &dims, const dim4 &strides) - { - for(dim_t w = 0; w < dims[3]; w++) { - dim_t offW = w * strides[3]; - for(dim_t z = 0; z < dims[2]; z++) { - dim_t offWZ = offW + z * strides[2]; - for(dim_t y = 0; y < dims[1]; y++) { - dim_t offWZY = offWZ + y * strides[1]; - for(dim_t x = 0; x < dims[0]; x++) { - dim_t id = offWZY + x; - if(dim == 0) { - out[id] = x; - } else if(dim == 1) { - out[id] = y; - } else if(dim == 2) { - out[id] = z; - } else if(dim == 3) { - out[id] = w; - } - } - } - } - } - } - - /////////////////////////////////////////////////////////////////////////// - // Wrapper Functions - /////////////////////////////////////////////////////////////////////////// - template - Array range(const dim4& dims, const int seq_dim) - { - // Set dimension along which the sequence should be - // Other dimensions are simply tiled - int _seq_dim = seq_dim; - if(seq_dim < 0) { - _seq_dim = 0; // column wise sequence - } - - Array out = createEmptyArray(dims); - switch(_seq_dim) { - case 0: range(out.get(), out.dims(), out.strides()); break; - case 1: range(out.get(), out.dims(), out.strides()); break; - case 2: range(out.get(), out.dims(), out.strides()); break; - case 3: range(out.get(), out.dims(), out.strides()); break; - default : AF_ERROR("Invalid rep selection", AF_ERR_ARG); - } +template +Array range(const dim4& dims, const int seq_dim) +{ + // Set dimension along which the sequence should be + // Other dimensions are simply tiled + int _seq_dim = seq_dim; + if(seq_dim < 0) { + _seq_dim = 0; // column wise sequence + } - return out; + Array out = 
createEmptyArray(dims); + switch(_seq_dim) { + case 0: getQueue().enqueue(kernel::range, out); break; + case 1: getQueue().enqueue(kernel::range, out); break; + case 2: getQueue().enqueue(kernel::range, out); break; + case 3: getQueue().enqueue(kernel::range, out); break; + default : AF_ERROR("Invalid rep selection", AF_ERR_ARG); } + return out; +} + #define INSTANTIATE(T) \ template Array range(const af::dim4 &dims, const int seq_dims); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(ushort) - INSTANTIATE(short) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(ushort) +INSTANTIATE(short) + } diff --git a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp index a38d06118c..2d4d18e682 100644 --- a/src/backend/cpu/reduce.cpp +++ b/src/backend/cpu/reduce.cpp @@ -15,6 +15,9 @@ #include #include #include +#include +#include +#include using af::dim4; @@ -34,217 +37,179 @@ struct Binary namespace cpu { - template - struct reduce_dim - { - void operator()(To *out, const dim4 &ostrides, const dim4 &odims, - const Ti *in , const dim4 &istrides, const dim4 &idims, - const int dim, bool change_nan, double nanval) - { - static const int D1 = D - 1; - static reduce_dim reduce_dim_next; - for (dim_t i = 0; i < odims[D1]; i++) { - reduce_dim_next(out + i * ostrides[D1], - ostrides, odims, - in + i * istrides[D1], - istrides, idims, - dim, change_nan, nanval); - } - } - }; - template - struct reduce_dim - { +template +using reduce_dim_func = std::function, const dim_t, + const Array, const dim_t, + const int, bool, double)>; - Transform transform; - Binary reduce; - void operator()(To *out, const dim4 &ostrides, const dim4 &odims, - const Ti *in , const dim4 &istrides, const dim4 &idims, - const int dim, bool change_nan, double nanval) - { - dim_t stride = 
istrides[dim]; - - To out_val = reduce.init(); - for (dim_t i = 0; i < idims[dim]; i++) { - To in_val = transform(in[i * stride]); - if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val; - out_val = reduce(in_val, out_val); - } - - *out = out_val; - } - }; - - template - using reduce_dim_func = std::function; +template +Array reduce(const Array &in, const int dim, bool change_nan, double nanval) +{ + dim4 odims = in.dims(); + odims[dim] = 1; + in.eval(); - template - Array reduce(const Array &in, const int dim, bool change_nan, double nanval) - { - dim4 odims = in.dims(); - odims[dim] = 1; + Array out = createEmptyArray(odims); + static const reduce_dim_func reduce_funcs[4] = { kernel::reduce_dim() + , kernel::reduce_dim() + , kernel::reduce_dim() + , kernel::reduce_dim()}; - Array out = createEmptyArray(odims); - static reduce_dim_func reduce_funcs[4] = { reduce_dim() - , reduce_dim() - , reduce_dim() - , reduce_dim()}; + getQueue().enqueue(reduce_funcs[in.ndims() - 1], out, 0, in, 0, dim, change_nan, nanval); - reduce_funcs[in.ndims() - 1](out.get(), out.strides(), out.dims(), - in.get(), in.strides(), in.dims(), dim, - change_nan, nanval); + return out; +} - return out; - } +template +To reduce_all(const Array &in, bool change_nan, double nanval) +{ + in.eval(); + getQueue().sync(); - template - To reduce_all(const Array &in, bool change_nan, double nanval) - { - Transform transform; - Binary reduce; + Transform transform; + Binary reduce; - To out = reduce.init(); + To out = reduce.init(); - // Decrement dimension of select dimension - af::dim4 dims = in.dims(); - af::dim4 strides = in.strides(); - const Ti *inPtr = in.get(); + // Decrement dimension of select dimension + af::dim4 dims = in.dims(); + af::dim4 strides = in.strides(); + const Ti *inPtr = in.get(); - for(dim_t l = 0; l < dims[3]; l++) { - dim_t off3 = l * strides[3]; + for(dim_t l = 0; l < dims[3]; l++) { + dim_t off3 = l * strides[3]; - for(dim_t k = 0; k < dims[2]; k++) { - dim_t off2 = k * 
strides[2]; + for(dim_t k = 0; k < dims[2]; k++) { + dim_t off2 = k * strides[2]; - for(dim_t j = 0; j < dims[1]; j++) { - dim_t off1 = j * strides[1]; + for(dim_t j = 0; j < dims[1]; j++) { + dim_t off1 = j * strides[1]; - for(dim_t i = 0; i < dims[0]; i++) { - dim_t idx = i + off1 + off2 + off3; + for(dim_t i = 0; i < dims[0]; i++) { + dim_t idx = i + off1 + off2 + off3; - To in_val = transform(inPtr[idx]); - if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val; - out = reduce(in_val, out); - } + To in_val = transform(inPtr[idx]); + if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val; + out = reduce(in_val, out); } } } - - return out; } + return out; +} + #define INSTANTIATE(ROp, Ti, To) \ template Array reduce(const Array &in, const int dim, \ bool change_nan, double nanval); \ template To reduce_all(const Array &in, \ bool change_nan, double nanval); - //min - INSTANTIATE(af_min_t, float , float ) - INSTANTIATE(af_min_t, double , double ) - INSTANTIATE(af_min_t, cfloat , cfloat ) - INSTANTIATE(af_min_t, cdouble, cdouble) - INSTANTIATE(af_min_t, int , int ) - INSTANTIATE(af_min_t, uint , uint ) - INSTANTIATE(af_min_t, intl , intl ) - INSTANTIATE(af_min_t, uintl , uintl ) - INSTANTIATE(af_min_t, char , char ) - INSTANTIATE(af_min_t, uchar , uchar ) - INSTANTIATE(af_min_t, short , short ) - INSTANTIATE(af_min_t, ushort , ushort ) - - //max - INSTANTIATE(af_max_t, float , float ) - INSTANTIATE(af_max_t, double , double ) - INSTANTIATE(af_max_t, cfloat , cfloat ) - INSTANTIATE(af_max_t, cdouble, cdouble) - INSTANTIATE(af_max_t, int , int ) - INSTANTIATE(af_max_t, uint , uint ) - INSTANTIATE(af_max_t, intl , intl ) - INSTANTIATE(af_max_t, uintl , uintl ) - INSTANTIATE(af_max_t, char , char ) - INSTANTIATE(af_max_t, uchar , uchar ) - INSTANTIATE(af_max_t, short , short ) - INSTANTIATE(af_max_t, ushort , ushort ) - - //sum - INSTANTIATE(af_add_t, float , float ) - INSTANTIATE(af_add_t, double , double ) - INSTANTIATE(af_add_t, cfloat , cfloat ) - 
INSTANTIATE(af_add_t, cdouble, cdouble) - INSTANTIATE(af_add_t, int , int ) - INSTANTIATE(af_add_t, int , float ) - INSTANTIATE(af_add_t, uint , uint ) - INSTANTIATE(af_add_t, uint , float ) - INSTANTIATE(af_add_t, intl , intl ) - INSTANTIATE(af_add_t, intl , double ) - INSTANTIATE(af_add_t, uintl , uintl ) - INSTANTIATE(af_add_t, uintl , double ) - INSTANTIATE(af_add_t, char , int ) - INSTANTIATE(af_add_t, char , float ) - INSTANTIATE(af_add_t, uchar , uint ) - INSTANTIATE(af_add_t, uchar , float ) - INSTANTIATE(af_add_t, short , int ) - INSTANTIATE(af_add_t, short , float ) - INSTANTIATE(af_add_t, ushort , uint ) - INSTANTIATE(af_add_t, ushort , float ) - - //mul - INSTANTIATE(af_mul_t, float , float ) - INSTANTIATE(af_mul_t, double , double ) - INSTANTIATE(af_mul_t, cfloat , cfloat ) - INSTANTIATE(af_mul_t, cdouble, cdouble) - INSTANTIATE(af_mul_t, int , int ) - INSTANTIATE(af_mul_t, uint , uint ) - INSTANTIATE(af_mul_t, intl , intl ) - INSTANTIATE(af_mul_t, uintl , uintl ) - INSTANTIATE(af_mul_t, char , int ) - INSTANTIATE(af_mul_t, uchar , uint ) - INSTANTIATE(af_mul_t, short , int ) - INSTANTIATE(af_mul_t, ushort , uint ) - - // count - INSTANTIATE(af_notzero_t, float , uint) - INSTANTIATE(af_notzero_t, double , uint) - INSTANTIATE(af_notzero_t, cfloat , uint) - INSTANTIATE(af_notzero_t, cdouble, uint) - INSTANTIATE(af_notzero_t, int , uint) - INSTANTIATE(af_notzero_t, uint , uint) - INSTANTIATE(af_notzero_t, intl , uint) - INSTANTIATE(af_notzero_t, uintl , uint) - INSTANTIATE(af_notzero_t, char , uint) - INSTANTIATE(af_notzero_t, uchar , uint) - INSTANTIATE(af_notzero_t, short , uint) - INSTANTIATE(af_notzero_t, ushort , uint) - - //anytrue - INSTANTIATE(af_or_t, float , char) - INSTANTIATE(af_or_t, double , char) - INSTANTIATE(af_or_t, cfloat , char) - INSTANTIATE(af_or_t, cdouble, char) - INSTANTIATE(af_or_t, int , char) - INSTANTIATE(af_or_t, uint , char) - INSTANTIATE(af_or_t, intl , char) - INSTANTIATE(af_or_t, uintl , char) - INSTANTIATE(af_or_t, char 
, char) - INSTANTIATE(af_or_t, uchar , char) - INSTANTIATE(af_or_t, short , char) - INSTANTIATE(af_or_t, ushort , char) - - //alltrue - INSTANTIATE(af_and_t, float , char) - INSTANTIATE(af_and_t, double , char) - INSTANTIATE(af_and_t, cfloat , char) - INSTANTIATE(af_and_t, cdouble, char) - INSTANTIATE(af_and_t, int , char) - INSTANTIATE(af_and_t, uint , char) - INSTANTIATE(af_and_t, intl , char) - INSTANTIATE(af_and_t, uintl , char) - INSTANTIATE(af_and_t, char , char) - INSTANTIATE(af_and_t, uchar , char) - INSTANTIATE(af_and_t, short , char) - INSTANTIATE(af_and_t, ushort , char) +//min +INSTANTIATE(af_min_t, float , float ) +INSTANTIATE(af_min_t, double , double ) +INSTANTIATE(af_min_t, cfloat , cfloat ) +INSTANTIATE(af_min_t, cdouble, cdouble) +INSTANTIATE(af_min_t, int , int ) +INSTANTIATE(af_min_t, uint , uint ) +INSTANTIATE(af_min_t, intl , intl ) +INSTANTIATE(af_min_t, uintl , uintl ) +INSTANTIATE(af_min_t, char , char ) +INSTANTIATE(af_min_t, uchar , uchar ) +INSTANTIATE(af_min_t, short , short ) +INSTANTIATE(af_min_t, ushort , ushort ) + +//max +INSTANTIATE(af_max_t, float , float ) +INSTANTIATE(af_max_t, double , double ) +INSTANTIATE(af_max_t, cfloat , cfloat ) +INSTANTIATE(af_max_t, cdouble, cdouble) +INSTANTIATE(af_max_t, int , int ) +INSTANTIATE(af_max_t, uint , uint ) +INSTANTIATE(af_max_t, intl , intl ) +INSTANTIATE(af_max_t, uintl , uintl ) +INSTANTIATE(af_max_t, char , char ) +INSTANTIATE(af_max_t, uchar , uchar ) +INSTANTIATE(af_max_t, short , short ) +INSTANTIATE(af_max_t, ushort , ushort ) + +//sum +INSTANTIATE(af_add_t, float , float ) +INSTANTIATE(af_add_t, double , double ) +INSTANTIATE(af_add_t, cfloat , cfloat ) +INSTANTIATE(af_add_t, cdouble, cdouble) +INSTANTIATE(af_add_t, int , int ) +INSTANTIATE(af_add_t, int , float ) +INSTANTIATE(af_add_t, uint , uint ) +INSTANTIATE(af_add_t, uint , float ) +INSTANTIATE(af_add_t, intl , intl ) +INSTANTIATE(af_add_t, intl , double ) +INSTANTIATE(af_add_t, uintl , uintl ) +INSTANTIATE(af_add_t, uintl 
, double ) +INSTANTIATE(af_add_t, char , int ) +INSTANTIATE(af_add_t, char , float ) +INSTANTIATE(af_add_t, uchar , uint ) +INSTANTIATE(af_add_t, uchar , float ) +INSTANTIATE(af_add_t, short , int ) +INSTANTIATE(af_add_t, short , float ) +INSTANTIATE(af_add_t, ushort , uint ) +INSTANTIATE(af_add_t, ushort , float ) + +//mul +INSTANTIATE(af_mul_t, float , float ) +INSTANTIATE(af_mul_t, double , double ) +INSTANTIATE(af_mul_t, cfloat , cfloat ) +INSTANTIATE(af_mul_t, cdouble, cdouble) +INSTANTIATE(af_mul_t, int , int ) +INSTANTIATE(af_mul_t, uint , uint ) +INSTANTIATE(af_mul_t, intl , intl ) +INSTANTIATE(af_mul_t, uintl , uintl ) +INSTANTIATE(af_mul_t, char , int ) +INSTANTIATE(af_mul_t, uchar , uint ) +INSTANTIATE(af_mul_t, short , int ) +INSTANTIATE(af_mul_t, ushort , uint ) + +// count +INSTANTIATE(af_notzero_t, float , uint) +INSTANTIATE(af_notzero_t, double , uint) +INSTANTIATE(af_notzero_t, cfloat , uint) +INSTANTIATE(af_notzero_t, cdouble, uint) +INSTANTIATE(af_notzero_t, int , uint) +INSTANTIATE(af_notzero_t, uint , uint) +INSTANTIATE(af_notzero_t, intl , uint) +INSTANTIATE(af_notzero_t, uintl , uint) +INSTANTIATE(af_notzero_t, char , uint) +INSTANTIATE(af_notzero_t, uchar , uint) +INSTANTIATE(af_notzero_t, short , uint) +INSTANTIATE(af_notzero_t, ushort , uint) + +//anytrue +INSTANTIATE(af_or_t, float , char) +INSTANTIATE(af_or_t, double , char) +INSTANTIATE(af_or_t, cfloat , char) +INSTANTIATE(af_or_t, cdouble, char) +INSTANTIATE(af_or_t, int , char) +INSTANTIATE(af_or_t, uint , char) +INSTANTIATE(af_or_t, intl , char) +INSTANTIATE(af_or_t, uintl , char) +INSTANTIATE(af_or_t, char , char) +INSTANTIATE(af_or_t, uchar , char) +INSTANTIATE(af_or_t, short , char) +INSTANTIATE(af_or_t, ushort , char) + +//alltrue +INSTANTIATE(af_and_t, float , char) +INSTANTIATE(af_and_t, double , char) +INSTANTIATE(af_and_t, cfloat , char) +INSTANTIATE(af_and_t, cdouble, char) +INSTANTIATE(af_and_t, int , char) +INSTANTIATE(af_and_t, uint , char) +INSTANTIATE(af_and_t, intl , 
char) +INSTANTIATE(af_and_t, uintl , char) +INSTANTIATE(af_and_t, char , char) +INSTANTIATE(af_and_t, uchar , char) +INSTANTIATE(af_and_t, short , char) +INSTANTIATE(af_and_t, ushort , char) + } diff --git a/src/backend/cpu/regions.cpp b/src/backend/cpu/regions.cpp index b753fb5547..2384dd3341 100644 --- a/src/backend/cpu/regions.cpp +++ b/src/backend/cpu/regions.cpp @@ -17,186 +17,24 @@ #include #include #include +#include +#include +#include using af::dim4; namespace cpu { -template -class LabelNode -{ -private: - T label; - T minLabel; - unsigned rank; - LabelNode* parent; - -public: - LabelNode() : label(0), minLabel(0), rank(0), parent(this) { } - LabelNode(T label) : label(label), minLabel(label), rank(0), parent(this) { } - - T getLabel() - { - return label; - } - - T getMinLabel() - { - return minLabel; - } - - LabelNode* getParent() - { - return parent; - } - - unsigned getRank() - { - return rank; - } - - void setMinLabel(T l) - { - minLabel = l; - } - - void setParent(LabelNode* p) - { - parent = p; - } - - void setRank(unsigned r) - { - rank = r; - } -}; - -template -static LabelNode* find(LabelNode* x) -{ - if (x->getParent() != x) - x->setParent(find(x->getParent())); - return x->getParent(); -} - -template -static void setUnion(LabelNode* x, LabelNode* y) -{ - LabelNode* xRoot = find(x); - LabelNode* yRoot = find(y); - if (xRoot == yRoot) - return; - - T xMinLabel = xRoot->getMinLabel(); - T yMinLabel = yRoot->getMinLabel(); - xRoot->setMinLabel(min(xMinLabel, yMinLabel)); - yRoot->setMinLabel(min(xMinLabel, yMinLabel)); - - if (xRoot->getRank() < yRoot->getRank()) - xRoot->setParent(yRoot); - else if (xRoot->getRank() > yRoot->getRank()) - yRoot->setParent(xRoot); - else { - yRoot->setParent(xRoot); - xRoot->setRank(xRoot->getRank() + 1); - } -} - template Array regions(const Array &in, af_connectivity connectivity) { - const dim4 in_dims = in.dims(); - - // Create output placeholder - Array out = createValueArray(in_dims, (T)0); - - const char 
*in_ptr = in.get(); - T *out_ptr = out.get(); - - // Map labels - typedef typename std::map* > label_map_t; - typedef typename label_map_t::iterator label_map_iterator_t; - - label_map_t lmap; - - // Initial label - T label = (T)1; - - for (int j = 0; j < (int)in_dims[1]; j++) { - for (int i = 0; i < (int)in_dims[0]; i++) { - int idx = j * in_dims[0] + i; - if (in_ptr[idx] != 0) { - std::vector l; - - // Test neighbors - if (i > 0 && out_ptr[j * (int)in_dims[0] + i-1] > 0) - l.push_back(out_ptr[j * in_dims[0] + i-1]); - if (j > 0 && out_ptr[(j-1) * (int)in_dims[0] + i] > 0) - l.push_back(out_ptr[(j-1) * in_dims[0] + i]); - if (connectivity == AF_CONNECTIVITY_8 && i > 0 && j > 0 && out_ptr[(j-1) * in_dims[0] + i-1] > 0) - l.push_back(out_ptr[(j-1) * in_dims[0] + i-1]); - if (connectivity == AF_CONNECTIVITY_8 && i < (int)in_dims[0] - 1 && j > 0 && out_ptr[(j-1) * in_dims[0] + i+1] != 0) - l.push_back(out_ptr[(j-1) * in_dims[0] + i+1]); - - if (!l.empty()) { - T minl = l[0]; - for (size_t k = 0; k < l.size(); k++) { - minl = min(l[k], minl); - label_map_iterator_t cur_map = lmap.find(l[k]); - LabelNode *node = cur_map->second; - // Group labels of the same region under a disjoint set - for (size_t m = k+1; m < l.size(); m++) - setUnion(node, lmap.find(l[m])->second); - } - // Set label to smallest neighbor label - out_ptr[idx] = minl; - } - else { - // Insert new label in map - LabelNode *node = new LabelNode(label); - lmap.insert(std::pair* >(label, node)); - out_ptr[idx] = label++; - } - } - } - } - - std::set removed; - - for (int j = 0; j < (int)in_dims[1]; j++) { - for (int i = 0; i < (int)in_dims[0]; i++) { - int idx = j * (int)in_dims[0] + i; - if (in_ptr[idx] != 0) { - T l = out_ptr[idx]; - label_map_iterator_t cur_map = lmap.find(l); - - if (cur_map != lmap.end()) { - LabelNode* node = cur_map->second; - - LabelNode* node_root = find(node); - out_ptr[idx] = node_root->getMinLabel(); + in.eval(); - // Mark removed labels (those that are part of a region - // 
that contains a smaller label) - if (node->getMinLabel() < l || node_root->getMinLabel() < l) - removed.insert(l); - if (node->getLabel() > node->getMinLabel()) - removed.insert(node->getLabel()); - } - } - } - } + Array out = createValueArray(in.dims(), (T)0); + out.eval(); - // Calculate final neighbors (ensure final labels are sequential) - for (int j = 0; j < (int)in_dims[1]; j++) { - for (int i = 0; i < (int)in_dims[0]; i++) { - int idx = j * (int)in_dims[0] + i; - if (out_ptr[idx] > 0) { - out_ptr[idx] -= distance(removed.begin(), removed.lower_bound(out_ptr[idx])); - } - } - } + getQueue().enqueue(kernel::regions, out, in, connectivity); return out; } diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp index a9824a4444..bd156585ee 100644 --- a/src/backend/cpu/reorder.cpp +++ b/src/backend/cpu/reorder.cpp @@ -9,69 +9,42 @@ #include #include -#include -#include +#include +#include +#include namespace cpu { - template - Array reorder(const Array &in, const af::dim4 &rdims) - { - const af::dim4 iDims = in.dims(); - af::dim4 oDims(0); - for(int i = 0; i < 4; i++) - oDims[i] = iDims[rdims[i]]; - Array out = createEmptyArray(oDims); - - T* outPtr = out.get(); - const T* inPtr = in.get(); - - const af::dim4 ist = in.strides(); - const af::dim4 ost = out.strides(); - - - dim_t ids[4] = {0}; - for(dim_t ow = 0; ow < oDims[3]; ow++) { - const dim_t oW = ow * ost[3]; - ids[rdims[3]] = ow; - for(dim_t oz = 0; oz < oDims[2]; oz++) { - const dim_t oZW = oW + oz * ost[2]; - ids[rdims[2]] = oz; - for(dim_t oy = 0; oy < oDims[1]; oy++) { - const dim_t oYZW = oZW + oy * ost[1]; - ids[rdims[1]] = oy; - for(dim_t ox = 0; ox < oDims[0]; ox++) { - const dim_t oIdx = oYZW + ox; - - ids[rdims[0]] = ox; - const dim_t iIdx = ids[3] * ist[3] + ids[2] * ist[2] + - ids[1] * ist[1] + ids[0]; +template +Array reorder(const Array &in, const af::dim4 &rdims) +{ + in.eval(); - outPtr[oIdx] = inPtr[iIdx]; - } - } - } - } + const af::dim4 iDims = in.dims(); + af::dim4 
oDims(0); + for(int i = 0; i < 4; i++) + oDims[i] = iDims[rdims[i]]; - return out; - } + Array out = createEmptyArray(oDims); + getQueue().enqueue(kernel::reorder, out, in, oDims, rdims); + return out; +} #define INSTANTIATE(T) \ template Array reorder(const Array &in, const af::dim4 &rdims); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(short) - INSTANTIATE(ushort) - +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(short) +INSTANTIATE(ushort) } diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/resize.cpp index 8c4da58934..eaeb5d4e3d 100644 --- a/src/backend/cpu/resize.cpp +++ b/src/backend/cpu/resize.cpp @@ -9,214 +9,54 @@ #include #include -#include -#include #include #include #include +#include +#include +#include namespace cpu { - /** - * noop function for round to avoid compilation - * issues due to lack of this function in C90 based - * compilers, it is only present in C99 and C++11 - * - * This is not a full fledged implementation, this function - * is to be used only for positive numbers, i m using it here - * for calculating dimensions of arrays - */ - dim_t round2int(float value) - { - return (dim_t)(value+0.5f); - } - - using std::conditional; - using std::is_same; - - template - using wtype_t = typename conditional::value, double, float>::type; - - template - using vtype_t = typename conditional::value, - T, wtype_t - >::type; - - template - struct resize_op - { - void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t x, const dim_t y) - { - return; - } - }; - - template - struct resize_op - { - void 
operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t x, const dim_t y) - { - // Compute Indices - dim_t i_x = round2int((float)x / (odims[0] / (float)idims[0])); - dim_t i_y = round2int((float)y / (odims[1] / (float)idims[1])); - - if (i_x >= idims[0]) i_x = idims[0] - 1; - if (i_y >= idims[1]) i_y = idims[1] - 1; - - dim_t i_off = i_y * istrides[1] + i_x; - dim_t o_off = y * ostrides[1] + x; - // Copy values from all channels - for(dim_t w = 0; w < odims[3]; w++) { - dim_t wost = w * ostrides[3]; - dim_t wist = w * istrides[3]; - for(dim_t z = 0; z < odims[2]; z++) { - outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist]; - } - } - } - }; - - template - struct resize_op - { - void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t x, const dim_t y) - { - // Compute Indices - float f_x = (float)x / (odims[0] / (float)idims[0]); - float f_y = (float)y / (odims[1] / (float)idims[1]); - - dim_t i1_x = floor(f_x); - dim_t i1_y = floor(f_y); - - if (i1_x >= idims[0]) i1_x = idims[0] - 1; - if (i1_y >= idims[1]) i1_y = idims[1] - 1; - - float b = f_x - i1_x; - float a = f_y - i1_y; - - dim_t i2_x = (i1_x + 1 >= idims[0] ? idims[0] - 1 : i1_x + 1); - dim_t i2_y = (i1_y + 1 >= idims[1] ? 
idims[1] - 1 : i1_y + 1); - - typedef typename dtype_traits::base_type BT; - typedef wtype_t WT; - typedef vtype_t VT; - dim_t o_off = y * ostrides[1] + x; - // Copy values from all channels - for(dim_t w = 0; w < odims[3]; w++) { - dim_t wst = w * istrides[3]; - for(dim_t z = 0; z < odims[2]; z++) { - dim_t zst = z * istrides[2]; - dim_t channel_off = zst + wst; - VT p1 = inPtr[i1_y * istrides[1] + i1_x + channel_off]; - VT p2 = inPtr[i2_y * istrides[1] + i1_x + channel_off]; - VT p3 = inPtr[i1_y * istrides[1] + i2_x + channel_off]; - VT p4 = inPtr[i2_y * istrides[1] + i2_x + channel_off]; - - outPtr[o_off + z * ostrides[2] + w * ostrides[3]] = - scalar((1.0f - a) * (1.0f - b)) * p1 + - scalar(( a ) * (1.0f - b)) * p2 + - scalar((1.0f - a) * ( b )) * p3 + - scalar(( a ) * ( b )) * p4; - } - } - } - }; - - template - struct resize_op - { - void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t x, const dim_t y) - { - // Compute Indices - dim_t i_x = floor((float)x / (odims[0] / (float)idims[0])); - dim_t i_y = floor((float)y / (odims[1] / (float)idims[1])); - - if (i_x >= idims[0]) i_x = idims[0] - 1; - if (i_y >= idims[1]) i_y = idims[1] - 1; - - dim_t i_off = i_y * istrides[1] + i_x; - dim_t o_off = y * ostrides[1] + x; - // Copy values from all channels - for(dim_t w = 0; w < odims[3]; w++) { - dim_t wost = w * ostrides[3]; - dim_t wist = w * istrides[3]; - for(dim_t z = 0; z < odims[2]; z++) { - outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist]; - } - } - } - }; - - template - void resize_(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides) - { - resize_op op; - for(dim_t y = 0; y < odims[1]; y++) { - for(dim_t x = 0; x < odims[0]; x++) { - op(outPtr, inPtr, odims, idims, ostrides, istrides, x, y); - } - } - } - - template - Array resize(const Array &in, 
const dim_t odim0, const dim_t odim1, - const af_interp_type method) - { - af::dim4 idims = in.dims(); - af::dim4 odims(odim0, odim1, idims[2], idims[3]); - - // Create output placeholder - Array outArray = createValueArray(odims, (T)0); - - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = outArray.get(); - - af::dim4 ostrides = outArray.strides(); - af::dim4 istrides = in.strides(); - - switch(method) { - case AF_INTERP_NEAREST: - resize_(outPtr, inPtr, odims, idims, ostrides, istrides); - break; - case AF_INTERP_BILINEAR: - resize_(outPtr, inPtr, odims, idims, ostrides, istrides); - break; - case AF_INTERP_LOWER: - resize_(outPtr, inPtr, odims, idims, ostrides, istrides); - break; - default: - break; - } - return outArray; +template +Array resize(const Array &in, const dim_t odim0, const dim_t odim1, + const af_interp_type method) +{ + af::dim4 idims = in.dims(); + af::dim4 odims(odim0, odim1, idims[2], idims[3]); + // Create output placeholder + Array out = createValueArray(odims, (T)0); + out.eval(); + in.eval(); + + switch(method) { + case AF_INTERP_NEAREST: + getQueue().enqueue(kernel::resize, out, in); break; + case AF_INTERP_BILINEAR: + getQueue().enqueue(kernel::resize, out, in); break; + case AF_INTERP_LOWER: + getQueue().enqueue(kernel::resize, out, in); break; + default: break; } + return out; +} - -#define INSTANTIATE(T) \ +#define INSTANTIATE(T) \ template Array resize (const Array &in, const dim_t odim0, const dim_t odim1, \ const af_interp_type method); +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) +INSTANTIATE(ushort) - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(short) - 
INSTANTIATE(ushort) } diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/rotate.cpp index a4af64b669..0fb9b17674 100644 --- a/src/backend/cpu/rotate.cpp +++ b/src/backend/cpu/rotate.cpp @@ -9,112 +9,56 @@ #include #include -#include -#include -#include +#include +#include #include "transform_interp.hpp" +#include namespace cpu { - template - void rotate_(T *out, const T *in, const float theta, - const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides) - { - dim_t nimages = idims[2]; - void (*t_fn)(T *, const T *, const float *, const af::dim4 &, - const af::dim4 &, const af::dim4 &, - const dim_t, const dim_t, const dim_t, const dim_t); - - const float c = cos(-theta), s = sin(-theta); - float tx, ty; - { - const float nx = 0.5 * (idims[0] - 1); - const float ny = 0.5 * (idims[1] - 1); - const float mx = 0.5 * (odims[0] - 1); - const float my = 0.5 * (odims[1] - 1); - const float sx = (mx * c + my *-s); - const float sy = (mx * s + my * c); - tx = -(sx - nx); - ty = -(sy - ny); - } - - const float tmat[6] = {std::round( c * 1000) / 1000.0f, - std::round(-s * 1000) / 1000.0f, - std::round(tx * 1000) / 1000.0f, - std::round( s * 1000) / 1000.0f, - std::round( c * 1000) / 1000.0f, - std::round(ty * 1000) / 1000.0f, - }; - - switch(method) { - case AF_INTERP_NEAREST: - t_fn = &transform_n; - break; - case AF_INTERP_BILINEAR: - t_fn = &transform_b; - break; - case AF_INTERP_LOWER: - t_fn = &transform_l; - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; - } +template +Array rotate(const Array &in, const float theta, const af::dim4 &odims, + const af_interp_type method) +{ + in.eval(); + Array out = createEmptyArray(odims); - // Do transform for image - for(int yy = 0; yy < (int)odims[1]; yy++) { - for(int xx = 0; xx < (int)odims[0]; xx++) { - t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy); - } - } + switch(method) { + case AF_INTERP_NEAREST: + 
getQueue().enqueue(kernel::rotate, out, in, theta); + break; + case AF_INTERP_BILINEAR: + getQueue().enqueue(kernel::rotate, out, in, theta); + break; + case AF_INTERP_LOWER: + getQueue().enqueue(kernel::rotate, out, in, theta); + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; } - template - Array rotate(const Array &in, const float theta, const af::dim4 &odims, - const af_interp_type method) - { - Array out = createEmptyArray(odims); - const af::dim4 idims = in.dims(); - - switch(method) { - case AF_INTERP_NEAREST: - rotate_ - (out.get(), in.get(), theta, odims, idims, out.strides(), in.strides()); - break; - case AF_INTERP_BILINEAR: - rotate_ - (out.get(), in.get(), theta, odims, idims, out.strides(), in.strides()); - break; - case AF_INTERP_LOWER: - rotate_ - (out.get(), in.get(), theta, odims, idims, out.strides(), in.strides()); - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; - } - - return out; - } + return out; +} #define INSTANTIATE(T) \ template Array rotate(const Array &in, const float theta, \ const af::dim4 &odims, const af_interp_type method); - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(short) - INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) +INSTANTIATE(ushort) + } diff --git a/src/backend/cpu/scan.cpp b/src/backend/cpu/scan.cpp index 2bdda210a2..78de4142c8 100644 --- a/src/backend/cpu/scan.cpp +++ b/src/backend/cpu/scan.cpp @@ -14,102 +14,60 @@ #include #include #include +#include +#include +#include using af::dim4; namespace cpu { - template - struct scan_dim - { - void operator()(To *out, const dim4 ostrides, 
const dim4 odims, - const Ti *in , const dim4 istrides, const dim4 idims, - const int dim) - { - const int D1 = D - 1; - for (dim_t i = 0; i < odims[D1]; i++) { - scan_dim()(out + i * ostrides[D1], - ostrides, odims, - in + i * istrides[D1], - istrides, idims, - dim); - if (D1 == dim) break; - } - } - }; - template - struct scan_dim - { - void operator()(To *out, const dim4 ostrides, const dim4 odims, - const Ti *in , const dim4 istrides, const dim4 idims, - const int dim) - { - - dim_t istride = istrides[dim]; - dim_t ostride = ostrides[dim]; - - Transform transform; - // FIXME: Change the name to something better - Binary scan; - - To out_val = scan.init(); - for (dim_t i = 0; i < idims[dim]; i++) { - To in_val = transform(in[i * istride]); - out_val = scan(in_val, out_val); - out[i * ostride] = out_val; - } - } - }; - - template - Array scan(const Array& in, const int dim) - { - dim4 dims = in.dims(); - - Array out = createValueArray(dims, 0); +template +Array scan(const Array& in, const int dim) +{ + dim4 dims = in.dims(); + Array out = createEmptyArray(dims); + in.eval(); - switch (in.ndims()) { + switch (in.ndims()) { case 1: - scan_dim()(out.get(), out.strides(), out.dims(), - in.get(), in.strides(), in.dims(), dim); + kernel::scan_dim func1; + getQueue().enqueue(func1, out, 0, in, 0, dim); break; - case 2: - scan_dim()(out.get(), out.strides(), out.dims(), - in.get(), in.strides(), in.dims(), dim); + kernel::scan_dim func2; + getQueue().enqueue(func2, out, 0, in, 0, dim); break; - case 3: - scan_dim()(out.get(), out.strides(), out.dims(), - in.get(), in.strides(), in.dims(), dim); + kernel::scan_dim func3; + getQueue().enqueue(func3, out, 0, in, 0, dim); break; - case 4: - scan_dim()(out.get(), out.strides(), out.dims(), - in.get(), in.strides(), in.dims(), dim); + kernel::scan_dim func4; + getQueue().enqueue(func4, out, 0, in, 0, dim); break; - } - - return out; } + return out; +} + #define INSTANTIATE(ROp, Ti, To) \ template Array scan(const Array &in, 
const int dim); \ - //accum - INSTANTIATE(af_add_t, float , float ) - INSTANTIATE(af_add_t, double , double ) - INSTANTIATE(af_add_t, cfloat , cfloat ) - INSTANTIATE(af_add_t, cdouble, cdouble) - INSTANTIATE(af_add_t, int , int ) - INSTANTIATE(af_add_t, uint , uint ) - INSTANTIATE(af_add_t, intl , intl ) - INSTANTIATE(af_add_t, uintl , uintl ) - INSTANTIATE(af_add_t, char , int ) - INSTANTIATE(af_add_t, uchar , uint ) - INSTANTIATE(af_add_t, short , int ) - INSTANTIATE(af_add_t, ushort , uint ) - INSTANTIATE(af_notzero_t, char , uint ) +//accum +INSTANTIATE(af_add_t, float , float ) +INSTANTIATE(af_add_t, double , double ) +INSTANTIATE(af_add_t, cfloat , cfloat ) +INSTANTIATE(af_add_t, cdouble, cdouble) +INSTANTIATE(af_add_t, int , int ) +INSTANTIATE(af_add_t, uint , uint ) +INSTANTIATE(af_add_t, intl , intl ) +INSTANTIATE(af_add_t, uintl , uintl ) +INSTANTIATE(af_add_t, char , int ) +INSTANTIATE(af_add_t, uchar , uint ) +INSTANTIATE(af_add_t, short , int ) +INSTANTIATE(af_add_t, ushort , uint ) +INSTANTIATE(af_notzero_t, char , uint) } diff --git a/src/backend/cpu/select.cpp b/src/backend/cpu/select.cpp index 7b2cc81735..1545a81f46 100644 --- a/src/backend/cpu/select.cpp +++ b/src/backend/cpu/select.cpp @@ -6,117 +6,37 @@ * The complete license agreement can be obtained at: * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ + #include #include #include -#include +#include +#include +#include using af::dim4; namespace cpu { - template - void select(Array &out, const Array &cond, const Array &a, const Array &b) - { - dim4 adims = a.dims(); - dim4 astrides = a.strides(); - dim4 bdims = b.dims(); - dim4 bstrides = b.strides(); - - dim4 cdims = cond.dims(); - dim4 cstrides = cond.strides(); - - dim4 odims = out.dims(); - dim4 ostrides = out.strides(); - - bool is_a_same[] = {adims[0] == odims[0], adims[1] == odims[1], - adims[2] == odims[2], adims[3] == odims[3]}; - - bool is_b_same[] = {bdims[0] == odims[0], bdims[1] 
== odims[1], - bdims[2] == odims[2], bdims[3] == odims[3]}; - - bool is_c_same[] = {cdims[0] == odims[0], cdims[1] == odims[1], - cdims[2] == odims[2], cdims[3] == odims[3]}; - - const T *aptr = a.get(); - const T *bptr = b.get(); - T *optr = out.get(); - const char *cptr = cond.get(); - - for (int l = 0; l < odims[3]; l++) { - - int o_off3 = ostrides[3] * l; - int a_off3 = astrides[3] * is_a_same[3] * l; - int b_off3 = bstrides[3] * is_b_same[3] * l; - int c_off3 = cstrides[3] * is_c_same[3] * l; - - for (int k = 0; k < odims[2]; k++) { - - int o_off2 = ostrides[2] * k + o_off3; - int a_off2 = astrides[2] * is_a_same[2] * k + a_off3; - int b_off2 = bstrides[2] * is_b_same[2] * k + b_off3; - int c_off2 = cstrides[2] * is_c_same[2] * k + c_off3; - - for (int j = 0; j < odims[1]; j++) { - - int o_off1 = ostrides[1] * j + o_off2; - int a_off1 = astrides[1] * is_a_same[1] * j + a_off2; - int b_off1 = bstrides[1] * is_b_same[1] * j + b_off2; - int c_off1 = cstrides[1] * is_c_same[1] * j + c_off2; - - for (int i = 0; i < odims[0]; i++) { - - bool cval = is_c_same[0] ? cptr[c_off1 + i] : cptr[c_off1]; - T aval = is_a_same[0] ? aptr[a_off1 + i] : aptr[a_off1]; - T bval = is_b_same[0] ? bptr[b_off1 + i] : bptr[b_off1]; - T oval = cval ? 
aval : bval; - optr[o_off1 + i] = oval; - } - } - } - } - } - template - void select_scalar(Array &out, const Array &cond, const Array &a, const double &b) - { - dim4 astrides = a.strides(); - dim4 cstrides = cond.strides(); - - dim4 odims = out.dims(); - dim4 ostrides = out.strides(); - - const T *aptr = a.get(); - T *optr = out.get(); - const char *cptr = cond.get(); - - for (int l = 0; l < odims[3]; l++) { - - int o_off3 = ostrides[3] * l; - int a_off3 = astrides[3] * l; - int c_off3 = cstrides[3] * l; - - for (int k = 0; k < odims[2]; k++) { - - int o_off2 = ostrides[2] * k + o_off3; - int a_off2 = astrides[2] * k + a_off3; - int c_off2 = cstrides[2] * k + c_off3; - - for (int j = 0; j < odims[1]; j++) { - - int o_off1 = ostrides[1] * j + o_off2; - int a_off1 = astrides[1] * j + a_off2; - int c_off1 = cstrides[1] * j + c_off2; - - for (int i = 0; i < odims[0]; i++) { - - optr[o_off1 + i] = (flip ^ cptr[c_off1 + i]) ? aptr[a_off1 + i] : b; - } - } - } - } - } +template +void select(Array &out, const Array &cond, const Array &a, const Array &b) +{ + out.eval(); + cond.eval(); + a.eval(); + b.eval(); + getQueue().enqueue(kernel::select, out, cond, a, b); +} +template +void select_scalar(Array &out, const Array &cond, const Array &a, const double &b) +{ + out.eval(); + cond.eval(); + a.eval(); + getQueue().enqueue(kernel::select_scalar, out, cond, a, b); +} #define INSTANTIATE(T) \ template void select(Array &out, const Array &cond, \ @@ -130,16 +50,17 @@ namespace cpu const Array &a, \ const double &b); \ - INSTANTIATE(float ) - INSTANTIATE(double ) - INSTANTIATE(cfloat ) - INSTANTIATE(cdouble) - INSTANTIATE(int ) - INSTANTIATE(uint ) - INSTANTIATE(intl ) - INSTANTIATE(uintl ) - INSTANTIATE(char ) - INSTANTIATE(uchar ) - INSTANTIATE(short ) - INSTANTIATE(ushort ) +INSTANTIATE(float ) +INSTANTIATE(double ) +INSTANTIATE(cfloat ) +INSTANTIATE(cdouble) +INSTANTIATE(int ) +INSTANTIATE(uint ) +INSTANTIATE(intl ) +INSTANTIATE(uintl ) +INSTANTIATE(char ) 
+INSTANTIATE(uchar ) +INSTANTIATE(short ) +INSTANTIATE(ushort ) + } diff --git a/src/backend/cpu/set.cpp b/src/backend/cpu/set.cpp index 3215e6d5c2..d6c2a611e0 100644 --- a/src/backend/cpu/set.cpp +++ b/src/backend/cpu/set.cpp @@ -18,105 +18,123 @@ #include #include #include +#include +#include namespace cpu { - using namespace std; - using af::dim4; - - template - Array setUnique(const Array &in, - const bool is_sorted) - { - Array out = createEmptyArray(af::dim4()); - if (is_sorted) out = copyArray(in); - else out = sort(in, 0); - - T *ptr = out.get(); - T *last = std::unique(ptr, ptr + in.elements()); - dim_t dist = (dim_t)std::distance(ptr, last); - - dim4 dims(dist, 1, 1, 1); - out.resetDims(dims); - return out; - } - template - Array setUnion(const Array &first, - const Array &second, - const bool is_unique) - { - Array uFirst = first; - Array uSecond = second; +using namespace std; +using af::dim4; + +template +Array setUnique(const Array &in, + const bool is_sorted) +{ + in.eval(); + + Array out = createEmptyArray(af::dim4()); + if (is_sorted) out = copyArray(in); + else out = sort(in, 0); - if (!is_unique) { - // FIXME: Perhaps copy + unique would do ? 
- uFirst = setUnique(first, false); - uSecond = setUnique(second, false); - } + // Need to sync old jobs since we need to + // operator on pointers directly in std::unique + getQueue().sync(); - dim_t first_elements = uFirst.elements(); - dim_t second_elements = uSecond.elements(); - dim_t elements = first_elements + second_elements; + T *ptr = out.get(); + T *last = std::unique(ptr, ptr + in.elements()); + dim_t dist = (dim_t)std::distance(ptr, last); - Array out = createEmptyArray(af::dim4(elements)); + dim4 dims(dist, 1, 1, 1); + out.resetDims(dims); + return out; +} - T *ptr = out.get(); - T *last = std::set_union(uFirst.get() , uFirst.get() + first_elements, - uSecond.get(), uSecond.get() + second_elements, - ptr); +template +Array setUnion(const Array &first, + const Array &second, + const bool is_unique) +{ + first.eval(); + second.eval(); + getQueue().sync(); - dim_t dist = (dim_t)std::distance(ptr, last); - dim4 dims(dist, 1, 1, 1); - out.resetDims(dims); + Array uFirst = first; + Array uSecond = second; - return out; + if (!is_unique) { + // FIXME: Perhaps copy + unique would do ? 
+ uFirst = setUnique(first, false); + uSecond = setUnique(second, false); } - template - Array setIntersect(const Array &first, - const Array &second, - const bool is_unique) - { - Array uFirst = first; - Array uSecond = second; + dim_t first_elements = uFirst.elements(); + dim_t second_elements = uSecond.elements(); + dim_t elements = first_elements + second_elements; - if (!is_unique) { - uFirst = setUnique(first, false); - uSecond = setUnique(second, false); - } + Array out = createEmptyArray(af::dim4(elements)); - dim_t first_elements = uFirst.elements(); - dim_t second_elements = uSecond.elements(); - dim_t elements = std::max(first_elements, second_elements); + T *ptr = out.get(); + T *last = std::set_union(uFirst.get() , uFirst.get() + first_elements, + uSecond.get(), uSecond.get() + second_elements, + ptr); - Array out = createEmptyArray(af::dim4(elements)); + dim_t dist = (dim_t)std::distance(ptr, last); + dim4 dims(dist, 1, 1, 1); + out.resetDims(dims); + + return out; +} - T *ptr = out.get(); - T *last = std::set_intersection(uFirst.get() , uFirst.get() + first_elements, - uSecond.get(), uSecond.get() + second_elements, - ptr); +template +Array setIntersect(const Array &first, + const Array &second, + const bool is_unique) +{ + first.eval(); + second.eval(); + getQueue().sync(); - dim_t dist = (dim_t)std::distance(ptr, last); - dim4 dims(dist, 1, 1, 1); - out.resetDims(dims); + Array uFirst = first; + Array uSecond = second; - return out; + if (!is_unique) { + uFirst = setUnique(first, false); + uSecond = setUnique(second, false); } + dim_t first_elements = uFirst.elements(); + dim_t second_elements = uSecond.elements(); + dim_t elements = std::max(first_elements, second_elements); + + Array out = createEmptyArray(af::dim4(elements)); + + T *ptr = out.get(); + T *last = std::set_intersection(uFirst.get() , uFirst.get() + first_elements, + uSecond.get(), uSecond.get() + second_elements, + ptr); + + dim_t dist = (dim_t)std::distance(ptr, last); + dim4 
dims(dist, 1, 1, 1); + out.resetDims(dims); + + return out; +} + #define INSTANTIATE(T) \ template Array setUnique(const Array &in, const bool is_sorted); \ template Array setUnion(const Array &first, const Array &second, const bool is_unique); \ template Array setIntersect(const Array &first, const Array &second, const bool is_unique); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(char) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) - INSTANTIATE(intl) - INSTANTIATE(uintl) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(char) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) +INSTANTIATE(intl) +INSTANTIATE(uintl) + } diff --git a/src/backend/cpu/shift.cpp b/src/backend/cpu/shift.cpp index 05cac4c678..041f1ab8ba 100644 --- a/src/backend/cpu/shift.cpp +++ b/src/backend/cpu/shift.cpp @@ -9,80 +9,40 @@ #include #include -#include -#include -#include +#include +#include +#include namespace cpu { - static inline dim_t simple_mod(const dim_t i, const dim_t dim) - { - return (i < dim) ? i : (i - dim); - } - template - Array shift(const Array &in, const int sdims[4]) - { - const af::dim4 iDims = in.dims(); - af::dim4 oDims = iDims; - - Array out = createEmptyArray(oDims); - - T* outPtr = out.get(); - const T* inPtr = in.get(); - - const af::dim4 ist = in.strides(); - const af::dim4 ost = out.strides(); - - int sdims_[4]; - // Need to do this because we are mapping output to input in the kernel - for(int i = 0; i < 4; i++) { - // sdims_[i] will always be positive and always [0, oDims[i]]. 
- // Negative shifts are converted to position by going the other way round - sdims_[i] = -(sdims[i] % (int)oDims[i]) + oDims[i] * (sdims[i] > 0); - assert(sdims_[i] >= 0 && sdims_[i] <= oDims[i]); - } +template +Array shift(const Array &in, const int sdims[4]) +{ + in.eval(); - for(dim_t ow = 0; ow < oDims[3]; ow++) { - const int oW = ow * ost[3]; - const int iw = simple_mod((ow + sdims_[3]), oDims[3]); - const int iW = iw * ist[3]; - for(dim_t oz = 0; oz < oDims[2]; oz++) { - const int oZW = oW + oz * ost[2]; - const int iz = simple_mod((oz + sdims_[2]), oDims[2]); - const int iZW = iW + iz * ist[2]; - for(dim_t oy = 0; oy < oDims[1]; oy++) { - const int oYZW = oZW + oy * ost[1]; - const int iy = simple_mod((oy + sdims_[1]), oDims[1]); - const int iYZW = iZW + iy * ist[1]; - for(dim_t ox = 0; ox < oDims[0]; ox++) { - const int oIdx = oYZW + ox; - const int ix = simple_mod((ox + sdims_[0]), oDims[0]); - const int iIdx = iYZW + ix; + Array out = createEmptyArray(in.dims()); + const af::dim4 temp(sdims[0], sdims[1], sdims[2], sdims[3]); - outPtr[oIdx] = inPtr[iIdx]; - } - } - } - } + getQueue().enqueue(kernel::shift, out, in, temp); - return out; - } + return out; +} #define INSTANTIATE(T) \ template Array shift(const Array &in, const int sdims[4]); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(short) - INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) +INSTANTIATE(ushort) } diff --git a/src/backend/cpu/sift.cpp b/src/backend/cpu/sift.cpp index 70bb11d1ae..0345e37485 100644 --- a/src/backend/cpu/sift.cpp +++ b/src/backend/cpu/sift.cpp @@ -21,8 +21,8 @@ #include #include -#ifdef AF_BUILD_SIFT -#include 
+#ifdef AF_BUILD_NONFREE_SIFT +#include #endif using af::dim4; @@ -39,7 +39,7 @@ unsigned sift(Array& x, Array& y, Array& score, const float img_scale, const float feature_ratio, const bool compute_GLOH) { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT return sift_impl(x, y, score, ori, size, desc, in, n_layers, contrast_thr, edge_thr, init_sigma, double_input, img_scale, feature_ratio, compute_GLOH); diff --git a/src/backend/cpu/sift_nonfree.hpp b/src/backend/cpu/sift_nonfree.hpp deleted file mode 100644 index 514a134c7d..0000000000 --- a/src/backend/cpu/sift_nonfree.hpp +++ /dev/null @@ -1,1193 +0,0 @@ -/******************************************************* - * Copyright (c) 2015, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -// The source code contained in this file is based on the original code by -// Rob Hess. Please note that SIFT is an algorithm patented and protected -// by US law, before using this code or any binary forms generated from it, -// verify that you have permission to do so. The original license by Rob Hess -// can be read below: -// -// Copyright (c) 2006-2012, Rob Hess -// All rights reserved. -// -// The following patent has been issued for methods embodied in this -// software: "Method and apparatus for identifying scale invariant features -// in an image and use of same for locating an object in an image," David -// G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application -// filed March 8, 1999. Asignee: The University of British Columbia. For -// further details, contact David Lowe (lowe@cs.ubc.ca) or the -// University-Industry Liaison Office of the University of British -// Columbia. 
-// -// Note that restrictions imposed by this patent (and possibly others) -// exist independently of and may be in conflict with the freedoms granted -// in this license, which refers to copyright of the program, not patents -// for any methods that it implements. Both copyright and patent law must -// be obeyed to legally use and redistribute this program and it is not the -// purpose of this license to induce you to infringe any patents or other -// property right claims or to contest validity of any such claims. If you -// redistribute or use the program, then this license merely protects you -// from committing copyright infringement. It does not protect you from -// committing patent infringement. So, before you do anything with this -// program, make sure that you have permission to do so not merely in terms -// of copyright, but also in terms of patent law. -// -// Please note that this license is not to be understood as a guarantee -// either. If you use the program according to this license, but in -// conflict with patent law, it does not mean that the licensor will refund -// you for any losses that you incur if you are sued for your patent -// infringement. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// * Redistributions of source code must retain the above copyright and -// patent notices, this list of conditions and the following -// disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in -// the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Oregon State University nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -using af::dim4; - -namespace cpu -{ - - static const float PI_VAL = 3.14159265358979323846f; - -// default width of descriptor histogram array - static const int DescrWidth = 4; - -// default number of bins per histogram in descriptor array - static const int DescrHistBins = 8; - -// assumed gaussian blur for input image - static const float InitSigma = 0.5f; - -// width of border in which to ignore keypoints - static const int ImgBorder = 5; - -// maximum steps of keypoint interpolation before failure - static const int MaxInterpSteps = 5; - -// default number of bins in histogram for orientation assignment - static const int OriHistBins = 36; - -// determines gaussian sigma for orientation assignment - static const float OriSigFctr = 1.5f; - -// determines the radius of the region used in orientation assignment */ - static const float OriRadius = 3.0f * OriSigFctr; - -// number of passes of orientation histogram smoothing - static const int SmoothOriPasses = 2; - -// orientation magnitude relative to max that results in new feature - static const float OriPeakRatio = 0.8f; - -// determines the size of a single descriptor orientation histogram - static const float DescrSclFctr = 3.f; - 
-// threshold on magnitude of elements of descriptor vector - static const float DescrMagThr = 0.2f; - -// factor used to convert floating-point descriptor to unsigned char - static const float IntDescrFctr = 512.f; - -// Number of GLOH bins in radial direction - static const unsigned GLOHRadialBins = 3; - -// Radiuses of GLOH descriptors - static const float GLOHRadii[GLOHRadialBins] = {6.f, 11.f, 15.f}; - -// Number of GLOH angular bins (excluding the inner-most radial section) - static const unsigned GLOHAngularBins = 8; - -// Number of GLOH bins per histogram in descriptor - static const unsigned GLOHHistBins = 16; - - typedef struct - { - float f[4]; - unsigned l; - } feat_t; - - bool feat_cmp(feat_t i, feat_t j) - { - for (int k = 0; k < 4; k++) - if (i.f[k] != j.f[k]) - return (i.f[k] < j.f[k]); - if (i.l != j.l) - return (i.l < j.l); - - return true; - } - - void array_to_feat(std::vector& feat, float *x, float *y, unsigned *layer, float *resp, float *size, unsigned nfeat) - { - feat.resize(nfeat); - for (unsigned i = 0; i < feat.size(); i++) { - feat[i].f[0] = x[i]; - feat[i].f[1] = y[i]; - feat[i].f[2] = resp[i]; - feat[i].f[3] = size[i]; - feat[i].l = layer[i]; - } - } - - template - void gaussian1D(T* out, const int dim, double sigma=0.0) - { - if(!(sigma>0)) sigma = 0.25*dim; - - T sum = (T)0; - for(int i=0;i - Array gauss_filter(float sigma) - { - // Using 6-sigma rule - unsigned gauss_len = std::min((unsigned)round(sigma * 6 + 1) | 1, 31u); - - Array filter = createEmptyArray(gauss_len); - gaussian1D((T*)getDevicePtr(filter), gauss_len, sigma); - - return filter; - } - - template - void gaussianElimination(float* A, float* b, float* x) - { - // forward elimination - for (int i = 0; i < N-1; i++) { - for (int j = i+1; j < N; j++) { - float s = A[j*N+i] / A[i*N+i]; - - for (int k = i; k < N; k++) - A[j*N+k] -= s * A[i*N+k]; - - b[j] -= s * b[i]; - } - } - - for (int i = 0; i < N; i++) - x[i] = 0; - - // backward substitution - float sum = 0; - for (int 
i = 0; i <= N-2; i++) { - sum = b[i]; - for (int j = i+1; j < N; j++) - sum -= A[i*N+j] * x[j]; - x[i] = sum / A[i*N+i]; - } - } - - template - void sub( - Array& out, - const Array& in1, - const Array& in2) - { - size_t nel = in1.elements(); - T* out_ptr = out.get(); - const T* in1_ptr = in1.get(); - const T* in2_ptr = in2.get(); - - for (size_t i = 0; i < nel; i++) { - out_ptr[i] = in1_ptr[i] - in2_ptr[i]; - } - } - -#define CPTR(Y, X) (center_ptr[(Y) * idims[0] + (X)]) -#define PPTR(Y, X) (prev_ptr[(Y) * idims[0] + (X)]) -#define NPTR(Y, X) (next_ptr[(Y) * idims[0] + (X)]) - -// Determines whether a pixel is a scale-space extremum by comparing it to its -// 3x3x3 pixel neighborhood. - template - void detectExtrema( - float* x_out, - float* y_out, - unsigned* layer_out, - unsigned* counter, - const Array& prev, - const Array& center, - const Array& next, - const unsigned layer, - const unsigned max_feat, - const float threshold) - { - const af::dim4 idims = center.dims(); - const T* prev_ptr = prev.get(); - const T* center_ptr = center.get(); - const T* next_ptr = next.get(); - - for (int y = ImgBorder; y < idims[1]-ImgBorder; y++) { - for (int x = ImgBorder; x < idims[0]-ImgBorder; x++) { - float p = center_ptr[y*idims[0] + x]; - - // Find extrema - if (abs((float)p) > threshold && - ((p > 0 && p > CPTR(y-1, x-1) && p > CPTR(y-1, x) && - p > CPTR(y-1, x+1) && p > CPTR(y, x-1) && p > CPTR(y, x+1) && - p > CPTR(y+1, x-1) && p > CPTR(y+1, x) && p > CPTR(y+1, x+1) && - p > PPTR(y-1, x-1) && p > PPTR(y-1, x) && p > PPTR(y-1, x+1) && - p > PPTR(y, x-1) && p > PPTR(y , x) && p > PPTR(y, x+1) && - p > PPTR(y+1, x-1) && p > PPTR(y+1, x) && p > PPTR(y+1, x+1) && - p > NPTR(y-1, x-1) && p > NPTR(y-1, x) && p > NPTR(y-1, x+1) && - p > NPTR(y, x-1) && p > NPTR(y , x) && p > NPTR(y, x+1) && - p > NPTR(y+1, x-1) && p > NPTR(y+1, x) && p > NPTR(y+1, x+1)) || - (p < 0 && p < CPTR(y-1, x-1) && p < CPTR(y-1, x) && - p < CPTR(y-1, x+1) && p < CPTR(y, x-1) && p < CPTR(y, x+1) && - p 
< CPTR(y+1, x-1) && p < CPTR(y+1, x) && p < CPTR(y+1, x+1) && - p < PPTR(y-1, x-1) && p < PPTR(y-1, x) && p < PPTR(y-1, x+1) && - p < PPTR(y, x-1) && p < PPTR(y , x) && p < PPTR(y, x+1) && - p < PPTR(y+1, x-1) && p < PPTR(y+1, x) && p < PPTR(y+1, x+1) && - p < NPTR(y-1, x-1) && p < NPTR(y-1, x) && p < NPTR(y-1, x+1) && - p < NPTR(y, x-1) && p < NPTR(y , x) && p < NPTR(y, x+1) && - p < NPTR(y+1, x-1) && p < NPTR(y+1, x) && p < NPTR(y+1, x+1)))) { - - if (*counter < max_feat) - { - x_out[*counter] = (float)y; - y_out[*counter] = (float)x; - layer_out[*counter] = layer; - (*counter)++; - } - } - } - } - } - -// Interpolates a scale-space extremum's location and scale to subpixel -// accuracy to form an image feature. Rejects features with low contrast. -// Based on Section 4 of Lowe's paper. - template - void interpolateExtrema( - float* x_out, - float* y_out, - unsigned* layer_out, - float* response_out, - float* size_out, - unsigned* counter, - const float* x_in, - const float* y_in, - const unsigned* layer_in, - const unsigned extrema_feat, - std::vector< Array >& dog_pyr, - const unsigned max_feat, - const unsigned octave, - const unsigned n_layers, - const float contrast_thr, - const float edge_thr, - const float sigma, - const float img_scale) - { - for (int f = 0; f < (int)extrema_feat; f++) { - const float first_deriv_scale = img_scale*0.5f; - const float second_deriv_scale = img_scale; - const float cross_deriv_scale = img_scale*0.25f; - - float xl = 0, xy = 0, xx = 0, contr = 0; - int i = 0; - - unsigned x = x_in[f]; - unsigned y = y_in[f]; - unsigned layer = layer_in[f]; - - const T* prev_ptr = dog_pyr[octave*(n_layers+2) + layer-1].get(); - const T* center_ptr = dog_pyr[octave*(n_layers+2) + layer].get(); - const T* next_ptr = dog_pyr[octave*(n_layers+2) + layer+1].get(); - - af::dim4 idims = dog_pyr[octave*(n_layers+2)].dims(); - - bool converges = true; - - for (i = 0; i < MaxInterpSteps; i++) { - float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * 
first_deriv_scale, - (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale, - (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale}; - - float d2 = CPTR(x, y) * 2.f; - float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale; - float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale; - float dss = (NPTR(x, y ) + PPTR(x, y ) - d2) * second_deriv_scale; - float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) - - CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale; - float dxs = (NPTR(x+1, y) - NPTR(x-1, y) - - PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale; - float dys = (NPTR(x, y+1) - NPTR(x-1, y-1) - - PPTR(x, y-1) + PPTR(x-1, y-1)) * cross_deriv_scale; - - float H[9] = {dxx, dxy, dxs, - dxy, dyy, dys, - dxs, dys, dss}; - - float X[3]; - gaussianElimination<3>(H, dD, X); - - xl = -X[2]; - xy = -X[1]; - xx = -X[0]; - - if (fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f) - break; - - x += round(xx); - y += round(xy); - layer += round(xl); - - if (layer < 1 || layer > n_layers || - x < ImgBorder || x >= idims[1] - ImgBorder || - y < ImgBorder || y >= idims[0] - ImgBorder) { - converges = false; - break; - } - } - - // ensure convergence of interpolation - if (i >= MaxInterpSteps || !converges) - continue; - - float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale, - (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale, - (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale}; - float X[3] = {xx, xy, xl}; - - float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2]; - - contr = center_ptr[x*idims[0]+y]*img_scale + P * 0.5f; - if(abs(contr) < (contrast_thr / n_layers)) - continue; - - // principal curvatures are computed using the trace and det of Hessian - float d2 = CPTR(x, y) * 2.f; - float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale; - float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale; - float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) - - CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale; - - 
float tr = dxx + dyy; - float det = dxx * dyy - dxy * dxy; - - // add FLT_EPSILON for double-precision compatibility - if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON) - continue; - - if (*counter < max_feat) - { - x_out[*counter] = (x + xx) * (1 << octave); - y_out[*counter] = (y + xy) * (1 << octave); - layer_out[*counter] = layer; - response_out[*counter] = abs(contr); - size_out[*counter] = sigma*pow(2.f, octave + (layer + xl) / n_layers) * 2.f; - (*counter)++; - } - } - } - -#undef CPTR -#undef PPTR -#undef NPTR - -// Remove duplicate keypoints - void removeDuplicates( - float* x_out, - float* y_out, - unsigned* layer_out, - float* response_out, - float* size_out, - unsigned* counter, - const std::vector& sorted_feat) - { - size_t nfeat = sorted_feat.size(); - - for (size_t f = 0; f < nfeat; f++) { - float prec_fctr = 1e4f; - - if (f < nfeat-1) { - if (round(sorted_feat[f].f[0]*prec_fctr) == round(sorted_feat[f+1].f[0]*prec_fctr) && - round(sorted_feat[f].f[1]*prec_fctr) == round(sorted_feat[f+1].f[1]*prec_fctr) && - round(sorted_feat[f].f[2]*prec_fctr) == round(sorted_feat[f+1].f[2]*prec_fctr) && - round(sorted_feat[f].f[3]*prec_fctr) == round(sorted_feat[f+1].f[3]*prec_fctr) && - sorted_feat[f].l == sorted_feat[f+1].l) - continue; - } - - x_out[*counter] = sorted_feat[f].f[0]; - y_out[*counter] = sorted_feat[f].f[1]; - response_out[*counter] = sorted_feat[f].f[2]; - size_out[*counter] = sorted_feat[f].f[3]; - layer_out[*counter] = sorted_feat[f].l; - (*counter)++; - } - } - -#define IPTR(Y, X) (img_ptr[(Y) * idims[0] + (X)]) - -// Computes a canonical orientation for each image feature in an array. Based -// on Section 5 of Lowe's paper. This function adds features to the array when -// there is more than one dominant orientation at a given feature location. 
- template - void calcOrientation( - float* x_out, - float* y_out, - unsigned* layer_out, - float* response_out, - float* size_out, - float* ori_out, - unsigned* counter, - const float* x_in, - const float* y_in, - const unsigned* layer_in, - const float* response_in, - const float* size_in, - const unsigned total_feat, - const std::vector< Array >& gauss_pyr, - const unsigned max_feat, - const unsigned octave, - const unsigned n_layers, - const bool double_input) - { - const int n = OriHistBins; - - float hist[OriHistBins]; - float temphist[OriHistBins]; - - for (unsigned f = 0; f < total_feat; f++) { - // Load keypoint information - const float real_x = x_in[f]; - const float real_y = y_in[f]; - const unsigned layer = layer_in[f]; - const float response = response_in[f]; - const float size = size_in[f]; - - const int pt_x = (int)round(real_x / (1 << octave)); - const int pt_y = (int)round(real_y / (1 << octave)); - - // Calculate auxiliary parameters - const float scl_octv = size*0.5f / (1 << octave); - const int radius = (int)round(OriRadius * scl_octv); - const float sigma = OriSigFctr * scl_octv; - const int len = (radius*2+1); - const float exp_denom = 2.f * sigma * sigma; - - // Points img to correct Gaussian pyramid layer - const Array img = gauss_pyr[octave*(n_layers+3) + layer]; - const T* img_ptr = img.get(); - - for (int i = 0; i < OriHistBins; i++) - hist[i] = 0.f; - - af::dim4 idims = img.dims(); - - // Calculate orientation histogram - for (int l = 0; l < len*len; l++) { - int i = l / len - radius; - int j = l % len - radius; - - int y = pt_y + i; - int x = pt_x + j; - if (y < 1 || y >= idims[0] - 1 || - x < 1 || x >= idims[1] - 1) - continue; - - float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y)); - float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1)); - - float mag = sqrt(dx*dx+dy*dy); - float ori = atan2(dy,dx); - float w = exp(-(i*i + j*j)/exp_denom); - - int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL)); - bin = bin < n ? 
bin : 0; - - hist[bin] += w*mag; - } - - for (int i = 0; i < SmoothOriPasses; i++) { - for (int j = 0; j < n; j++) { - temphist[j] = hist[j]; - } - for (int j = 0; j < n; j++) { - float prev = (j == 0) ? temphist[n-1] : temphist[j-1]; - float next = (j+1 == n) ? temphist[0] : temphist[j+1]; - hist[j] = 0.25f * prev + 0.5f * temphist[j] + 0.25f * next; - } - } - - float omax = hist[0]; - for (int i = 1; i < n; i++) - omax = max(omax, hist[i]); - - float mag_thr = (float)(omax * OriPeakRatio); - int l, r; - for (int j = 0; j < n; j++) { - l = (j == 0) ? n - 1 : j - 1; - r = (j + 1) % n; - if (hist[j] > hist[l] && - hist[j] > hist[r] && - hist[j] >= mag_thr) { - if (*counter < max_feat) { - float bin = j + 0.5f * (hist[l] - hist[r]) / - (hist[l] - 2.0f*hist[j] + hist[r]); - bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin; - float ori = 360.f - ((360.f/n) * bin); - - float new_real_x = real_x; - float new_real_y = real_y; - float new_size = size; - - if (double_input) { - float scale = 0.5f; - new_real_x *= scale; - new_real_y *= scale; - new_size *= scale; - } - - x_out[*counter] = new_real_x; - y_out[*counter] = new_real_y; - layer_out[*counter] = layer; - response_out[*counter] = response; - size_out[*counter] = new_size; - ori_out[*counter] = ori; - (*counter)++; - } - } - } - } - } - - void normalizeDesc( - float* desc, - const int histlen) - { - float len_sq = 0.0f; - - for (int i = 0; i < histlen; i++) - len_sq += desc[i] * desc[i]; - - float len_inv = 1.0f / sqrt(len_sq); - - for (int i = 0; i < histlen; i++) { - desc[i] *= len_inv; - } - } - -// Computes feature descriptors for features in an array. Based on Section 6 -// of Lowe's paper. 
- template - void computeDescriptor( - float* desc_out, - const unsigned desc_len, - const float* x_in, - const float* y_in, - const unsigned* layer_in, - const float* response_in, - const float* size_in, - const float* ori_in, - const unsigned total_feat, - const std::vector< Array >& gauss_pyr, - const int d, - const int n, - const float scale, - const unsigned octave, - const unsigned n_layers) - { - float desc[128]; - - for (unsigned f = 0; f < total_feat; f++) { - const unsigned layer = layer_in[f]; - float ori = (360.f - ori_in[f]) * PI_VAL / 180.f; - ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori; - const float size = size_in[f]; - const int fx = round(x_in[f] * scale); - const int fy = round(y_in[f] * scale); - - // Points img to correct Gaussian pyramid layer - Array img = gauss_pyr[octave*(n_layers+3) + layer]; - const T* img_ptr = img.get(); - af::dim4 idims = img.dims(); - - float cos_t = cos(ori); - float sin_t = sin(ori); - float bins_per_rad = n / (PI_VAL * 2.f); - float exp_denom = d * d * 0.5f; - float hist_width = DescrSclFctr * size * scale * 0.5f; - int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f; - - int len = radius*2+1; - - for (int i = 0; i < (int)desc_len; i++) - desc[i] = 0.f; - - // Calculate orientation histogram - for (int l = 0; l < len*len; l++) { - int i = l / len - radius; - int j = l % len - radius; - - int y = fy + i; - int x = fx + j; - - float x_rot = (j * cos_t - i * sin_t) / hist_width; - float y_rot = (j * sin_t + i * cos_t) / hist_width; - float xbin = x_rot + d/2 - 0.5f; - float ybin = y_rot + d/2 - 0.5f; - - if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d && - y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) { - float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y)); - float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1)); - - float grad_mag = sqrt(dx*dx + dy*dy); - float grad_ori = atan2(dy, dx) - ori; - while (grad_ori < 0.0f) - grad_ori += PI_VAL*2; - while (grad_ori >= PI_VAL*2) - grad_ori -= PI_VAL*2; - 
- float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom); - float obin = grad_ori * bins_per_rad; - float mag = grad_mag*w; - - int x0 = floor(xbin); - int y0 = floor(ybin); - int o0 = floor(obin); - xbin -= x0; - ybin -= y0; - obin -= o0; - - for (int yl = 0; yl <= 1; yl++) { - int yb = y0 + yl; - if (yb >= 0 && yb < d) { - float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin); - for (int xl = 0; xl <= 1; xl++) { - int xb = x0 + xl; - if (xb >= 0 && xb < d) { - float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin); - for (int ol = 0; ol <= 1; ol++) { - int ob = (o0 + ol) % n; - float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin); - desc[(yb*d + xb)*n + ob] += v_o; - } - } - } - } - } - } - } - - normalizeDesc(desc, desc_len); - - for (int i = 0; i < (int)desc_len; i++) - desc[i] = min(desc[i], DescrMagThr); - - normalizeDesc(desc, desc_len); - - // Calculate final descriptor values - for (int k = 0; k < (int)desc_len; k++) { - desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr)); - } - } - } - -// Computes GLOH feature descriptors for features in an array. Based on Section III-B -// of Mikolajczyk and Schmid paper. - template - void computeGLOHDescriptor( - float* desc_out, - const unsigned desc_len, - const float* x_in, - const float* y_in, - const unsigned* layer_in, - const float* response_in, - const float* size_in, - const float* ori_in, - const unsigned total_feat, - const std::vector< Array >& gauss_pyr, - const int d, - const unsigned rb, - const unsigned ab, - const unsigned hb, - const float scale, - const unsigned octave, - const unsigned n_layers) - { - float desc[272]; - - for (unsigned f = 0; f < total_feat; f++) { - const unsigned layer = layer_in[f]; - float ori = (360.f - ori_in[f]) * PI_VAL / 180.f; - ori = (ori > PI_VAL) ? 
ori - PI_VAL*2 : ori; - const float size = size_in[f]; - const int fx = round(x_in[f] * scale); - const int fy = round(y_in[f] * scale); - - // Points img to correct Gaussian pyramid layer - Array img = gauss_pyr[octave*(n_layers+3) + layer]; - const T* img_ptr = img.get(); - af::dim4 idims = img.dims(); - - float cos_t = cos(ori); - float sin_t = sin(ori); - float hist_bins_per_rad = hb / (PI_VAL * 2.f); - float polar_bins_per_rad = ab / (PI_VAL * 2.f); - float exp_denom = GLOHRadii[rb-1] * 0.5f; - - float hist_width = DescrSclFctr * size * scale * 0.5f; - - // Keep same descriptor radius used for SIFT - int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f; - - // Alternative radius size calculation, changing the radius weight - // (rw) in the range of 0.25f-0.75f gives different results, - // increasing it tends to show a better recall rate but with a - // smaller amount of correct matches - //float rw = 0.5f; - //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f; - - int len = radius*2+1; - - for (int i = 0; i < (int)desc_len; i++) - desc[i] = 0.f; - - // Calculate orientation histogram - for (int l = 0; l < len*len; l++) { - int i = l / len - radius; - int j = l % len - radius; - - int y = fy + i; - int x = fx + j; - - float x_rot = (j * cos_t - i * sin_t); - float y_rot = (j * sin_t + i * cos_t); - - float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1]; - float theta = atan2(y_rot, x_rot); - while (theta < 0.0f) - theta += PI_VAL*2; - while (theta >= PI_VAL*2) - theta -= PI_VAL*2; - - float tbin = theta * polar_bins_per_rad; - float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] : - ((r < GLOHRadii[1]) ? 
1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) : - min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON)); - - if (r <= GLOHRadii[rb-1] && - y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) { - float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y)); - float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1)); - - float grad_mag = sqrt(dx*dx + dy*dy); - float grad_ori = atan2(dy, dx) - ori; - while (grad_ori < 0.0f) - grad_ori += PI_VAL*2; - while (grad_ori >= PI_VAL*2) - grad_ori -= PI_VAL*2; - - float w = exp(-r / exp_denom); - float obin = grad_ori * hist_bins_per_rad; - float mag = grad_mag*w; - - int t0 = floor(tbin); - int r0 = floor(rbin); - int o0 = floor(obin); - tbin -= t0; - rbin -= r0; - obin -= o0; - - for (int rl = 0; rl <= 1; rl++) { - int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl); - float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin); - if (rb >= 0 && rb <= 2) { - for (int tl = 0; tl <= 1; tl++) { - int tb = (t0 + tl) % ab; - float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin); - for (int ol = 0; ol <= 1; ol++) { - int ob = (o0 + ol) % hb; - float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin); - unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob; - desc[idx] += v_o; - } - } - } - } - } - } - - normalizeDesc(desc, desc_len); - - for (int i = 0; i < (int)desc_len; i++) - desc[i] = min(desc[i], DescrMagThr); - - normalizeDesc(desc, desc_len); - - // Calculate final descriptor values - for (int k = 0; k < (int)desc_len; k++) { - desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr)); - } - } - } - -#undef IPTR - - template - Array createInitialImage( - const Array& img, - const float init_sigma, - const bool double_input) - { - af::dim4 idims = img.dims(); - - Array init_img = createEmptyArray(af::dim4()); - - float s = (double_input) ? 
std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4), 0.1f) - : std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma), 0.1f); - - Array filter = gauss_filter(s); - - if (double_input) { - Array double_img = resize(img, idims[0] * 2, idims[1] * 2, AF_INTERP_BILINEAR); - init_img = convolve2(double_img, filter, filter); - } - else { - init_img = convolve2(img, filter, filter); - } - - return init_img; - } - - template - std::vector< Array > buildGaussPyr( - const Array& init_img, - const unsigned n_octaves, - const unsigned n_layers, - const float init_sigma) - { - // Precompute Gaussian sigmas using the following formula: - // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2 - std::vector sig_layers(n_layers + 3); - sig_layers[0] = init_sigma; - float k = std::pow(2.0f, 1.0f / n_layers); - for (unsigned i = 1; i < n_layers + 3; i++) { - float sig_prev = std::pow(k, i-1) * init_sigma; - float sig_total = sig_prev * k; - sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev); - } - - // Gaussian Pyramid - std::vector< Array > gauss_pyr(n_octaves * (n_layers+3), createEmptyArray(af::dim4())); - for (unsigned o = 0; o < n_octaves; o++) { - for (unsigned l = 0; l < n_layers+3; l++) { - unsigned src_idx = (l == 0) ? 
(o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1; - unsigned idx = o*(n_layers+3) + l; - - if (o == 0 && l == 0) { - gauss_pyr[idx] = init_img; - } - else if (l == 0) { - af::dim4 sdims = gauss_pyr[src_idx].dims(); - gauss_pyr[idx] = resize(gauss_pyr[src_idx], sdims[0] / 2, sdims[1] / 2, AF_INTERP_BILINEAR); - } - else { - Array filter = gauss_filter(sig_layers[l]); - - gauss_pyr[idx] = convolve2(gauss_pyr[src_idx], filter, filter); - } - } - } - - return gauss_pyr; - } - - template - std::vector< Array > buildDoGPyr( - std::vector< Array >& gauss_pyr, - const unsigned n_octaves, - const unsigned n_layers) - { - // DoG Pyramid - std::vector< Array > dog_pyr(n_octaves * (n_layers+2), createEmptyArray(af::dim4())); - for (unsigned o = 0; o < n_octaves; o++) { - for (unsigned l = 0; l < n_layers+2; l++) { - unsigned idx = o*(n_layers+2) + l; - unsigned bottom = o*(n_layers+3) + l; - unsigned top = o*(n_layers+3) + l+1; - - dog_pyr[idx] = createEmptyArray(gauss_pyr[bottom].dims()); - - sub(dog_pyr[idx], gauss_pyr[top], gauss_pyr[bottom]); - } - } - - return dog_pyr; - } - - - template - unsigned sift_impl(Array& x, Array& y, Array& score, - Array& ori, Array& size, Array& desc, - const Array& in, const unsigned n_layers, - const float contrast_thr, const float edge_thr, - const float init_sigma, const bool double_input, - const float img_scale, const float feature_ratio, - const bool compute_GLOH) - { - af::dim4 idims = in.dims(); - - const unsigned min_dim = (double_input) ? 
min(idims[0]*2, idims[1]*2) - : min(idims[0], idims[1]); - const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2; - - Array init_img = createInitialImage(in, init_sigma, double_input); - - std::vector< Array > gauss_pyr = buildGaussPyr(init_img, n_octaves, n_layers, init_sigma); - - std::vector< Array > dog_pyr = buildDoGPyr(gauss_pyr, n_octaves, n_layers); - - std::vector x_pyr(n_octaves, NULL); - std::vector y_pyr(n_octaves, NULL); - std::vector response_pyr(n_octaves, NULL); - std::vector size_pyr(n_octaves, NULL); - std::vector ori_pyr(n_octaves, NULL); - std::vector desc_pyr(n_octaves, NULL); - std::vector feat_pyr(n_octaves, 0); - unsigned total_feat = 0; - - const unsigned d = DescrWidth; - const unsigned n = DescrHistBins; - const unsigned rb = GLOHRadialBins; - const unsigned ab = GLOHAngularBins; - const unsigned hb = GLOHHistBins; - const unsigned desc_len = (compute_GLOH) ? (1 + (rb-1) * ab) * hb : d*d*n; - - for (unsigned i = 0; i < n_octaves; i++) { - af::dim4 ddims = dog_pyr[i*(n_layers+2)].dims(); - if (ddims[0]-2*ImgBorder < 1 || - ddims[1]-2*ImgBorder < 1) - continue; - - const unsigned imel = ddims[0] * ddims[1]; - const unsigned max_feat = ceil(imel * feature_ratio); - - float* extrema_x = memAlloc(max_feat); - float* extrema_y = memAlloc(max_feat); - unsigned* extrema_layer = memAlloc(max_feat); - unsigned extrema_feat = 0; - - for (unsigned j = 1; j <= n_layers; j++) { - unsigned prev = i*(n_layers+2) + j-1; - unsigned center = i*(n_layers+2) + j; - unsigned next = i*(n_layers+2) + j+1; - - unsigned layer = j; - - float extrema_thr = 0.5f * contrast_thr / n_layers; - detectExtrema(extrema_x, extrema_y, extrema_layer, &extrema_feat, - dog_pyr[prev], dog_pyr[center], dog_pyr[next], - layer, max_feat, extrema_thr); - } - - extrema_feat = min(extrema_feat, max_feat); - - if (extrema_feat == 0) { - memFree(extrema_x); - memFree(extrema_y); - memFree(extrema_layer); - - continue; - } - - unsigned interp_feat = 0; - - float* interp_x = 
memAlloc(extrema_feat); - float* interp_y = memAlloc(extrema_feat); - unsigned* interp_layer = memAlloc(extrema_feat); - float* interp_response = memAlloc(extrema_feat); - float* interp_size = memAlloc(extrema_feat); - - interpolateExtrema(interp_x, interp_y, interp_layer, - interp_response, interp_size, &interp_feat, - extrema_x, extrema_y, extrema_layer, extrema_feat, - dog_pyr, max_feat, i, n_layers, - contrast_thr, edge_thr, init_sigma, img_scale); - - interp_feat = min(interp_feat, max_feat); - - if (interp_feat == 0) { - memFree(interp_x); - memFree(interp_y); - memFree(interp_layer); - memFree(interp_response); - memFree(interp_size); - - continue; - } - - std::vector sorted_feat; - array_to_feat(sorted_feat, interp_x, interp_y, interp_layer, interp_response, interp_size, interp_feat); - std::stable_sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp); - - memFree(interp_x); - memFree(interp_y); - memFree(interp_layer); - memFree(interp_response); - memFree(interp_size); - - unsigned nodup_feat = 0; - - float* nodup_x = memAlloc(interp_feat); - float* nodup_y = memAlloc(interp_feat); - unsigned* nodup_layer = memAlloc(interp_feat); - float* nodup_response = memAlloc(interp_feat); - float* nodup_size = memAlloc(interp_feat); - - removeDuplicates(nodup_x, nodup_y, nodup_layer, - nodup_response, nodup_size, &nodup_feat, - sorted_feat); - - const unsigned max_oriented_feat = nodup_feat * 3; - - float* oriented_x = memAlloc(max_oriented_feat); - float* oriented_y = memAlloc(max_oriented_feat); - unsigned* oriented_layer = memAlloc(max_oriented_feat); - float* oriented_response = memAlloc(max_oriented_feat); - float* oriented_size = memAlloc(max_oriented_feat); - float* oriented_ori = memAlloc(max_oriented_feat); - - unsigned oriented_feat = 0; - - calcOrientation(oriented_x, oriented_y, oriented_layer, - oriented_response, oriented_size, oriented_ori, &oriented_feat, - nodup_x, nodup_y, nodup_layer, - nodup_response, nodup_size, nodup_feat, - gauss_pyr, 
max_oriented_feat, i, n_layers, double_input); - - memFree(nodup_x); - memFree(nodup_y); - memFree(nodup_layer); - memFree(nodup_response); - memFree(nodup_size); - - if (oriented_feat == 0) { - memFree(oriented_x); - memFree(oriented_y); - memFree(oriented_layer); - memFree(oriented_response); - memFree(oriented_size); - memFree(oriented_ori); - - continue; - } - - float* desc = memAlloc(oriented_feat * desc_len); - - float scale = 1.f/(1 << i); - if (double_input) scale *= 2.f; - - if (compute_GLOH) - computeGLOHDescriptor(desc, desc_len, - oriented_x, oriented_y, oriented_layer, - oriented_response, oriented_size, oriented_ori, - oriented_feat, gauss_pyr, d, rb, ab, hb, - scale, i, n_layers); - else - computeDescriptor(desc, desc_len, - oriented_x, oriented_y, oriented_layer, - oriented_response, oriented_size, oriented_ori, - oriented_feat, gauss_pyr, d, n, scale, i, n_layers); - - total_feat += oriented_feat; - feat_pyr[i] = oriented_feat; - - if (oriented_feat > 0) { - x_pyr[i] = oriented_x; - y_pyr[i] = oriented_y; - response_pyr[i] = oriented_response; - ori_pyr[i] = oriented_ori; - size_pyr[i] = oriented_size; - desc_pyr[i] = desc; - } - } - - if (total_feat > 0) { - const af::dim4 total_feat_dims(total_feat); - const af::dim4 desc_dims(desc_len, total_feat); - - // Allocate output memory - x = createEmptyArray(total_feat_dims); - y = createEmptyArray(total_feat_dims); - score = createEmptyArray(total_feat_dims); - ori = createEmptyArray(total_feat_dims); - size = createEmptyArray(total_feat_dims); - desc = createEmptyArray(desc_dims); - - float* x_ptr = x.get(); - float* y_ptr = y.get(); - float* score_ptr = score.get(); - float* ori_ptr = ori.get(); - float* size_ptr = size.get(); - float* desc_ptr = desc.get(); - - unsigned offset = 0; - for (unsigned i = 0; i < n_octaves; i++) { - if (feat_pyr[i] == 0) - continue; - - memcpy(x_ptr+offset, x_pyr[i], feat_pyr[i] * sizeof(float)); - memcpy(y_ptr+offset, y_pyr[i], feat_pyr[i] * sizeof(float)); - 
memcpy(score_ptr+offset, response_pyr[i], feat_pyr[i] * sizeof(float)); - memcpy(ori_ptr+offset, ori_pyr[i], feat_pyr[i] * sizeof(float)); - memcpy(size_ptr+offset, size_pyr[i], feat_pyr[i] * sizeof(float)); - - memcpy(desc_ptr+(offset*desc_len), desc_pyr[i], feat_pyr[i] * desc_len * sizeof(float)); - - memFree(x_pyr[i]); - memFree(y_pyr[i]); - memFree(response_pyr[i]); - memFree(ori_pyr[i]); - memFree(size_pyr[i]); - memFree(desc_pyr[i]); - - offset += feat_pyr[i]; - } - } - - return total_feat; - } -} diff --git a/src/backend/cpu/sobel.cpp b/src/backend/cpu/sobel.cpp index 3c6b1740d5..5ece9bf65e 100644 --- a/src/backend/cpu/sobel.cpp +++ b/src/backend/cpu/sobel.cpp @@ -13,88 +13,32 @@ #include #include #include -#include -#include +#include +#include +#include using af::dim4; namespace cpu { -template -void derivative(To *optr, Ti const *iptr, dim4 const &dims, dim4 const &strides) -{ - for(dim_t b3=0; b3=0 && _joff>=0) ? - iptr[_joff*strides[1]+_ioff*strides[0]] : 0; - To SW = (ioff_<(int)dims[0] && _joff>=0) ? - iptr[_joff*strides[1]+ioff_*strides[0]] : 0; - To NE = (_ioff>=0 && joff_<(int)dims[1]) ? - iptr[joff_*strides[1]+_ioff*strides[0]] : 0; - To SE = (ioff_<(int)dims[0] && joff_<(int)dims[1]) ? - iptr[joff_*strides[1]+ioff_*strides[0]] : 0; - - if (isDX) { - To W = _joff>=0 ? - iptr[_joff*strides[1]+ioff*strides[0]] : 0; - - To E = joff_<(int)dims[1] ? - iptr[joff_*strides[1]+ioff*strides[0]] : 0; - - accum = NW+SW - (NE+SE) + 2*(W-E); - } else { - To N = _ioff>=0 ? - iptr[joff*strides[1]+_ioff*strides[0]] : 0; - - To S = ioff_<(int)dims[0] ? 
- iptr[joff*strides[1]+ioff_*strides[0]] : 0; - - accum = NW+NE - (SW+SE) + 2*(N-S); - } - - optr[joffset+i*strides[0]] = accum; - } - } - - optr += strides[2]; - iptr += strides[2]; - } - optr += strides[3]; - iptr += strides[3]; - } -} - template std::pair< Array, Array > sobelDerivatives(const Array &img, const unsigned &ker_size) { + img.eval(); + // ket_size is for future proofing, this argument is not used + // currently Array dx = createEmptyArray(img.dims()); Array dy = createEmptyArray(img.dims()); - derivative(dx.get(), img.get(), img.dims(), img.strides()); - derivative(dy.get(), img.get(), img.dims(), img.strides()); + getQueue().enqueue(kernel::derivative, dx, img); + getQueue().enqueue(kernel::derivative, dy, img); return std::make_pair(dx, dy); } -#define INSTANTIATE(Ti, To) \ +#define INSTANTIATE(Ti, To) \ template std::pair< Array, Array > \ sobelDerivatives(const Array &img, const unsigned &ker_size); diff --git a/src/backend/cpu/solve.cpp b/src/backend/cpu/solve.cpp index 1e88e8d915..367afa3884 100644 --- a/src/backend/cpu/solve.cpp +++ b/src/backend/cpu/solve.cpp @@ -11,52 +11,40 @@ #include #if defined(WITH_CPU_LINEAR_ALGEBRA) - #include #include -#include -#include #include #include - #include +#include +#include namespace cpu { template using gesv_func_def = int (*)(ORDER_TYPE, int, int, - T *, int, - int *, - T *, int); + T *, int, int *, T *, int); template -using gels_func_def = int (*)(ORDER_TYPE, char, - int, int, int, - T *, int, - T *, int); +using gels_func_def = int (*)(ORDER_TYPE, char, int, int, int, + T *, int, T *, int); template -using getrs_func_def = int (*)(ORDER_TYPE, char, - int, int, - const T *, int, - const int *, - T *, int); +using getrs_func_def = int (*)(ORDER_TYPE, char, int, int, + const T *, int, const int *, T *, int); template -using trtrs_func_def = int (*)(ORDER_TYPE, - char, char, char, - int, int, - const T *, int, - T *, int); +using trtrs_func_def = int (*)(ORDER_TYPE, char, char, char, int, int, + const T 
*, int, T *, int); -#define SOLVE_FUNC_DEF( FUNC ) \ +#define SOLVE_FUNC_DEF( FUNC ) \ template FUNC##_func_def FUNC##_func(); -#define SOLVE_FUNC( FUNC, TYPE, PREFIX ) \ -template<> FUNC##_func_def FUNC##_func() \ +#define SOLVE_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ { return & LAPACK_NAME(PREFIX##FUNC); } SOLVE_FUNC_DEF( gesv ) @@ -87,16 +75,20 @@ template Array solveLU(const Array &A, const Array &pivot, const Array &b, const af_mat_prop options) { - int N = A.dims()[0]; - int NRHS = b.dims()[1]; + A.eval(); + pivot.eval(); + b.eval(); + int N = A.dims()[0]; + int NRHS = b.dims()[1]; Array< T > B = copyArray(b); - getrs_func()(AF_LAPACK_COL_MAJOR, 'N', - N, NRHS, - A.get(), A.strides()[1], - pivot.get(), - B.get(), B.strides()[1]); + auto func = [=] (Array A, Array B, Array pivot, int N, int NRHS) { + getrs_func()(AF_LAPACK_COL_MAJOR, 'N', + N, NRHS, A.get(), A.strides()[1], + pivot.get(), B.get(), B.strides()[1]); + }; + getQueue().enqueue(func, A, B, pivot, N, NRHS); return B; } @@ -104,17 +96,24 @@ Array solveLU(const Array &A, const Array &pivot, template Array triangleSolve(const Array &A, const Array &b, const af_mat_prop options) { + A.eval(); + b.eval(); + Array B = copyArray(b); - int N = B.dims()[0]; - int NRHS = B.dims()[1]; - - trtrs_func()(AF_LAPACK_COL_MAJOR, - options & AF_MAT_UPPER ? 'U' : 'L', - 'N', // transpose flag - options & AF_MAT_DIAG_UNIT ? 'U' : 'N', - N, NRHS, - A.get(), A.strides()[1], - B.get(), B.strides()[1]); + int N = B.dims()[0]; + int NRHS = B.dims()[1]; + + auto func = [=] (Array A, Array B, int N, int NRHS, const af_mat_prop options) { + trtrs_func()(AF_LAPACK_COL_MAJOR, + options & AF_MAT_UPPER ? 'U' : 'L', + 'N', // transpose flag + options & AF_MAT_DIAG_UNIT ? 
'U' : 'N', + N, NRHS, + A.get(), A.strides()[1], + B.get(), B.strides()[1]); + }; + getQueue().enqueue(func, A, B, N, NRHS, options); + return B; } @@ -122,9 +121,10 @@ Array triangleSolve(const Array &A, const Array &b, const af_mat_prop o template Array solve(const Array &a, const Array &b, const af_mat_prop options) { + a.eval(); + b.eval(); - if (options & AF_MAT_UPPER || - options & AF_MAT_LOWER) { + if (options & AF_MAT_UPPER || options & AF_MAT_LOWER) { return triangleSolve(a, b, options); } @@ -132,41 +132,34 @@ Array solve(const Array &a, const Array &b, const af_mat_prop options) int N = a.dims()[1]; int K = b.dims()[1]; - Array A = copyArray(a); Array B = padArray(b, dim4(max(M, N), K)); if(M == N) { Array pivot = createEmptyArray(dim4(N, 1, 1)); - gesv_func()(AF_LAPACK_COL_MAJOR, N, K, - A.get(), A.strides()[1], - pivot.get(), - B.get(), B.strides()[1]); + + auto func = [=] (Array A, Array B, Array pivot, int N, int K) { + gesv_func()(AF_LAPACK_COL_MAJOR, N, K, A.get(), A.strides()[1], + pivot.get(), B.get(), B.strides()[1]); + }; + getQueue().enqueue(func, A, B, pivot, N, K); } else { - int sM = a.strides()[1]; - int sN = a.strides()[2] / sM; + auto func = [=] (Array A, Array B, int M, int N, int K) { + int sM = A.strides()[1]; + int sN = A.strides()[2] / sM; - gels_func()(AF_LAPACK_COL_MAJOR, 'N', - M, N, K, - A.get(), A.strides()[1], - B.get(), max(sM, sN)); + gels_func()(AF_LAPACK_COL_MAJOR, 'N', + M, N, K, + A.get(), A.strides()[1], + B.get(), max(sM, sN)); + }; B.resetDims(dim4(N, K)); + getQueue().enqueue(func, A, B, M, N, K); } return B; } -#define INSTANTIATE_SOLVE(T) \ - template Array solve(const Array &a, const Array &b, \ - const af_mat_prop options); \ - template Array solveLU(const Array &A, const Array &pivot, \ - const Array &b, const af_mat_prop options); \ - -INSTANTIATE_SOLVE(float) -INSTANTIATE_SOLVE(cfloat) -INSTANTIATE_SOLVE(double) -INSTANTIATE_SOLVE(cdouble) - } #else @@ -178,17 +171,22 @@ template Array solveLU(const Array &A, 
const Array &pivot, const Array &b, const af_mat_prop options) { - AF_ERROR("Linear Algebra is diabled on CPU", - AF_ERR_NOT_CONFIGURED); + AF_ERROR("Linear Algebra is diabled on CPU", AF_ERR_NOT_CONFIGURED); } template Array solve(const Array &a, const Array &b, const af_mat_prop options) { - AF_ERROR("Linear Algebra is diabled on CPU", - AF_ERR_NOT_CONFIGURED); + AF_ERROR("Linear Algebra is diabled on CPU", AF_ERR_NOT_CONFIGURED); +} + } +#endif + +namespace cpu +{ + #define INSTANTIATE_SOLVE(T) \ template Array solve(const Array &a, const Array &b, \ const af_mat_prop options); \ @@ -199,6 +197,5 @@ INSTANTIATE_SOLVE(float) INSTANTIATE_SOLVE(cfloat) INSTANTIATE_SOLVE(double) INSTANTIATE_SOLVE(cdouble) -} -#endif +} diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp index 0b3fb9aabe..bc6396b258 100644 --- a/src/backend/cpu/sort.cpp +++ b/src/backend/cpu/sort.cpp @@ -11,78 +11,43 @@ #include #include #include -#include -#include #include #include - -using std::greater; -using std::less; -using std::sort; -using std::function; +#include +#include +#include namespace cpu { - /////////////////////////////////////////////////////////////////////////// - // Kernel Functions - /////////////////////////////////////////////////////////////////////////// - - // Based off of http://stackoverflow.com/a/12399290 - template - void sort0(Array &val) - { - // initialize original index locations - T *val_ptr = val.get(); - - function op = greater(); - if(isAscending) { op = less(); } - - T *comp_ptr = nullptr; - for(dim_t w = 0; w < val.dims()[3]; w++) { - dim_t valW = w * val.strides()[3]; - for(dim_t z = 0; z < val.dims()[2]; z++) { - dim_t valWZ = valW + z * val.strides()[2]; - for(dim_t y = 0; y < val.dims()[1]; y++) { - - dim_t valOffset = valWZ + y * val.strides()[1]; - comp_ptr = val_ptr + valOffset; - std::sort(comp_ptr, comp_ptr + val.dims()[0], op); - } - } - } - return; - } +template +Array sort(const Array &in, const unsigned dim) +{ + in.eval(); - 
/////////////////////////////////////////////////////////////////////////// - // Wrapper Functions - /////////////////////////////////////////////////////////////////////////// - template - Array sort(const Array &in, const unsigned dim) - { - Array out = copyArray(in); - switch(dim) { - case 0: sort0(out); - break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); - } - return out; + Array out = copyArray(in); + switch(dim) { + case 0: getQueue().enqueue(kernel::sort0, out); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } + return out; +} #define INSTANTIATE(T) \ template Array sort(const Array &in, const unsigned dim); \ template Array sort(const Array &in, const unsigned dim); \ - INSTANTIATE(float) - INSTANTIATE(double) - //INSTANTIATE(cfloat) - //INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(char) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) - INSTANTIATE(intl) - INSTANTIATE(uintl) +INSTANTIATE(float) +INSTANTIATE(double) +//INSTANTIATE(cfloat) +//INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(char) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) +INSTANTIATE(intl) +INSTANTIATE(uintl) + } diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp index 4b0a092834..5a99257033 100644 --- a/src/backend/cpu/sort_by_key.cpp +++ b/src/backend/cpu/sort_by_key.cpp @@ -9,103 +9,31 @@ #include #include -#include -#include -#include -#include -#include -#include -#include - -using std::greater; -using std::less; -using std::sort; -using std::function; -using std::queue; -using std::future; -using std::async; +#include +#include +#include namespace cpu { - /////////////////////////////////////////////////////////////////////////// - // Kernel Functions - /////////////////////////////////////////////////////////////////////////// - - template - void sort0_by_key(Array &okey, Array &oval, const Array &ikey, const Array &ival) - { - function 
op = greater(); - if(isAscending) { op = less(); } - - // Get pointers and initialize original index locations - Array oidx = createValueArray(ikey.dims(), 0u); - uint *oidx_ptr = oidx.get(); - Tk *okey_ptr = okey.get(); - Tv *oval_ptr = oval.get(); - const Tk *ikey_ptr = ikey.get(); - const Tv *ival_ptr = ival.get(); - - std::vector seq_vec(oidx.dims()[0]); - std::iota(seq_vec.begin(), seq_vec.end(), 0); - - const Tk *comp_ptr = nullptr; - auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);}; - - for(dim_t w = 0; w < ikey.dims()[3]; w++) { - dim_t okeyW = w * okey.strides()[3]; - dim_t ovalW = w * oval.strides()[3]; - dim_t oidxW = w * oidx.strides()[3]; - dim_t ikeyW = w * ikey.strides()[3]; - dim_t ivalW = w * ival.strides()[3]; - - for(dim_t z = 0; z < ikey.dims()[2]; z++) { - dim_t okeyWZ = okeyW + z * okey.strides()[2]; - dim_t ovalWZ = ovalW + z * oval.strides()[2]; - dim_t oidxWZ = oidxW + z * oidx.strides()[2]; - dim_t ikeyWZ = ikeyW + z * ikey.strides()[2]; - dim_t ivalWZ = ivalW + z * ival.strides()[2]; - - for(dim_t y = 0; y < ikey.dims()[1]; y++) { - - dim_t okeyOffset = okeyWZ + y * okey.strides()[1]; - dim_t ovalOffset = ovalWZ + y * oval.strides()[1]; - dim_t oidxOffset = oidxWZ + y * oidx.strides()[1]; - dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1]; - dim_t ivalOffset = ivalWZ + y * ival.strides()[1]; - uint *ptr = oidx_ptr + oidxOffset; - std::copy(seq_vec.begin(), seq_vec.end(), ptr); - - comp_ptr = ikey_ptr + ikeyOffset; - std::stable_sort(ptr, ptr + ikey.dims()[0], comparator); - - for (dim_t i = 0; i < oval.dims()[0]; ++i){ - uint sortIdx = oidx_ptr[oidxOffset + i]; - okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx]; - oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx]; - } - } - } - } - - return; - } - - /////////////////////////////////////////////////////////////////////////// - // Wrapper Functions - /////////////////////////////////////////////////////////////////////////// 
- template - void sort_by_key(Array &okey, Array &oval, - const Array &ikey, const Array &ival, const uint dim) - { - okey = createEmptyArray(ikey.dims()); - oval = createEmptyArray(ival.dims()); - switch(dim) { - case 0: sort0_by_key(okey, oval, ikey, ival); - break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); - } +template +void sort_by_key(Array &okey, Array &oval, + const Array &ikey, const Array &ival, const uint dim) +{ + ikey.eval(); + ival.eval(); + + okey = createEmptyArray(ikey.dims()); + oval = createEmptyArray(ival.dims()); + Array oidx = createValueArray(ikey.dims(), 0u); + oidx.eval(); + + switch(dim) { + case 0: getQueue().enqueue(kernel::sort0_by_key, + okey, oval, oidx, ikey, ival); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } +} #define INSTANTIATE(Tk, Tv) \ template void \ @@ -128,14 +56,15 @@ namespace cpu INSTANTIATE(Tk, uintl) \ - INSTANTIATE1(float) - INSTANTIATE1(double) - INSTANTIATE1(int) - INSTANTIATE1(uint) - INSTANTIATE1(char) - INSTANTIATE1(uchar) - INSTANTIATE1(short) - INSTANTIATE1(ushort) - INSTANTIATE1(intl) - INSTANTIATE1(uintl) +INSTANTIATE1(float) +INSTANTIATE1(double) +INSTANTIATE1(int) +INSTANTIATE1(uint) +INSTANTIATE1(char) +INSTANTIATE1(uchar) +INSTANTIATE1(short) +INSTANTIATE1(ushort) +INSTANTIATE1(intl) +INSTANTIATE1(uintl) + } diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp index eb6b4bee60..77860ede18 100644 --- a/src/backend/cpu/sort_index.cpp +++ b/src/backend/cpu/sort_index.cpp @@ -10,86 +10,27 @@ #include #include #include -#include -#include #include #include -#include -#include - -using std::greater; -using std::less; -using std::sort; -using std::function; -using std::queue; -using std::future; -using std::async; +#include +#include +#include namespace cpu { - /////////////////////////////////////////////////////////////////////////// - // Kernel Functions - /////////////////////////////////////////////////////////////////////////// - template - 
void sort0_index(Array &val, Array &idx, const Array &in) - { - // initialize original index locations - uint *idx_ptr = idx.get(); - T *val_ptr = val.get(); - const T *in_ptr = in.get(); - function op = greater(); - if(isAscending) { op = less(); } - - std::vector seq_vec(idx.dims()[0]); - std::iota(seq_vec.begin(), seq_vec.end(), 0); - - const T *comp_ptr = nullptr; - auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);}; - - for(dim_t w = 0; w < in.dims()[3]; w++) { - dim_t valW = w * val.strides()[3]; - dim_t idxW = w * idx.strides()[3]; - dim_t inW = w * in.strides()[3]; - for(dim_t z = 0; z < in.dims()[2]; z++) { - dim_t valWZ = valW + z * val.strides()[2]; - dim_t idxWZ = idxW + z * idx.strides()[2]; - dim_t inWZ = inW + z * in.strides()[2]; - for(dim_t y = 0; y < in.dims()[1]; y++) { - - dim_t valOffset = valWZ + y * val.strides()[1]; - dim_t idxOffset = idxWZ + y * idx.strides()[1]; - dim_t inOffset = inWZ + y * in.strides()[1]; - - uint *ptr = idx_ptr + idxOffset; - std::copy(seq_vec.begin(), seq_vec.end(), ptr); - comp_ptr = in_ptr + inOffset; - std::stable_sort(ptr, ptr + in.dims()[0], comparator); - - for (dim_t i = 0; i < val.dims()[0]; ++i){ - val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]]; - } - } - } - } - - return; - } +template +void sort_index(Array &val, Array &idx, const Array &in, const uint dim) +{ + in.eval(); - /////////////////////////////////////////////////////////////////////////// - // Wrapper Functions - /////////////////////////////////////////////////////////////////////////// - template - void sort_index(Array &val, Array &idx, const Array &in, const uint dim) - { - val = createEmptyArray(in.dims()); - idx = createEmptyArray(in.dims()); - switch(dim) { - case 0: sort0_index(val, idx, in); - break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); - } + val = createEmptyArray(in.dims()); + idx = createEmptyArray(in.dims()); + switch(dim) { + case 0: 
getQueue().enqueue(kernel::sort0_index, val, idx, in); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } +} #define INSTANTIATE(T) \ template void sort_index(Array &val, Array &idx, const Array &in, \ @@ -97,16 +38,17 @@ namespace cpu template void sort_index(Array &val, Array &idx, const Array &in, \ const uint dim); \ - INSTANTIATE(float) - INSTANTIATE(double) - //INSTANTIATE(cfloat) - //INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(char) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) - INSTANTIATE(intl) - INSTANTIATE(uintl) +INSTANTIATE(float) +INSTANTIATE(double) +//INSTANTIATE(cfloat) +//INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(char) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) +INSTANTIATE(intl) +INSTANTIATE(uintl) + } diff --git a/src/backend/cpu/surface.cpp b/src/backend/cpu/surface.cpp index 39f375a6fe..24c945c20b 100644 --- a/src/backend/cpu/surface.cpp +++ b/src/backend/cpu/surface.cpp @@ -12,37 +12,40 @@ #include #include #include -#include #include -#include -#include +#include +#include using af::dim4; namespace cpu { - template - void copy_surface(const Array &P, fg::Surface* surface) - { - CheckGL("Before CopyArrayToVBO"); - - glBindBuffer(GL_ARRAY_BUFFER, surface->vbo()); - glBufferSubData(GL_ARRAY_BUFFER, 0, surface->size(), P.get()); - glBindBuffer(GL_ARRAY_BUFFER, 0); - - CheckGL("In CopyArrayToVBO"); - } - - #define INSTANTIATE(T) \ - template void copy_surface(const Array &P, fg::Surface* surface); - - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) + +template +void copy_surface(const Array &P, fg::Surface* surface) +{ + P.eval(); + getQueue().sync(); + CheckGL("Before CopyArrayToVBO"); + + glBindBuffer(GL_ARRAY_BUFFER, surface->vbo()); + glBufferSubData(GL_ARRAY_BUFFER, 0, surface->size(), P.get()); + glBindBuffer(GL_ARRAY_BUFFER, 0); + + 
CheckGL("In CopyArrayToVBO"); +} + +#define INSTANTIATE(T) \ + template void copy_surface(const Array &P, fg::Surface* surface); + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) + } #endif // WITH_GRAPHICS diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp index 77493915c0..55a2357206 100644 --- a/src/backend/cpu/susan.cpp +++ b/src/backend/cpu/susan.cpp @@ -11,119 +11,54 @@ #include #include #include +#include +#include +#include +#include using af::features; +using std::shared_ptr; namespace cpu { -template -void susan_responses(T* resp_out, const T* in, - const unsigned idim0, const unsigned idim1, - const int radius, const float t, const float g, - const unsigned border_len) -{ - const unsigned r = border_len; - const int rSqrd = radius*radius; - - for (unsigned y = r; y < idim1 - r; ++y) { - for (unsigned x = r; x < idim0 - r; ++x) { - const unsigned idx = y * idim0 + x; - T m_0 = in[idx]; - float nM = 0.0f; - - for (int i=-radius; i<=radius; ++i) { - for (int j=-radius; j<=radius; ++j) { - if (i*i + j*j < rSqrd) { - int p = x + i; - int q = y + j; - T m = in[p + idim0 * q]; - float exp_pow = std::pow((m - m_0)/t, 6.0); - float cM = std::exp(-exp_pow); - nM += cM; - } - } - } - - resp_out[idx] = nM < g ? 
g - nM : T(0); - } - } -} - -template -void non_maximal(float* x_out, float* y_out, float* resp_out, - unsigned* count, const unsigned idim0, const unsigned idim1, - const T* resp_in, const unsigned border_len, const unsigned max_corners) -{ - // Responses on the border don't have 8-neighbors to compare, discard them - const unsigned r = border_len + 1; - - for (unsigned y = r; y < idim1 - r; y++) { - for (unsigned x = r; x < idim0 - r; x++) { - const T v = resp_in[y * idim0 + x]; - - // Find maximum neighborhood response - T max_v; - max_v = max(resp_in[(y-1) * idim0 + x-1], resp_in[y * idim0 + x-1]); - max_v = max(max_v, resp_in[(y+1) * idim0 + x-1]); - max_v = max(max_v, resp_in[(y-1) * idim0 + x ]); - max_v = max(max_v, resp_in[(y+1) * idim0 + x ]); - max_v = max(max_v, resp_in[(y-1) * idim0 + x+1]); - max_v = max(max_v, resp_in[(y) * idim0 + x+1]); - max_v = max(max_v, resp_in[(y+1) * idim0 + x+1]); - - // Stores corner to {x,y,resp}_out if it's response is maximum compared - // to its 8-neighborhood and greater or equal minimum response - if (v > max_v) { - const unsigned idx = *count; - *count += 1; - if (idx < max_corners) { - x_out[idx] = (float)x; - y_out[idx] = (float)y; - resp_out[idx] = (float)v; - } - } - } - } -} - template unsigned susan(Array &x_out, Array &y_out, Array &resp_out, const Array &in, const unsigned radius, const float diff_thr, const float geom_thr, const float feature_ratio, const unsigned edge) { - dim4 idims = in.dims(); + in.eval(); + dim4 idims = in.dims(); const unsigned corner_lim = in.elements() * feature_ratio; - float* x_corners = memAlloc(corner_lim); - float* y_corners = memAlloc(corner_lim); - float* resp_corners = memAlloc(corner_lim); - - T* resp = memAlloc(in.elements()); - unsigned corners_found = 0; - susan_responses(resp, in.get(), idims[0], idims[1], radius, diff_thr, geom_thr, edge); + auto x_corners = createEmptyArray(dim4(corner_lim)); + auto y_corners = createEmptyArray(dim4(corner_lim)); + auto resp_corners = 
createEmptyArray(dim4(corner_lim)); + auto response = createEmptyArray(dim4(in.elements())); + auto corners_found= std::shared_ptr(memAlloc(1), memFree); + corners_found.get()[0] = 0; - non_maximal(x_corners, y_corners, resp_corners, &corners_found, - idims[0], idims[1], resp, edge, corner_lim); + getQueue().enqueue(kernel::susan_responses, response, in, idims[0], idims[1], + radius, diff_thr, geom_thr, edge); + getQueue().enqueue(kernel::non_maximal, x_corners, y_corners, resp_corners, corners_found, + idims[0], idims[1], response, edge, corner_lim); + getQueue().sync(); - memFree(resp); - - const unsigned corners_out = min(corners_found, corner_lim); + const unsigned corners_out = min((corners_found.get())[0], corner_lim); if (corners_out == 0) { - memFree(x_corners); - memFree(y_corners); - memFree(resp_corners); x_out = createEmptyArray(dim4()); y_out = createEmptyArray(dim4()); resp_out = createEmptyArray(dim4()); return 0; } else { - - x_out = createDeviceDataArray(dim4(corners_out), (void*)x_corners); - y_out = createDeviceDataArray(dim4(corners_out), (void*)y_corners); - resp_out = createDeviceDataArray(dim4(corners_out), (void*)resp_corners); + x_out = x_corners; + y_out = y_corners; + resp_out = resp_corners; + x_out.resetDims(dim4(corners_out)); + y_out.resetDims(dim4(corners_out)); + resp_out.resetDims(dim4(corners_out)); return corners_out; } } diff --git a/src/backend/cpu/svd.cpp b/src/backend/cpu/svd.cpp index 461b9014aa..2ac58aab3f 100644 --- a/src/backend/cpu/svd.cpp +++ b/src/backend/cpu/svd.cpp @@ -10,12 +10,13 @@ #include #include #include - #include #if defined(WITH_CPU_LINEAR_ALGEBRA) #include #include +#include +#include namespace cpu { @@ -29,93 +30,106 @@ namespace cpu #if defined(USE_MKL) || defined(__APPLE__) - template - using svd_func_def = int (*)(ORDER_TYPE, - char jobz, - int m, int n, - T* in, int ldin, - Tr* s, - T* u, int ldu, - T* vt, int ldvt); - - SVD_FUNC_DEF( gesdd ) - SVD_FUNC(gesdd, float , float , s) - SVD_FUNC(gesdd, 
double , double, d) - SVD_FUNC(gesdd, cfloat , float , c) - SVD_FUNC(gesdd, cdouble, double, z) +template +using svd_func_def = int (*)(ORDER_TYPE, + char jobz, + int m, int n, + T* in, int ldin, + Tr* s, + T* u, int ldu, + T* vt, int ldvt); + +SVD_FUNC_DEF( gesdd ) +SVD_FUNC(gesdd, float , float , s) +SVD_FUNC(gesdd, double , double, d) +SVD_FUNC(gesdd, cfloat , float , c) +SVD_FUNC(gesdd, cdouble, double, z) #else // Atlas causes memory freeing issues with using gesdd - template - using svd_func_def = int (*)(ORDER_TYPE, - char jobu, char jobvt, - int m, int n, - T* in, int ldin, - Tr* s, - T* u, int ldu, - T* vt, int ldvt, - Tr *superb); - - SVD_FUNC_DEF( gesvd ) - SVD_FUNC(gesvd, float , float , s) - SVD_FUNC(gesvd, double , double, d) - SVD_FUNC(gesvd, cfloat , float , c) - SVD_FUNC(gesvd, cdouble, double, z) +template +using svd_func_def = int (*)(ORDER_TYPE, + char jobu, char jobvt, + int m, int n, + T* in, int ldin, + Tr* s, + T* u, int ldu, + T* vt, int ldvt, + Tr *superb); + +SVD_FUNC_DEF( gesvd ) +SVD_FUNC(gesvd, float , float , s) +SVD_FUNC(gesvd, double , double, d) +SVD_FUNC(gesvd, cfloat , float , c) +SVD_FUNC(gesvd, cdouble, double, z) #endif - template - void svdInPlace(Array &s, Array &u, Array &vt, Array &in) - { +template +void svdInPlace(Array &s, Array &u, Array &vt, Array &in) +{ + s.eval(); + u.eval(); + vt.eval(); + in.eval(); + + auto func = [=] (Array s, Array u, Array vt, Array in) { dim4 iDims = in.dims(); int M = iDims[0]; int N = iDims[1]; #if defined(USE_MKL) || defined(__APPLE__) svd_func()(AF_LAPACK_COL_MAJOR, 'A', M, N, in.get(), in.strides()[1], - s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1]); + s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1]); #else std::vector superb(std::min(M, N)); svd_func()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, in.get(), in.strides()[1], - s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1], &superb[0]); + s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1], 
&superb[0]); #endif - } - - template - void svd(Array &s, Array &u, Array &vt, const Array &in) - { - Array in_copy = copyArray(in); - svdInPlace(s, u, vt, in_copy); - } + }; + getQueue().enqueue(func, s, u, vt, in); +} + +template +void svd(Array &s, Array &u, Array &vt, const Array &in) +{ + Array in_copy = copyArray(in); + svdInPlace(s, u, vt, in_copy); +} + } #else namespace cpu { - template - void svd(Array &s, Array &u, Array &vt, const Array &in) - { - AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED); - } - - template - void svdInPlace(Array &s, Array &u, Array &vt, Array &in) - { - AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED); - } + +template +void svd(Array &s, Array &u, Array &vt, const Array &in) +{ + AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED); +} + +template +void svdInPlace(Array &s, Array &u, Array &vt, Array &in) +{ + AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED); +} + } #endif -namespace cpu { +namespace cpu +{ #define INSTANTIATE_SVD(T, Tr) \ template void svd(Array & s, Array & u, Array & vt, const Array &in); \ template void svdInPlace(Array & s, Array & u, Array & vt, Array &in); - INSTANTIATE_SVD(float , float ) - INSTANTIATE_SVD(double , double) - INSTANTIATE_SVD(cfloat , float ) - INSTANTIATE_SVD(cdouble, double) +INSTANTIATE_SVD(float , float ) +INSTANTIATE_SVD(double , double) +INSTANTIATE_SVD(cfloat , float ) +INSTANTIATE_SVD(cdouble, double) + } diff --git a/src/backend/cpu/threads b/src/backend/cpu/threads new file mode 160000 index 0000000000..5e778ce0a7 --- /dev/null +++ b/src/backend/cpu/threads @@ -0,0 +1 @@ +Subproject commit 5e778ce0a7f0f80af9d32ea3569df3dbec834f59 diff --git a/src/backend/cpu/tile.cpp b/src/backend/cpu/tile.cpp index 77e72afd09..0fe52c6398 100644 --- a/src/backend/cpu/tile.cpp +++ b/src/backend/cpu/tile.cpp @@ -9,69 +9,46 @@ #include #include -#include -#include +#include +#include namespace cpu { - template - Array 
tile(const Array &in, const af::dim4 &tileDims) - { - const af::dim4 iDims = in.dims(); - af::dim4 oDims = iDims; - oDims *= tileDims; - if(iDims.elements() == 0 || oDims.elements() == 0) { - throw std::runtime_error("Elements are 0"); - } +template +Array tile(const Array &in, const af::dim4 &tileDims) +{ + in.eval(); - Array out = createEmptyArray(oDims); + const af::dim4 iDims = in.dims(); + af::dim4 oDims = iDims; + oDims *= tileDims; - T* outPtr = out.get(); - const T* inPtr = in.get(); + if(iDims.elements() == 0 || oDims.elements() == 0) { + throw std::runtime_error("Elements are 0"); + } - const af::dim4 ist = in.strides(); - const af::dim4 ost = out.strides(); + Array out = createEmptyArray(oDims); - for(dim_t ow = 0; ow < oDims[3]; ow++) { - const dim_t iw = ow % iDims[3]; - const dim_t iW = iw * ist[3]; - const dim_t oW = ow * ost[3]; - for(dim_t oz = 0; oz < oDims[2]; oz++) { - const dim_t iz = oz % iDims[2]; - const dim_t iZW = iW + iz * ist[2]; - const dim_t oZW = oW + oz * ost[2]; - for(dim_t oy = 0; oy < oDims[1]; oy++) { - const dim_t iy = oy % iDims[1]; - const dim_t iYZW = iZW + iy * ist[1]; - const dim_t oYZW = oZW + oy * ost[1]; - for(dim_t ox = 0; ox < oDims[0]; ox++) { - const dim_t ix = ox % iDims[0]; - const dim_t iMem = iYZW + ix; - const dim_t oMem = oYZW + ox; - outPtr[oMem] = inPtr[iMem]; - } - } - } - } + getQueue().enqueue(kernel::tile, out, in); - return out; - } + return out; +} #define INSTANTIATE(T) \ template Array tile(const Array &in, const af::dim4 &tileDims); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(short) - INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) 
+INSTANTIATE(ushort) } diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp index 68e8d96eba..3a76fb2f24 100644 --- a/src/backend/cpu/transform.cpp +++ b/src/backend/cpu/transform.cpp @@ -10,138 +10,59 @@ #include #include #include -#include -#include +#include #include "transform_interp.hpp" +#include namespace cpu { - template - void calc_affine_inverse(T *txo, const T *txi) - { - T det = txi[0]*txi[4] - txi[1]*txi[3]; - txo[0] = txi[4] / det; - txo[1] = txi[3] / det; - txo[3] = txi[1] / det; - txo[4] = txi[0] / det; - - txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; - txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; - } - - template - void calc_affine_inverse(T *tmat, const T *tmat_ptr, const bool inverse) - { - // The way kernel is structured, it expects an inverse - // transform matrix by default. - // If it is an forward transform, then we need its inverse - if(inverse) { - for(int i = 0; i < 6; i++) - tmat[i] = tmat_ptr[i]; - } else { - calc_affine_inverse(tmat, tmat_ptr); - } - } - - template - void transform_(T *out, const T *in, const float *tf, - const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &tstrides, const bool inverse) - { - dim_t nimages = idims[2]; - // Multiplied in src/backend/transform.cpp - dim_t ntransforms = odims[2] / idims[2]; - - void (*t_fn)(T *, const T *, const float *, const af::dim4 &, - const af::dim4 &, const af::dim4 &, - const dim_t, const dim_t, const dim_t, const dim_t); - - switch(method) { - case AF_INTERP_NEAREST: - t_fn = &transform_n; - break; - case AF_INTERP_BILINEAR: - t_fn = &transform_b; - break; - case AF_INTERP_LOWER: - t_fn = &transform_l; - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; - } - - - // For each transform channel - for(int t_idx = 0; t_idx < (int)ntransforms; t_idx++) { - // Compute inverse if required - const float *tmat_ptr = tf + t_idx * 6; - float tmat[6]; - 
calc_affine_inverse(tmat, tmat_ptr, inverse); - - // Offset for output pointer - dim_t o_offset = t_idx * nimages * ostrides[2]; - - // Do transform for image - for(int yy = 0; yy < (int)odims[1]; yy++) { - for(int xx = 0; xx < (int)odims[0]; xx++) { - t_fn(out, in, tmat, idims, ostrides, istrides, nimages, o_offset, xx, yy); - } - } - } +template +Array transform(const Array &in, const Array &transform, const af::dim4 &odims, + const af_interp_type method, const bool inverse, const bool perspective) +{ + in.eval(); + transform.eval(); + + Array out = createEmptyArray(odims); + + switch(method) { + case AF_INTERP_NEAREST : + getQueue().enqueue(kernel::transform, out, in, transform, + inverse, perspective); + break; + case AF_INTERP_BILINEAR: + getQueue().enqueue(kernel::transform, out, in, transform, + inverse, perspective); + break; + case AF_INTERP_LOWER : + getQueue().enqueue(kernel::transform, out, in, transform, + inverse, perspective); + break; + default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); break; } - template - Array transform(const Array &in, const Array &transform, const af::dim4 &odims, - const af_interp_type method, const bool inverse) - { - const af::dim4 idims = in.dims(); - - Array out = createEmptyArray(odims); - - switch(method) { - case AF_INTERP_NEAREST: - transform_ - (out.get(), in.get(), transform.get(), odims, idims, - out.strides(), in.strides(), transform.strides(), inverse); - break; - case AF_INTERP_BILINEAR: - transform_ - (out.get(), in.get(), transform.get(), odims, idims, - out.strides(), in.strides(), transform.strides(), inverse); - break; - case AF_INTERP_LOWER: - transform_ - (out.get(), in.get(), transform.get(), odims, idims, - out.strides(), in.strides(), transform.strides(), inverse); - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; - } + return out; +} - return out; - } +#define INSTANTIATE(T) \ +template Array transform(const Array &in, const Array &transform, \ + 
const af::dim4 &odims, const af_interp_type method, \ + const bool inverse, const bool perspective); -#define INSTANTIATE(T) \ - template Array transform(const Array &in, const Array &transform, \ - const af::dim4 &odims, const af_interp_type method, \ - const bool inverse); +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) +INSTANTIATE(ushort) - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(short) - INSTANTIATE(ushort) } diff --git a/src/backend/cpu/transform.hpp b/src/backend/cpu/transform.hpp index f9e730b1d4..ad4ebba5c3 100644 --- a/src/backend/cpu/transform.hpp +++ b/src/backend/cpu/transform.hpp @@ -14,5 +14,5 @@ namespace cpu { template Array transform(const Array &in, const Array &tf, const af::dim4 &odims, - const af_interp_type method, const bool inverse); + const af_interp_type method, const bool inverse, const bool perspective); } diff --git a/src/backend/cpu/transform_interp.hpp b/src/backend/cpu/transform_interp.hpp index 5ad47507b2..d8b9ee2a06 100644 --- a/src/backend/cpu/transform_interp.hpp +++ b/src/backend/cpu/transform_interp.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include #include @@ -27,15 +29,27 @@ namespace cpu void transform_n(T *out, const T *in, const float *tmat, const af::dim4 &idims, const af::dim4 &ostrides, const af::dim4 &istrides, const dim_t nimages, const dim_t o_offset, - const dim_t xx, const dim_t yy) + const dim_t xx, const dim_t yy, const bool perspective) { + dim_t yi = 0, xi = 0; // Compute output index - const dim_t xi = round(xx * tmat[0] - + yy * tmat[1] - + tmat[2]); - const 
dim_t yi = round(xx * tmat[3] - + yy * tmat[4] - + tmat[5]); + if (perspective) { + const float W = xx * tmat[6] + yy * tmat[7] + tmat[8]; + xi = round((xx * tmat[0] + + yy * tmat[1] + + tmat[2]) / W); + yi = round((xx * tmat[3] + + yy * tmat[4] + + tmat[5]) / W); + } + else { + xi = round(xx * tmat[0] + + yy * tmat[1] + + tmat[2]); + yi = round(xx * tmat[3] + + yy * tmat[4] + + tmat[5]); + } // Compute memory location of indices dim_t loci = (yi * istrides[1] + xi); @@ -62,16 +76,28 @@ namespace cpu void transform_b(T *out, const T *in, const float *tmat, const af::dim4 &idims, const af::dim4 &ostrides, const af::dim4 &istrides, const dim_t nimages, const dim_t o_offset, - const dim_t xx, const dim_t yy) + const dim_t xx, const dim_t yy, const bool perspective) { dim_t loco = (yy * ostrides[1] + xx); // Compute input index - const float xi = xx * tmat[0] - + yy * tmat[1] - + tmat[2]; - const float yi = xx * tmat[3] - + yy * tmat[4] - + tmat[5]; + float xi = 0.0f, yi = 0.0f; + if (perspective) { + const float W = xx * tmat[6] + yy * tmat[7] + tmat[8]; + xi = (xx * tmat[0] + + yy * tmat[1] + + tmat[2]) / W; + yi = (xx * tmat[3] + + yy * tmat[4] + + tmat[5]) / W; + } + else { + xi = xx * tmat[0] + + yy * tmat[1] + + tmat[2]; + yi = xx * tmat[3] + + yy * tmat[4] + + tmat[5]; + } if (xi < -0.0001 || yi < -0.0001 || idims[0] < xi || idims[1] < yi) { for(int i_idx = 0; i_idx < (int)nimages; i_idx++) { @@ -126,15 +152,27 @@ namespace cpu void transform_l(T *out, const T *in, const float *tmat, const af::dim4 &idims, const af::dim4 &ostrides, const af::dim4 &istrides, const dim_t nimages, const dim_t o_offset, - const dim_t xx, const dim_t yy) + const dim_t xx, const dim_t yy, const bool perspective) { // Compute output index - const dim_t xi = floor(xx * tmat[0] - + yy * tmat[1] - + tmat[2]); - const dim_t yi = floor(xx * tmat[3] - + yy * tmat[4] - + tmat[5]); + dim_t xi = 0, yi = 0; + if (perspective) { + const float W = xx * tmat[6] + yy * tmat[7] + tmat[8]; + xi = 
floor((xx * tmat[0] + + yy * tmat[1] + + tmat[2]) / W); + yi = floor((xx * tmat[3] + + yy * tmat[4] + + tmat[5]) / W); + } + else { + xi = floor(xx * tmat[0] + + yy * tmat[1] + + tmat[2]); + yi = floor(xx * tmat[3] + + yy * tmat[4] + + tmat[5]); + } // Compute memory location of indices dim_t loci = (yi * istrides[1] + xi); diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp index bea0aa0d6f..a6d410757b 100644 --- a/src/backend/cpu/transpose.cpp +++ b/src/backend/cpu/transpose.cpp @@ -12,7 +12,8 @@ #include #include #include - +#include +#include #include #include @@ -21,128 +22,26 @@ using af::dim4; namespace cpu { -static inline unsigned getIdx(const dim4 &strides, - int i, int j = 0, int k = 0, int l = 0) -{ - return (l * strides[3] + - k * strides[2] + - j * strides[1] + - i ); -} - -template -T getConjugate(const T &in) -{ - // For non-complex types return same - return in; -} - -template<> -cfloat getConjugate(const cfloat &in) -{ - return std::conj(in); -} - -template<> -cdouble getConjugate(const cdouble &in) -{ - return std::conj(in); -} - -template -void transpose_(T *out, const T *in, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides) -{ - for (dim_t l = 0; l < odims[3]; ++l) { - for (dim_t k = 0; k < odims[2]; ++k) { - // Outermost loop handles batch mode - // if input has no data along third dimension - // this loop runs only once - for (dim_t j = 0; j < odims[1]; ++j) { - for (dim_t i = 0; i < odims[0]; ++i) { - // calculate array indices based on offsets and strides - // the helper getIdx takes care of indices - const dim_t inIdx = getIdx(istrides,j,i,k,l); - const dim_t outIdx = getIdx(ostrides,i,j,k,l); - if(conjugate) - out[outIdx] = getConjugate(in[inIdx]); - else - out[outIdx] = in[inIdx]; - } - } - // outData and inData pointers doesn't need to be - // offset as the getIdx function is taking care - // of the batch parameter - } - } -} - template Array transpose(const Array 
&in, const bool conjugate) { - const dim4 inDims = in.dims(); - - dim4 outDims = dim4(inDims[1],inDims[0],inDims[2],inDims[3]); + in.eval(); + const dim4 inDims = in.dims(); + const dim4 outDims = dim4(inDims[1],inDims[0],inDims[2],inDims[3]); // create an array with first two dimensions swapped Array out = createEmptyArray(outDims); - // get data pointers for input and output Arrays - T* outData = out.get(); - const T* inData = in.get(); - - if(conjugate) { - transpose_(outData, inData, - out.dims(), in.dims(), out.strides(), in.strides()); - } else { - transpose_(outData, inData, - out.dims(), in.dims(), out.strides(), in.strides()); - } + getQueue().enqueue(kernel::transpose, out, in, conjugate); return out; } -template -void transpose_inplace(T *in, const af::dim4 &idims, const af::dim4 &istrides) -{ - for (dim_t l = 0; l < idims[3]; ++l) { - for (dim_t k = 0; k < idims[2]; ++k) { - // Outermost loop handles batch mode - // if input has no data along third dimension - // this loop runs only once - // - // Run only bottom triangle. 
std::swap swaps with upper triangle - for (dim_t j = 0; j < idims[1]; ++j) { - for (dim_t i = j + 1; i < idims[0]; ++i) { - // calculate array indices based on offsets and strides - // the helper getIdx takes care of indices - const dim_t iIdx = getIdx(istrides,j,i,k,l); - const dim_t oIdx = getIdx(istrides,i,j,k,l); - if(conjugate) { - in[iIdx] = getConjugate(in[iIdx]); - in[oIdx] = getConjugate(in[oIdx]); - std::swap(in[iIdx], in[oIdx]); - } - else { - std::swap(in[iIdx], in[oIdx]); - } - } - } - } - } -} - template void transpose_inplace(Array &in, const bool conjugate) { - // get data pointers for input and output Arrays - T* inData = in.get(); - - if(conjugate) { - transpose_inplace(inData, in.dims(), in.strides()); - } else { - transpose_inplace(inData, in.dims(), in.strides()); - } + in.eval(); + getQueue().enqueue(kernel::transpose_inplace, in, conjugate); } #define INSTANTIATE(T) \ @@ -162,5 +61,4 @@ INSTANTIATE(uintl ) INSTANTIATE(short) INSTANTIATE(ushort) - } diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp index 6b0f326aad..eaad1b9f86 100644 --- a/src/backend/cpu/triangle.cpp +++ b/src/backend/cpu/triangle.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include namespace cpu { @@ -19,47 +21,14 @@ namespace cpu template void triangle(Array &out, const Array &in) { - T *o = out.get(); - const T *i = in.get(); - - dim4 odm = out.dims(); - - dim4 ost = out.strides(); - dim4 ist = in.strides(); - - for(dim_t ow = 0; ow < odm[3]; ow++) { - const dim_t oW = ow * ost[3]; - const dim_t iW = ow * ist[3]; - - for(dim_t oz = 0; oz < odm[2]; oz++) { - const dim_t oZW = oW + oz * ost[2]; - const dim_t iZW = iW + oz * ist[2]; - - for(dim_t oy = 0; oy < odm[1]; oy++) { - const dim_t oYZW = oZW + oy * ost[1]; - const dim_t iYZW = iZW + oy * ist[1]; - - for(dim_t ox = 0; ox < odm[0]; ox++) { - const dim_t oMem = oYZW + ox; - const dim_t iMem = iYZW + ox; - - bool cond = is_upper ? 
(oy >= ox) : (oy <= ox); - bool do_unit_diag = (is_unit_diag && ox == oy); - if(cond) { - o[oMem] = do_unit_diag ? scalar(1) : i[iMem]; - } else { - o[oMem] = scalar(0); - } - - } - } - } - } + in.eval(); + getQueue().enqueue(kernel::triangle, out, in); } template Array triangle(const Array &in) { + in.eval(); Array out = createEmptyArray(in.dims()); triangle(out, in); return out; @@ -75,17 +44,17 @@ Array triangle(const Array &in) template Array triangle(const Array &in); \ template Array triangle(const Array &in); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(char) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(char) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) } diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp index f9c25f9a9e..d19286f496 100644 --- a/src/backend/cpu/unwrap.cpp +++ b/src/backend/cpu/unwrap.cpp @@ -9,116 +9,58 @@ #include #include -#include -#include #include #include +#include +#include namespace cpu { - template - void unwrap_dim(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy, - const dim_t px, const dim_t py) - { - dim_t nx = (idims[0] + 2 * px - wx) / sx + 1; - for(dim_t w = 0; w < odims[3]; w++) { - for(dim_t z = 0; z < odims[2]; z++) { - - dim_t cOut = w * ostrides[3] + z * ostrides[2]; - dim_t cIn = w * istrides[3] + z * istrides[2]; - const T* iptr = inPtr + cIn; - T* optr_= outPtr + cOut; - - for(dim_t col = 0; col < odims[d]; col++) { - // Offset output ptr - T* optr = optr_ + col * ostrides[d]; - - // Calculate input window index - 
dim_t winy = (col / nx); - dim_t winx = (col % nx); - - dim_t startx = winx * sx; - dim_t starty = winy * sy; - - dim_t spx = startx - px; - dim_t spy = starty - py; - - // Short cut condition ensuring all values within input dimensions - bool cond = (spx >= 0 && spx + wx < idims[0] && spy >= 0 && spy + wy < idims[1]); +template +Array unwrap(const Array &in, const dim_t wx, const dim_t wy, + const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column) +{ + in.eval(); - for(dim_t y = 0; y < wy; y++) { - for(dim_t x = 0; x < wx; x++) { - dim_t xpad = spx + x; - dim_t ypad = spy + y; + af::dim4 idims = in.dims(); + dim_t nx = (idims[0] + 2 * px - wx) / sx + 1; + dim_t ny = (idims[1] + 2 * py - wy) / sy + 1; - dim_t oloc = (y * wx + x); - if (d == 0) oloc *= ostrides[1]; + af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]); - if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) { - dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]); - optr[oloc] = iptr[iloc]; - } else { - optr[oloc] = scalar(0.0); - } - } - } - } - } - } + if (!is_column) { + std::swap(odims[0], odims[1]); } - template - Array unwrap(const Array &in, const dim_t wx, const dim_t wy, - const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column) - { - af::dim4 idims = in.dims(); - - dim_t nx = (idims[0] + 2 * px - wx) / sx + 1; - dim_t ny = (idims[1] + 2 * py - wy) / sy + 1; + Array outArray = createEmptyArray(odims); - af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]); - - if (!is_column) { - std::swap(odims[0], odims[1]); - } - - // Create output placeholder - Array outArray = createEmptyArray(odims); - - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = outArray.get(); - - af::dim4 ostrides = outArray.strides(); - af::dim4 istrides = in.strides(); - - if (is_column) { - unwrap_dim(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py); - } else { - unwrap_dim(outPtr, inPtr, odims, 
idims, ostrides, istrides, wx, wy, sx, sy, px, py); - } - return outArray; + if (is_column) { + getQueue().enqueue(kernel::unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); + } else { + getQueue().enqueue(kernel::unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); } + return outArray; +} + #define INSTANTIATE(T) \ template Array unwrap (const Array &in, const dim_t wx, const dim_t wy, \ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column); - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(short) - INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) +INSTANTIATE(ushort) + } diff --git a/src/backend/cpu/utility.hpp b/src/backend/cpu/utility.hpp new file mode 100644 index 0000000000..68cef5a440 --- /dev/null +++ b/src/backend/cpu/utility.hpp @@ -0,0 +1,63 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include +#include "backend.hpp" + +namespace cpu +{ + +static inline +dim_t trimIndex(int const & idx, dim_t const & len) +{ + int ret_val = idx; + int offset = abs(ret_val)%len; + if (ret_val<0) { + ret_val = offset-1; + } else if (ret_val>=(int)len) { + ret_val = len-offset-1; + } + return ret_val; +} + +static inline +dim_t clamp(dim_t a, dim_t mn, dim_t mx) +{ + return (amx ? 
mx : a)); +} + +static inline +unsigned getIdx(af::dim4 const & strides, int i, int j = 0, int k = 0, int l = 0) +{ + return (l * strides[3] + k * strides[2] + j * strides[1] + i * strides[0]); +} + +template +void gaussian1D(T* out, int const dim, double sigma=0.0) +{ + if(!(sigma>0)) sigma = 0.25*dim; + + T sum = (T)0; + for(int i=0;i #include #include +#include using af::dim4; namespace cpu { - template - Array where(const Array &in) - { - const dim_t *dims = in.dims().get(); - const dim_t *strides = in.strides().get(); - static const T zero = scalar(0); - const T *iptr = in.get(); - uint *out_vec = memAlloc(in.elements()); +template +Array where(const Array &in) +{ + in.eval(); + getQueue().sync(); + + const dim_t *dims = in.dims().get(); + const dim_t *strides = in.strides().get(); + static const T zero = scalar(0); + + const T *iptr = in.get(); + uint *out_vec = memAlloc(in.elements()); - dim_t count = 0; - dim_t idx = 0; - for (dim_t w = 0; w < dims[3]; w++) { - uint offw = w * strides[3]; + dim_t count = 0; + dim_t idx = 0; + for (dim_t w = 0; w < dims[3]; w++) { + uint offw = w * strides[3]; - for (dim_t z = 0; z < dims[2]; z++) { - uint offz = offw + z * strides[2]; + for (dim_t z = 0; z < dims[2]; z++) { + uint offz = offw + z * strides[2]; - for (dim_t y = 0; y < dims[1]; y++) { - uint offy = y * strides[1] + offz; + for (dim_t y = 0; y < dims[1]; y++) { + uint offy = y * strides[1] + offz; - for (dim_t x = 0; x < dims[0]; x++) { + for (dim_t x = 0; x < dims[0]; x++) { - T val = iptr[offy + x]; - if (val != zero) { - out_vec[count] = idx; - count++; - } - idx++; + T val = iptr[offy + x]; + if (val != zero) { + out_vec[count] = idx; + count++; } + idx++; } } } - - Array out = createDeviceDataArray(dim4(count), out_vec); - return out; } + Array out = createDeviceDataArray(dim4(count), out_vec); + return out; +} + #define INSTANTIATE(T) \ template Array where(const Array &in); \ - INSTANTIATE(float ) - INSTANTIATE(cfloat ) - INSTANTIATE(double ) - 
INSTANTIATE(cdouble) - INSTANTIATE(char ) - INSTANTIATE(int ) - INSTANTIATE(uint ) - INSTANTIATE(intl ) - INSTANTIATE(uintl ) - INSTANTIATE(uchar ) - INSTANTIATE(short ) - INSTANTIATE(ushort ) +INSTANTIATE(float ) +INSTANTIATE(cfloat ) +INSTANTIATE(double ) +INSTANTIATE(cdouble) +INSTANTIATE(char ) +INSTANTIATE(int ) +INSTANTIATE(uint ) +INSTANTIATE(intl ) +INSTANTIATE(uintl ) +INSTANTIATE(uchar ) +INSTANTIATE(short ) +INSTANTIATE(ushort ) } diff --git a/src/backend/cpu/wrap.cpp b/src/backend/cpu/wrap.cpp index a04a6f5250..8e0f6fe2f7 100644 --- a/src/backend/cpu/wrap.cpp +++ b/src/backend/cpu/wrap.cpp @@ -9,95 +9,37 @@ #include #include -#include -#include #include #include +#include +#include namespace cpu { - template - void wrap_dim(T *outPtr, const T *inPtr, - const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t wx, const dim_t wy, - const dim_t sx, const dim_t sy, - const dim_t px, const dim_t py) - { - dim_t nx = (odims[0] + 2 * px - wx) / sx + 1; - - for(dim_t w = 0; w < idims[3]; w++) { - for(dim_t z = 0; z < idims[2]; z++) { - - dim_t cIn = w * istrides[3] + z * istrides[2]; - dim_t cOut = w * ostrides[3] + z * ostrides[2]; - const T* iptr_ = inPtr + cIn; - T* optr= outPtr + cOut; - - for(dim_t col = 0; col < idims[d]; col++) { - // Offset output ptr - const T* iptr = iptr_ + col * istrides[d]; - - // Calculate input window index - dim_t winy = (col / nx); - dim_t winx = (col % nx); - - dim_t startx = winx * sx; - dim_t starty = winy * sy; - - dim_t spx = startx - px; - dim_t spy = starty - py; - - // Short cut condition ensuring all values within input dimensions - bool cond = (spx >= 0 && spx + wx < odims[0] && spy >= 0 && spy + wy < odims[1]); - - for(dim_t y = 0; y < wy; y++) { - for(dim_t x = 0; x < wx; x++) { - dim_t xpad = spx + x; - dim_t ypad = spy + y; +template +Array wrap(const Array &in, + const dim_t ox, const dim_t oy, + const dim_t wx, const dim_t wy, + const dim_t sx, const dim_t 
sy, + const dim_t px, const dim_t py, + const bool is_column) +{ + af::dim4 idims = in.dims(); + af::dim4 odims(ox, oy, idims[2], idims[3]); - dim_t iloc = (y * wx + x); - if (d == 0) iloc *= istrides[1]; + Array out = createValueArray(odims, scalar(0)); + out.eval(); + in.eval(); - if(cond || (xpad >= 0 && xpad < odims[0] && ypad >= 0 && ypad < odims[1])) { - dim_t oloc = (ypad * ostrides[1] + xpad * ostrides[0]); - // FIXME: When using threads, atomize this - optr[oloc] += iptr[iloc]; - } - } - } - } - } - } + if (is_column) { + getQueue().enqueue(kernel::wrap_dim, out, in, wx, wy, sx, sy, px, py); + } else { + getQueue().enqueue(kernel::wrap_dim, out, in, wx, wy, sx, sy, px, py); } - template - Array wrap(const Array &in, - const dim_t ox, const dim_t oy, - const dim_t wx, const dim_t wy, - const dim_t sx, const dim_t sy, - const dim_t px, const dim_t py, - const bool is_column) - { - af::dim4 idims = in.dims(); - af::dim4 odims(ox, oy, idims[2], idims[3]); - Array out = createValueArray(odims, scalar(0)); - - const T *inPtr = in.get(); - T *outPtr = out.get(); - - af::dim4 istrides = in.strides(); - af::dim4 ostrides = out.strides(); - - if (is_column) { - wrap_dim(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py); - } else { - wrap_dim(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py); - } - - return out; - } + return out; +} #define INSTANTIATE(T) \ @@ -108,17 +50,17 @@ namespace cpu const dim_t px, const dim_t py, \ const bool is_column); +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) +INSTANTIATE(ushort) - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(short) - 
INSTANTIATE(ushort) } diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp index 275ea13a99..786574129b 100644 --- a/src/backend/cuda/Array.cpp +++ b/src/backend/cuda/Array.cpp @@ -16,6 +16,7 @@ #include #include #include +#include using af::dim4; @@ -29,17 +30,17 @@ namespace cuda template Array::Array(af::dim4 dims) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(memAlloc(dims.elements()), memFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) {} template Array::Array(af::dim4 dims, const T * const in_data, bool is_device, bool copy_device) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(((is_device & !copy_device) ? 
(T *)in_data : memAlloc(dims.elements())), memFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) { #if __cplusplus > 199711L static_assert(std::is_standard_layout>::value, "Array must be a standard layout type"); @@ -57,34 +58,51 @@ namespace cuda } template - Array::Array(const Array& parent, const dim4 &dims, const dim4 &offsets, const dim4 &strides) : - info(parent.getDevId(), dims, offsets, strides, (af_dtype)dtype_traits::af_type), + Array::Array(const Array& parent, const dim4 &dims, const dim_t &offset_, const dim4 &strides) : + info(parent.getDevId(), dims, offset_, strides, (af_dtype)dtype_traits::af_type), data(parent.getData()), data_dims(parent.getDataDims()), node(), - offset(parent.getOffset() + calcOffset(parent.strides(), offsets)), ready(true), owner(false) { } template Array::Array(Param &tmp) : - info(getActiveDeviceId(), af::dim4(tmp.dims[0], tmp.dims[1], tmp.dims[2], tmp.dims[3]), - af::dim4(0, 0, 0, 0), - af::dim4(tmp.strides[0], tmp.strides[1], tmp.strides[2], tmp.strides[3]), - (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), + af::dim4(tmp.dims[0], tmp.dims[1], tmp.dims[2], tmp.dims[3]), + 0, + af::dim4(tmp.strides[0], tmp.strides[1], tmp.strides[2], tmp.strides[3]), + (af_dtype)dtype_traits::af_type), data(tmp.ptr, memFree), data_dims(af::dim4(tmp.dims[0], tmp.dims[1], tmp.dims[2], tmp.dims[3])), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) { } template Array::Array(af::dim4 dims, JIT::Node_ptr n) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(), data_dims(dims), - node(n), offset(0), ready(false), owner(true) + node(n), ready(false), owner(true) { } + template + Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset_, + const T * const in_data, bool is_device) : + 
info(getActiveDeviceId(), dims, offset_, strides, (af_dtype)dtype_traits::af_type), + data(is_device ? (T*)in_data : memAlloc(info.total()), memFree), + data_dims(dims), + node(), + ready(true), + owner(true) + { + if (!is_device) { + cudaStream_t stream = getStream(getActiveDeviceId()); + CUDA_CHECK(cudaMemcpyAsync(data.get(), in_data, info.total() * sizeof(T), + cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + } template void Array::eval() @@ -148,9 +166,9 @@ namespace cuda n->getInfo(length, buf_count, bytes); n->resetFlags(); - if (length > MAX_JIT_LEN || - buf_count >= MAX_BUFFERS || - bytes >= MAX_BYTES) { + if (length > getMaxJitSize() || + buf_count >= getMaxBuffers() || + bytes >= getMaxBytes()) { out.eval(); } @@ -197,18 +215,23 @@ namespace cuda dim4 dDims = parent.getDataDims(); dim4 pDims = parent.dims(); - dim4 dims = toDims (index, pDims); - dim4 offset = toOffset(index, dDims); - dim4 stride = toStride (index, dDims); + dim4 dims = toDims (index, pDims); + dim4 strides = toStride (index, dDims); + + // Find total offsets after indexing + dim4 offsets = toOffset(index, pDims); + dim4 parent_strides = parent.strides(); + dim_t offset = parent.getOffset(); + for (int i = 0; i < 4; i++) offset += offsets[i] * parent_strides[i]; - Array out = Array(parent, dims, offset, stride); + Array out = Array(parent, dims, offset, strides); if (!copy) return out; - if (stride[0] != 1 || - stride[1] < 0 || - stride[2] < 0 || - stride[3] < 0) { + if (strides[0] != 1 || + strides[1] < 0 || + strides[2] < 0 || + strides[3] < 0) { out = copyArray(out); } @@ -228,23 +251,17 @@ namespace cuda delete A; } - template - void evalArray(const Array &A) - { - A.eval(); - } - template void writeHostDataArray(Array &arr, const T * const data, const size_t bytes) { if (!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); + arr = copyArray(arr); } T *ptr = arr.get(); - CUDA_CHECK(cudaMemcpyAsync(ptr + arr.getOffset(), data, bytes, 
cudaMemcpyHostToDevice, + CUDA_CHECK(cudaMemcpyAsync(ptr, data, bytes, cudaMemcpyHostToDevice, cuda::getStream(cuda::getActiveDeviceId()))); CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId()))); @@ -256,12 +273,12 @@ namespace cuda writeDeviceDataArray(Array &arr, const void * const data, const size_t bytes) { if (!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); + arr = copyArray(arr); } T *ptr = arr.get(); - CUDA_CHECK(cudaMemcpyAsync(ptr + arr.getOffset(), data, + CUDA_CHECK(cudaMemcpyAsync(ptr, data, bytes, cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId()))); @@ -279,11 +296,14 @@ namespace cuda const std::vector &index, \ bool copy); \ template void destroyArray (Array *A); \ - template void evalArray (const Array &A); \ template Array createNodeArray (const dim4 &size, JIT::Node_ptr node); \ + template Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset, \ + const T * const in_data, \ + bool is_device); \ template Array::Array(af::dim4 dims, const T * const in_data, \ bool is_device, bool copy_device); \ template Array::~Array (); \ + template Node_ptr Array::getNode() const; \ template void Array::eval(); \ template void Array::eval() const; \ template void writeHostDataArray (Array &arr, const T * const data, const size_t bytes); \ diff --git a/src/backend/cuda/Array.hpp b/src/backend/cuda/Array.hpp index 598fdfd35e..7678754bc3 100644 --- a/src/backend/cuda/Array.hpp +++ b/src/backend/cuda/Array.hpp @@ -78,9 +78,6 @@ namespace cuda const std::vector &index, bool copy=true); - template - void evalArray(const Array &A); - // Creates a new Array object on the heap and returns a reference to it. 
template void destroyArray(Array *A); @@ -89,10 +86,16 @@ namespace cuda void *getDevicePtr(const Array& arr) { T *ptr = arr.device(); - memPop(ptr); + memLock(ptr); return (void *)ptr; } + template + void *getRawPtr(const Array& arr) + { + return (void *)(arr.get(false)); + } + template class Array { @@ -101,17 +104,20 @@ namespace cuda af::dim4 data_dims; JIT::Node_ptr node; - dim_t offset; bool ready; bool owner; Array(af::dim4 dims); + explicit Array(af::dim4 dims, const T * const in_data, bool is_device = false, bool copy_device = false); - Array(const Array& parnt, const dim4 &dims, const dim4 &offset, const dim4 &stride); + Array(const Array& parnt, const dim4 &dims, const dim_t &offset, const dim4 &stride); Array(Param &tmp); Array(af::dim4 dims, JIT::Node_ptr n); public: + Array(af::dim4 dims, af::dim4 strides, dim_t offset, + const T * const in_data, bool is_device = false); + void resetInfo(const af::dim4& dims) { info.resetInfo(dims); } void resetDims(const af::dim4& dims) { info.resetDims(dims); } void modDims(const af::dim4 &newDims) { info.modDims(newDims); } @@ -122,7 +128,6 @@ namespace cuda RET_TYPE NAME() const { return info.NAME(); } INFO_FUNC(const af_dtype& ,getType) - INFO_FUNC(const af::dim4& ,offsets) INFO_FUNC(const af::dim4& ,strides) INFO_FUNC(size_t ,elements) INFO_FUNC(size_t ,ndims) @@ -159,7 +164,7 @@ namespace cuda void eval(); void eval() const; - dim_t getOffset() const { return offset; } + dim_t getOffset() const { return info.getOffset(); } shared_ptr getData() const { return data; } dim4 getDataDims() const @@ -169,6 +174,11 @@ namespace cuda return isOwner() ? dims() : data_dims; } + void setDataDims(const dim4 &new_dims) + { + data_dims = new_dims; + } + T* device() { if (!isOwner() || data.use_count() > 1) { @@ -192,7 +202,7 @@ namespace cuda const T* get(bool withOffset = true) const { if (!isReady()) eval(); - return data.get() + (withOffset ? offset : 0); + return data.get() + (withOffset ? 
getOffset() : 0); } int useCount() const @@ -234,8 +244,8 @@ namespace cuda bool copy); friend void destroyArray(Array *arr); - friend void evalArray(const Array &arr); friend void *getDevicePtr(const Array& arr); + friend void *getRawPtr(const Array& arr); }; } diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index bb8fca013c..ae0690dba2 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -18,6 +18,7 @@ IF( CUDA_COMPUTE_20 OR CUDA_COMPUTE_30 OR CUDA_COMPUTE_32 OR CUDA_COMPUTE_35 + OR CUDA_COMPUTE_37 OR CUDA_COMPUTE_50 OR CUDA_COMPUTE_52 OR CUDA_COMPUTE_53 @@ -44,11 +45,12 @@ MACRO(SET_COMPUTE VERSION) SET(CUDA_GENERATE_CODE_${VERSION} "-gencode arch=compute_${VERSION},code=sm_${VERSION}") SET(CUDA_GENERATE_CODE ${CUDA_GENERATE_CODE} ${CUDA_GENERATE_CODE_${VERSION}}) LIST(APPEND COMPUTE_VERSIONS "${VERSION}") + ADD_DEFINITIONS(-DCUDA_COMPUTE_${VERSION}) MESSAGE(STATUS "Setting Compute ${VERSION} to ON") ENDMACRO(SET_COMPUTE) # Iterate over compute versions. Create variables and enable computes if needed -FOREACH(VER 20 30 32 35 50 52 53) +FOREACH(VER 20 30 32 35 37 50 52 53) OPTION(CUDA_COMPUTE_${VER} "CUDA Compute Capability ${VER}" OFF) MARK_AS_ADVANCED(CUDA_COMPUTE_${VER}) IF(${CUDA_COMPUTE_${VER}}) @@ -57,8 +59,9 @@ FOREACH(VER 20 30 32 35 50 52 53) ENDFOREACH() IF(UNIX) + # Forcing STRICT ANSI should resolve a bunch of issues that NVIDIA seems to face with GCC compilers. 
+ ADD_DEFINITIONS(-D__STRICT_ANSI__) SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fvisibility=hidden) - REMOVE_DEFINITIONS(-std=c++0x) IF(${WITH_COVERAGE}) SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fprofile-arcs -Xcompiler -ftest-coverage -Xlinker -fprofile-arcs -Xlinker -ftest-coverage") ENDIF(${WITH_COVERAGE}) @@ -68,50 +71,70 @@ ENDIF() ADD_DEFINITIONS(-DAF_CUDA) -IF(${CUDA_VERSION_MAJOR} LESS 7) +# CMake 3.2 Adds CUDA_cusolver_LIBRARY variable to FindCUDA +# Older version, use FIND_LIBRARY +IF(CMAKE_VERSION VERSION_LESS 3.2) + IF(${CUDA_cusolver_LIBRARY} MATCHES " ") + UNSET(CUDA_cusolver_LIBRARY CACHE) # When going from higher version to lower version + ENDIF() + FIND_LIBRARY ( + CUDA_cusolver_LIBRARY + NAMES "cusolver" + PATHS ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES "lib64" "lib/x64" "lib" + DOC "CUDA cusolver Library" + NO_DEFAULT_PATH + ) +ENDIF(CMAKE_VERSION VERSION_LESS 3.2) + +IF(${CUDA_VERSION_MAJOR} LESS 7 AND CUDA_cusolver_LIBRARY) + UNSET(CUDA_cusolver_LIBRARY CACHE) # Failsafe when going from higher version to lower version +ENDIF() + +IF(CUDA_cusolver_LIBRARY) + MESSAGE(STATUS "CUDA cusolver library available in CUDA Version ${CUDA_VERSION_STRING}") + ADD_DEFINITIONS(-DWITH_CUDA_LINEAR_ALGEBRA) +ELSE(CUDA_cusolver_LIBRARY) # Use CPU Lapack as fallback? 
- OPTION(CUDA_LAPACK_CPU_FALLBACK "Use CPU LAPACK as fallback for CUDA LAPACK when CUDA is 6.5 or older" OFF) + OPTION(CUDA_LAPACK_CPU_FALLBACK "Use CPU LAPACK as fallback for CUDA LAPACK when cusolver is not available" OFF) MARK_AS_ADVANCED(CUDA_LAPACK_CPU_FALLBACK) IF(${CUDA_LAPACK_CPU_FALLBACK}) ## Try to use CPU side lapack IF(APPLE) - FIND_PACKAGE(LAPACK) + FIND_PACKAGE(LAPACKE QUIET) # For finding MKL + IF(NOT LAPACK_FOUND) + # UNSET THE VARIABLES FROM LAPACKE + UNSET(LAPACKE_LIB CACHE) + UNSET(LAPACK_LIB CACHE) + UNSET(LAPACKE_INCLUDES CACHE) + UNSET(LAPACKE_ROOT_DIR CACHE) + FIND_PACKAGE(LAPACK) + ENDIF() ELSE(APPLE) # Linux and Windows FIND_PACKAGE(LAPACKE) ENDIF(APPLE) IF(NOT LAPACK_FOUND) - MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. Linear Algebra will not be available.") + MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cusolver library. Linear Algebra will not be available.") ELSE(NOT LAPACK_FOUND) - MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. But CPU LAPACK libraries are available. Will fallback to using host side code.") + MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cusolver library. But CPU LAPACK libraries are available. Will fallback to using host side code.") ADD_DEFINITIONS(-DWITH_CPU_LINEAR_ALGEBRA) - IF(USE_CUDA_MKL) - MESSAGE("Using MKL") + IF(USE_CUDA_MKL) # Manual MKL Setup + MESSAGE("CUDA LAPACK CPU Fallback Using MKL") ADD_DEFINITIONS(-DUSE_MKL) + ELSE(USE_CUDA_MKL) + IF(${MKL_FOUND}) # Automatic MKL Setup from BLAS + MESSAGE("CUDA LAPACK CPU Fallback Using MKL RT") + ADD_DEFINITIONS(-DUSE_MKL) + ENDIF() ENDIF() ENDIF() ELSE() - MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. Linear Algebra will not be available.") + MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cusolver library. 
Linear Algebra will not be available.") ENDIF() - IF(CMAKE_VERSION VERSION_LESS 3.2) - SET(CUDA_cusolver_LIBRARY) - MARK_AS_ADVANCED(CUDA_cusolver_LIBRARY) - ENDIF(CMAKE_VERSION VERSION_LESS 3.2) -ELSE(${CUDA_VERSION_MAJOR} LESS 7) - MESSAGE(STATUS "CUDA cusolver library available in CUDA Version ${CUDA_VERSION_STRING}") - ADD_DEFINITIONS(-DWITH_CUDA_LINEAR_ALGEBRA) - IF(CMAKE_VERSION VERSION_LESS 3.2) - FIND_LIBRARY( - CUDA_cusolver_LIBRARY - NAMES "cusolver" - PATHS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES "lib64" "lib/x64" "lib" - DOC "CUDA cusolver Library" - NO_DEFAULT_PATH - ) - ENDIF(CMAKE_VERSION VERSION_LESS 3.2) -ENDIF(${CUDA_VERSION_MAJOR} LESS 7) + UNSET(CUDA_cusolver_LIBRARY CACHE) # Failsafe when going from higher version to lower version +ENDIF(CUDA_cusolver_LIBRARY) INCLUDE_DIRECTORIES( ${CMAKE_INCLUDE_PATH} @@ -308,7 +331,6 @@ ADD_DEPENDENCIES(afcuda ${ptx_targets}) TARGET_LINK_LIBRARIES(afcuda PRIVATE ${CUDA_CUBLAS_LIBRARIES} PRIVATE ${CUDA_LIBRARIES} - PRIVATE ${CUDA_cusolver_LIBRARY} PRIVATE ${FreeImage_LIBS} PRIVATE ${CUDA_CUFFT_LIBRARIES} PRIVATE ${CUDA_NVVM_LIBRARIES} @@ -318,8 +340,10 @@ IF(FORGE_FOUND) TARGET_LINK_LIBRARIES(afcuda PRIVATE ${FORGE_LIBRARIES}) ENDIF() -IF(CUDA_LAPACK_CPU_FALLBACK) - TARGET_LINK_LIBRARIES(afcuda PRIVATE ${LAPACK_LIBRARIES}) +IF(CUDA_cusolver_LIBRARY) + TARGET_LINK_LIBRARIES(afcuda PRIVATE ${CUDA_cusolver_LIBRARY}) +ELSEIF(CUDA_LAPACK_CPU_FALLBACK) + TARGET_LINK_LIBRARIES(afcuda PRIVATE ${LAPACK_LIBRARIES}) ENDIF() SET_TARGET_PROPERTIES(afcuda PROPERTIES diff --git a/src/backend/cuda/JIT/BinaryNode.hpp b/src/backend/cuda/JIT/BinaryNode.hpp index 2a2abb0610..f916d85576 100644 --- a/src/backend/cuda/JIT/BinaryNode.hpp +++ b/src/backend/cuda/JIT/BinaryNode.hpp @@ -126,11 +126,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; - m_set_arg = false; + resetCommonFlags(); m_lhs->resetFlags(); m_rhs->resetFlags(); } diff --git 
a/src/backend/cuda/JIT/BufferNode.hpp b/src/backend/cuda/JIT/BufferNode.hpp index efe32f8b72..342e1ed0b7 100644 --- a/src/backend/cuda/JIT/BufferNode.hpp +++ b/src/backend/cuda/JIT/BufferNode.hpp @@ -178,12 +178,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; - m_gen_name = false; - m_set_arg = false; + resetCommonFlags(); } void setArgs(std::vector &args, bool is_linear) diff --git a/src/backend/cuda/JIT/Node.hpp b/src/backend/cuda/JIT/Node.hpp index e30a1cf63b..00fed9fda7 100644 --- a/src/backend/cuda/JIT/Node.hpp +++ b/src/backend/cuda/JIT/Node.hpp @@ -37,6 +37,19 @@ namespace JIT bool m_set_arg; bool m_gen_name; + protected: + + void resetCommonFlags() + { + m_set_id = false; + m_gen_func = false; + m_gen_param = false; + m_gen_offset = false; + m_set_arg = false; + m_gen_name = false; + } + + public: Node(const char *type_str, const char *name_str) @@ -62,7 +75,11 @@ namespace JIT virtual void setArgs(std::vector &args, bool is_linear) { m_set_arg = true; } virtual bool isLinear(dim_t dims[4]) { return true; } - virtual void resetFlags() {} + virtual void resetFlags() + { + resetCommonFlags(); + } + virtual void getInfo(unsigned &len, unsigned &buf_count, unsigned &bytes) { len = 0; diff --git a/src/backend/cuda/JIT/ScalarNode.hpp b/src/backend/cuda/JIT/ScalarNode.hpp index 288af4dcdb..34f316d34b 100644 --- a/src/backend/cuda/JIT/ScalarNode.hpp +++ b/src/backend/cuda/JIT/ScalarNode.hpp @@ -87,12 +87,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; - m_gen_name = false; - m_set_arg = false; + resetCommonFlags(); } void setArgs(std::vector &args, bool is_linear) diff --git a/src/backend/cuda/JIT/UnaryNode.hpp b/src/backend/cuda/JIT/UnaryNode.hpp index caa573104b..94ee96ece7 100644 --- a/src/backend/cuda/JIT/UnaryNode.hpp +++ b/src/backend/cuda/JIT/UnaryNode.hpp @@ -118,11 +118,7 @@ namespace JIT void 
resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; - m_set_arg = false; + resetCommonFlags(); m_child->resetFlags(); } diff --git a/src/backend/cuda/JIT/numeric.cu b/src/backend/cuda/JIT/numeric.cu index 8253db6d22..2bcb15a112 100644 --- a/src/backend/cuda/JIT/numeric.cu +++ b/src/backend/cuda/JIT/numeric.cu @@ -119,6 +119,19 @@ MATH_CAST(lgamma, intl , float) MATH_CAST(lgamma, ushort, float) MATH_CAST(lgamma, short , float) +MATH_NOOP(noop, float) +MATH_NOOP(noop, double) +MATH_NOOP(noop, cfloat) +MATH_NOOP(noop, cdouble) +MATH_NOOP(noop, int) +MATH_NOOP(noop, uint) +MATH_NOOP(noop, char) +MATH_NOOP(noop, uchar) +MATH_NOOP(noop, uintl) +MATH_NOOP(noop, intl) +MATH_NOOP(noop, ushort) +MATH_NOOP(noop, short) + __device__ float ___abs(cfloat a) { return cuCabsf(a); } __device__ double ___abs(cdouble a) { return cuCabs(a); } diff --git a/src/backend/cuda/blas.cpp b/src/backend/cuda/blas.cpp index 85f48da750..9d3b9ca7b7 100644 --- a/src/backend/cuda/blas.cpp +++ b/src/backend/cuda/blas.cpp @@ -18,6 +18,9 @@ #include #include #include +#include +#include +#include namespace cuda { @@ -197,40 +200,15 @@ Array matmul(const Array &lhs, const Array &rhs, } -template -Array dot_(const Array &lhs, const Array &rhs, - af_mat_prop optLhs, af_mat_prop optRhs) -{ - int N = lhs.dims()[0]; - - T out; - - CUBLAS_CHECK((dot_func()( - getHandle(), - N, - lhs.get(), lhs.strides()[0], - rhs.get(), rhs.strides()[0], - &out))); - - if(both_conjugate) - return createValueArray(af::dim4(1), conj(out)); - else - return createValueArray(af::dim4(1), out); -} - template Array dot(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { - if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) { - return dot_(lhs, rhs, optLhs, optRhs); - } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) { - return dot_(lhs, rhs, optLhs, optRhs); - } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) { - return dot_(rhs, lhs, 
optRhs, optLhs); - } else { - return dot_(lhs, rhs, optLhs, optRhs); - } + const Array lhs_ = (optLhs == AF_MAT_NONE ? lhs : conj(lhs)); + const Array rhs_ = (optRhs == AF_MAT_NONE ? rhs : conj(rhs)); + + const Array temp = arithOp(lhs_, rhs_, lhs_.dims()); + return reduce(temp, 0, false, 0); } template diff --git a/src/backend/cuda/complex.hpp b/src/backend/cuda/complex.hpp index 82304b9a22..b7de74a7de 100644 --- a/src/backend/cuda/complex.hpp +++ b/src/backend/cuda/complex.hpp @@ -17,25 +17,25 @@ namespace cuda { - template static const std::string cplx_name() { return "@___noop"; } - template<> STATIC_ const std::string cplx_name() { return cuMangledName("___cplx"); } - template<> STATIC_ const std::string cplx_name() { return cuMangledName("___cplx"); } + template static const std::string cplx_name() { return cuMangledName("___noop"); } + template<> STATIC_ const std::string cplx_name() { return cuMangledName("___cplx"); } + template<> STATIC_ const std::string cplx_name() { return cuMangledName("___cplx"); } - template static const std::string real_name() { return "@___noop"; } + template static const std::string real_name() { return cuMangledName("___noop"); } template<> STATIC_ const std::string real_name() { return cuMangledName("___real"); } template<> STATIC_ const std::string real_name() { return cuMangledName("___real"); } - template static const std::string imag_name() { return "@___noop"; } + template static const std::string imag_name() { return cuMangledName("___noop"); } template<> STATIC_ const std::string imag_name() { return cuMangledName("___imag"); } template<> STATIC_ const std::string imag_name() { return cuMangledName("___imag"); } - template static const std::string abs_name() { return "@___noop"; } - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } - template<> STATIC_ const std::string abs_name() { return 
cuMangledName("___abs"); } - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } + template static const std::string abs_name() { return cuMangledName("___noop"); } + template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } + template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } + template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } + template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } - template static const std::string conj_name() { return "@___noop"; } + template static const std::string conj_name() { return cuMangledName("___noop"); } template<> STATIC_ const std::string conj_name() { return cuMangledName("___conj"); } template<> STATIC_ const std::string conj_name() { return cuMangledName("___conj"); } diff --git a/src/backend/cuda/copy.cu b/src/backend/cuda/copy.cu index 90f9970239..35e5c83178 100644 --- a/src/backend/cuda/copy.cu +++ b/src/backend/cuda/copy.cu @@ -23,12 +23,12 @@ namespace cuda void copyData(T *data, const Array &A) { // FIXME: Merge this with copyArray - evalArray(A); + A.eval(); Array out = A; const T *ptr = NULL; - if (A.isOwner() || // No offsets, No strides + if (A.isLinear() || // No offsets, No strides A.ndims() == 1 // Simple offset, no strides. 
) { @@ -71,7 +71,6 @@ namespace cuda ARG_ASSERT(1, (in.ndims() == dims.ndims())); Array ret = createEmptyArray(dims); kernel::copy(ret, in, in.ndims(), default_value, factor); - CUDA_CHECK(cudaDeviceSynchronize()); return ret; } diff --git a/src/backend/cuda/cpu_lapack/lapack_helper.hpp b/src/backend/cuda/cpu_lapack/lapack_helper.hpp index 58265871c2..b85a80b10c 100644 --- a/src/backend/cuda/cpu_lapack/lapack_helper.hpp +++ b/src/backend/cuda/cpu_lapack/lapack_helper.hpp @@ -19,17 +19,17 @@ #define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR #define LAPACK_NAME(fn) LAPACKE_##fn -#ifdef __APPLE__ -#include -#include -#undef AF_LAPACK_COL_MAJOR -#define AF_LAPACK_COL_MAJOR 0 -#else #ifdef USE_MKL -#include -#else // NETLIB LAPACKE -#include -#endif + #include +#else + #ifdef __APPLE__ + #include + #include + #undef AF_LAPACK_COL_MAJOR + #define AF_LAPACK_COL_MAJOR 0 + #else // NETLIB LAPACKE + #include + #endif #endif #endif diff --git a/src/backend/cuda/debug_cuda.hpp b/src/backend/cuda/debug_cuda.hpp index 084d12f804..f5424950dc 100644 --- a/src/backend/cuda/debug_cuda.hpp +++ b/src/backend/cuda/debug_cuda.hpp @@ -51,8 +51,12 @@ #else -#define POST_LAUNCH_CHECK() do { \ - CUDA_CHECK(cudaPeekAtLastError()); \ - } while(0) \ +#define POST_LAUNCH_CHECK() do { \ + if(cuda::synchronize_calls()) { \ + CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId()))); \ + } else { \ + CUDA_CHECK(cudaPeekAtLastError()); \ + } \ + } while(0) \ #endif diff --git a/src/backend/cuda/err_cuda.hpp b/src/backend/cuda/err_cuda.hpp index a975fb5336..dd87bdfc2b 100644 --- a/src/backend/cuda/err_cuda.hpp +++ b/src/backend/cuda/err_cuda.hpp @@ -17,22 +17,23 @@ __AF_FILENAME__, __LINE__, "CUDA"); \ } while(0) -#define CUDA_CHECK(fn) do { \ - cudaError_t _cuda_error = fn; \ - if (_cuda_error != cudaSuccess) { \ - char cuda_err_msg[1024]; \ - snprintf(cuda_err_msg, \ - sizeof(cuda_err_msg), \ - "CUDA Error (%d): %s\n", \ - (int)(_cuda_error), \ - cudaGetErrorString( \ - 
cudaGetLastError())); \ - \ - if (_cuda_error == cudaErrorMemoryAllocation) { \ - AF_ERROR(cuda_err_msg, AF_ERR_NO_MEM); \ - } else { \ - AF_ERROR(cuda_err_msg, \ - AF_ERR_INTERNAL); \ - } \ - } \ +#define CUDA_CHECK(fn) do { \ + cudaError_t _cuda_error = fn; \ + if (_cuda_error != cudaSuccess) { \ + char cuda_err_msg[1024]; \ + snprintf(cuda_err_msg, \ + sizeof(cuda_err_msg), \ + "CUDA Error (%d): %s\n", \ + (int)(_cuda_error), \ + cudaGetErrorString( \ + cudaGetLastError())); \ + \ + if (_cuda_error == cudaErrorMemoryAllocation) { \ + AF_ERROR(cuda_err_msg, AF_ERR_NO_MEM); \ + } else if (_cuda_error == cudaErrorDevicesUnavailable) {\ + AF_ERROR(cuda_err_msg, AF_ERR_DRIVER); \ + } else { \ + AF_ERROR(cuda_err_msg, AF_ERR_INTERNAL); \ + } \ + } \ } while(0) diff --git a/src/backend/cuda/interopManager.cu b/src/backend/cuda/interopManager.cu index b492a5ee1d..a6e2fcf9bd 100644 --- a/src/backend/cuda/interopManager.cu +++ b/src/backend/cuda/interopManager.cu @@ -14,6 +14,7 @@ #include #include +#include #include namespace cuda @@ -36,10 +37,10 @@ InteropManager::~InteropManager() } } catch (AfError &ex) { - const char* perr = getenv("AF_PRINT_ERRORS"); - - if(perr && perr[0] != '0') { - fprintf(stderr, "%s\n", ex.what()); + std::string perr = getEnvVar("AF_PRINT_ERRORS"); + if(!perr.empty()) { + if(perr != "0") + fprintf(stderr, "%s\n", ex.what()); } } } diff --git a/src/backend/cuda/kernel/fast_pyramid.hpp b/src/backend/cuda/kernel/fast_pyramid.hpp index 61a9c7ac32..d2e5903788 100644 --- a/src/backend/cuda/kernel/fast_pyramid.hpp +++ b/src/backend/cuda/kernel/fast_pyramid.hpp @@ -65,7 +65,11 @@ void fast_pyramid(std::vector& feat_pyr, lvl_best[max_levels-1] = max_feat - feat_sum; // Hold multi-scale image pyramids - img_pyr.reserve(max_levels); + static const dim4 dims0; + static const CParam emptyCParam(NULL, dims0.get(), dims0.get()); + // Need to do this as CParam does not have a default constructor + // And resize needs a default constructor or default value 
prior to C++11 + img_pyr.resize(max_levels, emptyCParam); // Create multi-scale image pyramid for (unsigned i = 0; i < max_levels; i++) { diff --git a/src/backend/cuda/kernel/random.hpp b/src/backend/cuda/kernel/random.hpp index 4d960ae46b..96cf098c03 100644 --- a/src/backend/cuda/kernel/random.hpp +++ b/src/backend/cuda/kernel/random.hpp @@ -49,8 +49,18 @@ namespace kernel ~curandStateManager() { - //if(_state != NULL) memFree((char*)_state); - if(_state != NULL) CUDA_CHECK(cudaFree(_state)); + try { + if (_state != NULL) { + cudaError_t err = cudaFree(_state); + if (err != cudaErrorCudartUnloading) { + CUDA_CHECK(err); + } + } + } catch (AfError err) { + if (err.getError() != AF_ERR_DRIVER) { // Can happen from cudaErrorDevicesUnavailable + throw err; + } + } } unsigned long long getSeed() const @@ -69,7 +79,6 @@ namespace kernel if(_state) return _state; - //_state = (curandState_t*)memAlloc(BLOCKS * THREADS * sizeof(curandState_t)); CUDA_CHECK(cudaMalloc((void **)&_state, BLOCKS * THREADS * sizeof(curandState_t))); this->resetSeed(); return _state; diff --git a/src/backend/cuda/kernel/rotate.hpp b/src/backend/cuda/kernel/rotate.hpp index d63f010c3b..3cea7f2698 100644 --- a/src/backend/cuda/kernel/rotate.hpp +++ b/src/backend/cuda/kernel/rotate.hpp @@ -60,11 +60,11 @@ namespace cuda switch(method) { case AF_INTERP_NEAREST: - transform_n(optr, out, iptr, in, t.tmat, xx, yy, limages); break; + transform_n(optr, out, iptr, in, t.tmat, xx, yy, limages, false); break; case AF_INTERP_BILINEAR: - transform_b(optr, out, iptr, in, t.tmat, xx, yy, limages); break; + transform_b(optr, out, iptr, in, t.tmat, xx, yy, limages, false); break; case AF_INTERP_LOWER: - transform_l(optr, out, iptr, in, t.tmat, xx, yy, limages); break; + transform_l(optr, out, iptr, in, t.tmat, xx, yy, limages, false); break; default: break; } } diff --git a/src/backend/cuda/kernel/select.hpp b/src/backend/cuda/kernel/select.hpp index ab5bf2da7b..ea242e45dd 100644 --- 
a/src/backend/cuda/kernel/select.hpp +++ b/src/backend/cuda/kernel/select.hpp @@ -41,7 +41,7 @@ namespace cuda const int idw = blockIdx.y / blk_y; const int blockIdx_x = blockIdx.x - idz * blk_x; - const int blockIdx_y = blockIdx.y - idz * blk_y; + const int blockIdx_y = blockIdx.y - idw * blk_y; const int idx = blockIdx_x * blockDim.x + threadIdx.x; const int idy = blockIdx_y * blockDim.y + threadIdx.y; @@ -110,7 +110,7 @@ namespace cuda const int idw = blockIdx.y / blk_y; const int blockIdx_x = blockIdx.x - idz * blk_x; - const int blockIdx_y = blockIdx.y - idz * blk_y; + const int blockIdx_y = blockIdx.y - idw * blk_y; const int idx = blockIdx_x * blockDim.x + threadIdx.x; const int idy = blockIdx_y * blockDim.y + threadIdx.y; diff --git a/src/backend/cuda/kernel/transform.hpp b/src/backend/cuda/kernel/transform.hpp index 07be0a35b3..599e62cf9d 100644 --- a/src/backend/cuda/kernel/transform.hpp +++ b/src/backend/cuda/kernel/transform.hpp @@ -24,21 +24,42 @@ namespace cuda // Used for batching images static const unsigned TI = 4; - __constant__ float c_tmat[6 * 256]; + __constant__ float c_tmat[9 * 256]; template __host__ __device__ - void calc_affine_inverse(T *txo, const T *txi) + void calc_transf_inverse(T *txo, const T *txi, const bool perspective) { - T det = txi[0]*txi[4] - txi[1]*txi[3]; - - txo[0] = txi[4] / det; - txo[1] = txi[3] / det; - txo[3] = txi[1] / det; - txo[4] = txi[0] / det; - - txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; - txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; + if (perspective) { + txo[0] = txi[4]*txi[8] - txi[5]*txi[7]; + txo[1] = -(txi[1]*txi[8] - txi[2]*txi[7]); + txo[2] = txi[1]*txi[5] - txi[2]*txi[4]; + + txo[3] = -(txi[3]*txi[8] - txi[5]*txi[6]); + txo[4] = txi[0]*txi[8] - txi[2]*txi[6]; + txo[5] = -(txi[0]*txi[5] - txi[2]*txi[3]); + + txo[6] = txi[3]*txi[7] - txi[4]*txi[6]; + txo[7] = -(txi[0]*txi[7] - txi[1]*txi[6]); + txo[8] = txi[0]*txi[4] - txi[1]*txi[3]; + + T det = txi[0]*txo[0] + txi[1]*txo[3] + txi[2]*txo[6]; + + 
txo[0] /= det; txo[1] /= det; txo[2] /= det; + txo[3] /= det; txo[4] /= det; txo[5] /= det; + txo[6] /= det; txo[7] /= det; txo[8] /= det; + } + else { + T det = txi[0]*txi[4] - txi[1]*txi[3]; + + txo[0] = txi[4] / det; + txo[1] = txi[3] / det; + txo[3] = txi[1] / det; + txo[4] = txi[0] / det; + + txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; + txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; + } } /////////////////////////////////////////////////////////////////////////// @@ -47,7 +68,8 @@ namespace cuda template __global__ static void transform_kernel(Param out, CParam in, const int nimages, - const int ntransforms, const int blocksXPerImage) + const int ntransforms, const int blocksXPerImage, + const int transf_len, const bool perspective) { // Compute which image set const int setId = blockIdx.x / blocksXPerImage; @@ -77,30 +99,32 @@ namespace cuda const T *iptr = in.ptr + setId * nimages * in.strides[2]; // Transform is in constant memory. - const float *tmat_ptr = c_tmat + t_idx * 6; - float tmat[6]; + const float *tmat_ptr = c_tmat + t_idx * transf_len; + float* tmat = new float[transf_len]; // We expect a inverse transform matrix by default // If it is an forward transform, then we need its inverse if(inverse) { - #pragma unroll - for(int i = 0; i < 6; i++) + #pragma unroll 3 + for(int i = 0; i < transf_len; i++) tmat[i] = tmat_ptr[i]; } else { - calc_affine_inverse(tmat, tmat_ptr); + calc_transf_inverse(tmat, tmat_ptr, perspective); } if (xido >= out.dims[0] && yido >= out.dims[1]) return; switch(method) { case AF_INTERP_NEAREST: - transform_n(optr, out, iptr, in, tmat, xido, yido, limages); break; + transform_n(optr, out, iptr, in, tmat, xido, yido, limages, perspective); break; case AF_INTERP_BILINEAR: - transform_b(optr, out, iptr, in, tmat, xido, yido, limages); break; + transform_b(optr, out, iptr, in, tmat, xido, yido, limages, perspective); break; case AF_INTERP_LOWER: - transform_l(optr, out, iptr, in, tmat, xido, yido, limages); break; + 
transform_l(optr, out, iptr, in, tmat, xido, yido, limages, perspective); break; default: break; } + + delete[] tmat; } /////////////////////////////////////////////////////////////////////////// @@ -108,15 +132,18 @@ namespace cuda /////////////////////////////////////////////////////////////////////////// template void transform(Param out, CParam in, CParam tf, - const bool inverse) + const bool inverse, const bool perspective) { int nimages = in.dims[2]; // Multiplied in src/backend/transform.cpp const int ntransforms = out.dims[2] / in.dims[2]; + + const int transf_len = (perspective) ? 9 : 6; + // Copy transform to constant memory. - CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0, - cudaMemcpyDeviceToDevice, + CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr, ntransforms * transf_len * sizeof(float), + 0, cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId()))); dim3 threads(TX, TY, 1); @@ -133,10 +160,12 @@ namespace cuda if(inverse) { CUDA_LAUNCH((transform_kernel), blocks, threads, - out, in, nimages, ntransforms, blocksXPerImage); + out, in, nimages, ntransforms, blocksXPerImage, + transf_len, perspective); } else { CUDA_LAUNCH((transform_kernel), blocks, threads, - out, in, nimages, ntransforms, blocksXPerImage); + out, in, nimages, ntransforms, blocksXPerImage, + transf_len, perspective); } POST_LAUNCH_CHECK(); } diff --git a/src/backend/cuda/kernel/transform_interp.hpp b/src/backend/cuda/kernel/transform_interp.hpp index 5a88fc4d76..1554b8ec62 100644 --- a/src/backend/cuda/kernel/transform_interp.hpp +++ b/src/backend/cuda/kernel/transform_interp.hpp @@ -42,15 +42,28 @@ namespace cuda template __device__ void transform_n(T *optr, Param out, const T *iptr, CParam in, const float *tmat, - const int xido, const int yido, const int nimages) + const int xido, const int yido, const int nimages, + const bool perspective) { // Compute input index - int xidi = round(xido * tmat[0] + int xidi = 0, yidi = 0; + 
if (perspective) { + const float W = xido * tmat[6] + yido * tmat[7] + tmat[8]; + xidi = round((xido * tmat[0] + + yido * tmat[1] + + tmat[2]) / W); + yidi = round((xido * tmat[3] + + yido * tmat[4] + + tmat[5]) / W); + } + else { + xidi = round(xido * tmat[0] + yido * tmat[1] + tmat[2]); - int yidi = round(xido * tmat[3] + yidi = round(xido * tmat[3] + yido * tmat[4] + tmat[5]); + } // Makes scale give same output as resize // But fails rotate tests @@ -76,17 +89,30 @@ namespace cuda template __device__ void transform_b(T *optr, Param out, const T *iptr, CParam in, const float *tmat, - const int xido, const int yido, const int nimages) + const int xido, const int yido, const int nimages, + const bool perspective) { const int loco = (yido * out.strides[1] + xido); // Compute input index - const float xidi = xido * tmat[0] - + yido * tmat[1] - + tmat[2]; - const float yidi = xido * tmat[3] - + yido * tmat[4] - + tmat[5]; + float xidi = 0.0f, yidi = 0.0f; + if (perspective) { + const float W = xido * tmat[6] + yido * tmat[7] + tmat[8]; + xidi = (xido * tmat[0] + + yido * tmat[1] + + tmat[2]) / W; + yidi = (xido * tmat[3] + + yido * tmat[4] + + tmat[5]) / W; + } + else { + xidi = xido * tmat[0] + + yido * tmat[1] + + tmat[2]; + yidi = xido * tmat[3] + + yido * tmat[4] + + tmat[5]; + } if (xidi < -0.0001 || yidi < -0.0001 || in.dims[0] < xidi || in.dims[1] < yidi) { for(int i = 0; i < nimages; i++) { @@ -133,15 +159,28 @@ namespace cuda template __device__ void transform_l(T *optr, Param out, const T *iptr, CParam in, const float *tmat, - const int xido, const int yido, const int nimages) + const int xido, const int yido, const int nimages, + const bool perspective) { // Compute input index - int xidi = floor(xido * tmat[0] + int xidi = 0, yidi = 0; + if (perspective) { + const float W = xido * tmat[6] + yido * tmat[7] + tmat[8]; + xidi = floor((xido * tmat[0] + + yido * tmat[1] + + tmat[2]) / W); + yidi = floor((xido * tmat[3] + + yido * tmat[4] + + tmat[5]) / W); + } 
+ else { + xidi = floor(xido * tmat[0] + yido * tmat[1] + tmat[2]); - int yidi = floor(xido * tmat[3] + yidi = floor(xido * tmat[3] + yido * tmat[4] + tmat[5]); + } // Makes scale give same output as resize // But fails rotate tests diff --git a/src/backend/cuda/lu.cu b/src/backend/cuda/lu.cu index 2a45d4b9f5..ce0b545a84 100644 --- a/src/backend/cuda/lu.cu +++ b/src/backend/cuda/lu.cu @@ -156,6 +156,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) return pivot; } +bool isLAPACKAvailable() +{ + return true; +} + #define INSTANTIATE_LU(T) \ template Array lu_inplace(Array &in, const bool convert_pivot); \ template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); @@ -186,6 +191,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) return cpu::lu_inplace(in, convert_pivot); } +bool isLAPACKAvailable() +{ + return true; +} + #define INSTANTIATE_LU(T) \ template Array lu_inplace(Array &in, const bool convert_pivot); \ template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); @@ -213,6 +223,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) AF_ERR_NOT_CONFIGURED); } +bool isLAPACKAvailable() +{ + return false; +} + #define INSTANTIATE_LU(T) \ template Array lu_inplace(Array &in, const bool convert_pivot); \ template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); diff --git a/src/backend/cuda/lu.hpp b/src/backend/cuda/lu.hpp index 0753129d6b..acf9dbaad7 100644 --- a/src/backend/cuda/lu.hpp +++ b/src/backend/cuda/lu.hpp @@ -17,4 +17,6 @@ namespace cuda template Array lu_inplace(Array &in, const bool convert_pivot = true); + + bool isLAPACKAvailable(); } diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index 9b3d731b4b..51eb507320 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -12,367 +12,233 @@ #include #include #include +#include #include +#include +#include +#include #include #include #include +#include -namespace cuda -{ - static 
size_t memory_resolution = 1024; //1KB - - void setMemStepSize(size_t step_bytes) - { - memory_resolution = step_bytes; - } - - size_t getMemStepSize(void) - { - return memory_resolution; - } - - template - static void cudaFreeWrapper(T *ptr) - { - cudaError_t err = cudaFree(ptr); - if (err != cudaErrorCudartUnloading) // see issue #167 - CUDA_CHECK(err); - } - - template - static void pinnedFreeWrapper(T *ptr) - { - cudaError_t err = cudaFreeHost(ptr); - if (err != cudaErrorCudartUnloading) // see issue #167 - CUDA_CHECK(err); - } - -#ifdef AF_CUDA_MEM_DEBUG - template - T* memAlloc(const size_t &elements) - { - T* ptr = NULL; - CUDA_CHECK(cudaMalloc(&ptr, elements * sizeof(T))); - return ptr; - } - - template - void memFree(T *ptr) - { - cudaFreeWrapper(ptr); // Free it because we are not sure what the size is - } - - template - void memPop(const T *ptr) - { - return; - } - - template - void memPush(const T *ptr) - { - return; - } - - template - T* pinnedAlloc(const size_t &elements) - { - T* ptr = NULL; - CUDA_CHECK(cudaMallocHost((void **)(&ptr), elements * sizeof(T))); - return (T*)ptr; - } - - template - void pinnedFree(T *ptr) - { - pinnedFreeWrapper(ptr); // Free it because we are not sure what the size is - } - - void garbageCollect() - { - } +#ifndef AF_MEM_DEBUG +#define AF_MEM_DEBUG 0 +#endif - void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, - size_t *lock_bytes, size_t *lock_buffers) - { - } +#ifndef AF_CUDA_MEM_DEBUG +#define AF_CUDA_MEM_DEBUG 0 +#endif -#else +namespace cuda +{ - // Manager Class - // Dummy used to call garbage collection at the end of the program - class Manager +class MemoryManager : public common::MemoryManager +{ + int getActiveDeviceId(); + size_t getMaxMemorySize(int id); +public: + MemoryManager(); + void *nativeAlloc(const size_t bytes); + void nativeFree(void *ptr); + ~MemoryManager() { - public: - static bool initialized; - Manager() - { - initialized = true; - } - - ~Manager() - { - // Destructors should 
not through exceptions + common::lock_guard_t lock(this->memory_mutex); + for (int n = 0; n < getDeviceCount(); n++) { try { - for(int i = 0; i < getDeviceCount(); i++) { - setDevice(i); - garbageCollect(); - } - pinnedGarbageCollect(); - - } catch (AfError &ex) { - - const char* perr = getenv("AF_PRINT_ERRORS"); - if(perr && perr[0] != '0') { - fprintf(stderr, "%s\n", ex.what()); - } + cuda::setDevice(n); + this->garbageCollect(); + } catch(AfError err) { + continue; // Do not throw any errors while shutting down } } - }; - - bool Manager::initialized = false; - - static void managerInit() - { - if(Manager::initialized == false) - static Manager pm = Manager(); } +}; - typedef struct - { - bool is_free; - bool is_unlinked; - size_t bytes; - } mem_info; - - static size_t used_bytes[DeviceManager::MAX_DEVICES] = {0}; - static size_t used_buffers[DeviceManager::MAX_DEVICES] = {0}; - static size_t total_bytes[DeviceManager::MAX_DEVICES] = {0}; - typedef std::map mem_t; - typedef mem_t::iterator mem_iter; - - mem_t memory_maps[DeviceManager::MAX_DEVICES]; - - void garbageCollect() +// CUDA Pinned Memory does not depend on device +// So we pass 1 as numDevices to the constructor so that it creates 1 vector +// of memory_info +// When allocating and freeing, it doesn't really matter which device is active +class MemoryManagerPinned : public common::MemoryManager +{ + int getActiveDeviceId(); + size_t getMaxMemorySize(int id); +public: + MemoryManagerPinned(); + void *nativeAlloc(const size_t bytes); + void nativeFree(void *ptr); + ~MemoryManagerPinned() { - int n = getActiveDeviceId(); - - for(mem_iter iter = memory_maps[n].begin(); - iter != memory_maps[n].end(); ++iter) { - - if ((iter->second).is_free) { - - if (!(iter->second).is_unlinked) { - cudaFreeWrapper(iter->first); - total_bytes[n] -= iter->second.bytes; - } - } - } - - mem_iter memory_curr = memory_maps[n].begin(); - mem_iter memory_end = memory_maps[n].end(); - - while(memory_curr != memory_end) { - if 
(memory_curr->second.is_free && !memory_curr->second.is_unlinked) { - memory_maps[n].erase(memory_curr++); - } else { - ++memory_curr; - } - } + common::lock_guard_t lock(this->memory_mutex); + this->garbageCollect(); } +}; - template - T* memAlloc(const size_t &elements) - { - managerInit(); - int n = getActiveDeviceId(); - T* ptr = NULL; - size_t alloc_bytes = divup(sizeof(T) * elements, memory_resolution) * memory_resolution; - - if (elements > 0) { - - // FIXME: Add better checks for garbage collection - // Perhaps look at total memory available as a metric - if (memory_maps[n].size() >= MAX_BUFFERS || used_bytes[n] >= MAX_BYTES) { - garbageCollect(); - } - - for(mem_iter iter = memory_maps[n].begin(); - iter != memory_maps[n].end(); ++iter) { - - mem_info info = iter->second; +int MemoryManager::getActiveDeviceId() +{ + return cuda::getActiveDeviceId(); +} - if ( info.is_free && - !info.is_unlinked && - info.bytes == alloc_bytes) { +size_t MemoryManager::getMaxMemorySize(int id) +{ + return cuda::getDeviceMemorySize(id); +} - iter->second.is_free = false; - used_bytes[n] += alloc_bytes; - used_buffers[n]++; - return (T *)iter->first; - } - } +MemoryManager::MemoryManager() : + common::MemoryManager(getDeviceCount(), common::MAX_BUFFERS, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) +{ + this->setMaxMemorySize(); +} - // Perform garbage collection if memory can not be allocated - if (cudaMalloc((void **)&ptr, alloc_bytes) != cudaSuccess) { - garbageCollect(); - CUDA_CHECK(cudaMalloc((void **)(&ptr), alloc_bytes)); - } +void *MemoryManager::nativeAlloc(const size_t bytes) +{ + void *ptr = NULL; + CUDA_CHECK(cudaMalloc(&ptr, bytes)); + return ptr; +} - mem_info info = {false, false, alloc_bytes}; - memory_maps[n][ptr] = info; - used_bytes[n] += alloc_bytes; - used_buffers[n]++; - total_bytes[n] += alloc_bytes; - } - return ptr; +void MemoryManager::nativeFree(void *ptr) +{ + cudaError_t err = cudaFree(ptr); + if (err != cudaErrorCudartUnloading) { + CUDA_CHECK(err); } +} - 
template - void memFree(T *ptr) - { - int n = getActiveDeviceId(); - mem_iter iter = memory_maps[n].find((void *)ptr); - - if (iter != memory_maps[n].end()) { +static MemoryManager &getMemoryManager() +{ + static MemoryManager instance; + return instance; +} - iter->second.is_free = true; - if ((iter->second).is_unlinked) return; +int MemoryManagerPinned::getActiveDeviceId() +{ + return 0; // pinned uses a single vector +} - used_bytes[n] -= iter->second.bytes; - used_buffers[n]--; +size_t MemoryManagerPinned::getMaxMemorySize(int id) +{ + return cuda::getHostMemorySize(); +} - } else { - cudaFreeWrapper(ptr); // Free it because we are not sure what the size is - } - } +MemoryManagerPinned::MemoryManagerPinned() : + common::MemoryManager(1, common::MAX_BUFFERS, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) +{ + this->setMaxMemorySize(); +} - template - void memPop(const T *ptr) - { - int n = getActiveDeviceId(); - mem_iter iter = memory_maps[n].find((void *)ptr); +void *MemoryManagerPinned::nativeAlloc(const size_t bytes) +{ + void *ptr; + CUDA_CHECK(cudaMallocHost(&ptr, bytes)); + return ptr; +} - if (iter != memory_maps[n].end()) { - iter->second.is_unlinked = true; - } else { +void MemoryManagerPinned::nativeFree(void *ptr) +{ + cudaError_t err = cudaFreeHost(ptr); + if (err != cudaErrorCudartUnloading) { + CUDA_CHECK(err); + } +} - mem_info info = { false, - true, - 100 }; //This number is not relevant +static MemoryManagerPinned &getMemoryManagerPinned() +{ + static MemoryManagerPinned instance; + return instance; +} - memory_maps[n][(void *)ptr] = info; - } - } +void setMemStepSize(size_t step_bytes) +{ + getMemoryManager().setMemStepSize(step_bytes); +} - template - void memPush(const T *ptr) - { - int n = getActiveDeviceId(); - mem_iter iter = memory_maps[n].find((void *)ptr); - if (iter != memory_maps[n].end()) { - iter->second.is_unlinked = false; - } - } +size_t getMemStepSize(void) +{ + return getMemoryManager().getMemStepSize(); +} - void deviceMemoryInfo(size_t 
*alloc_bytes, size_t *alloc_buffers, - size_t *lock_bytes, size_t *lock_buffers) - { - int n = getActiveDeviceId(); - if (alloc_bytes ) *alloc_bytes = total_bytes[n]; - if (alloc_buffers ) *alloc_buffers = memory_maps[n].size(); - if (lock_bytes ) *lock_bytes = used_bytes[n]; - if (lock_buffers ) *lock_buffers = used_buffers[n]; - } +size_t getMaxBytes() +{ + return getMemoryManager().getMaxBytes(); +} - ////////////////////////////////////////////////////////////////////////////// - mem_t pinned_maps; - static size_t pinned_used_bytes = 0; +unsigned getMaxBuffers() +{ + return getMemoryManager().getMaxBuffers(); +} - void pinnedGarbageCollect() - { - for(mem_iter iter = pinned_maps.begin(); iter != pinned_maps.end(); ++iter) { - if ((iter->second).is_free) { - pinnedFreeWrapper(iter->first); - } - } +void garbageCollect() +{ + getMemoryManager().garbageCollect(); +} - mem_iter memory_curr = pinned_maps.begin(); - mem_iter memory_end = pinned_maps.end(); +void printMemInfo(const char *msg, const int device) +{ + getMemoryManager().printInfo(msg, device); +} - while(memory_curr != memory_end) { - if (memory_curr->second.is_free) { - pinned_maps.erase(memory_curr++); - } else { - ++memory_curr; - } - } - } +template +T* memAlloc(const size_t &elements) +{ + return (T *)getMemoryManager().alloc(elements * sizeof(T), false); +} - template - T* pinnedAlloc(const size_t &elements) - { - managerInit(); - T* ptr = NULL; - // Allocate the higher megabyte. Overhead of creating pinned memory is - // more so we want more resuable memory. 
- size_t alloc_bytes = divup(sizeof(T) * elements, 1048576) * 1048576; - - if (elements > 0) { - - // FIXME: Add better checks for garbage collection - // Perhaps look at total memory available as a metric - if (pinned_maps.size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) { - pinnedGarbageCollect(); - } +void* memAllocUser(const size_t &bytes) +{ + return getMemoryManager().alloc(bytes, true); +} +template +void memFree(T *ptr) +{ + return getMemoryManager().unlock((void *)ptr, false); +} - for(mem_iter iter = pinned_maps.begin(); - iter != pinned_maps.end(); ++iter) { +void memFreeUser(void *ptr) +{ + getMemoryManager().unlock((void *)ptr, true); +} - mem_info info = iter->second; - if (info.is_free && info.bytes == alloc_bytes) { - iter->second.is_free = false; - pinned_used_bytes += alloc_bytes; - return (T *)iter->first; - } - } +void memLock(const void *ptr) +{ + getMemoryManager().userLock((void *)ptr); +} - // Perform garbage collection if memory can not be allocated - if (cudaMallocHost((void **)&ptr, alloc_bytes) != cudaSuccess) { - pinnedGarbageCollect(); - CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes)); - } +void memUnlock(const void *ptr) +{ + getMemoryManager().userUnlock((void *)ptr); +} - mem_info info = {false, false, alloc_bytes}; - pinned_maps[ptr] = info; - pinned_used_bytes += alloc_bytes; - } - return (T*)ptr; - } +void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, + size_t *lock_bytes, size_t *lock_buffers) +{ + getMemoryManager().bufferInfo(alloc_bytes, alloc_buffers, + lock_bytes, lock_buffers); +} - template - void pinnedFree(T *ptr) - { - mem_iter iter = pinned_maps.find((void *)ptr); +template +T* pinnedAlloc(const size_t &elements) +{ + return (T *)getMemoryManagerPinned().alloc(elements * sizeof(T), false); +} - if (iter != pinned_maps.end()) { - iter->second.is_free = true; - pinned_used_bytes -= iter->second.bytes; - } else { - pinnedFreeWrapper(ptr); // Free it because we are not sure what the size is 
- } - } +template +void pinnedFree(T* ptr) +{ + return getMemoryManagerPinned().unlock((void *)ptr, false); +} -#endif +bool checkMemoryLimit() +{ + return getMemoryManager().checkMemoryLimit(); +} -#define INSTANTIATE(T) \ - template T* memAlloc(const size_t &elements); \ - template void memFree(T* ptr); \ - template void memPop(const T* ptr); \ - template void memPush(const T* ptr); \ - template T* pinnedAlloc(const size_t &elements); \ - template void pinnedFree(T* ptr); \ +#define INSTANTIATE(T) \ + template T* memAlloc(const size_t &elements); \ + template void memFree(T* ptr); \ + template T* pinnedAlloc(const size_t &elements); \ + template void pinnedFree(T* ptr); \ INSTANTIATE(float) INSTANTIATE(cfloat) diff --git a/src/backend/cuda/memory.hpp b/src/backend/cuda/memory.hpp index 2e5fef2593..80478c13dc 100644 --- a/src/backend/cuda/memory.hpp +++ b/src/backend/cuda/memory.hpp @@ -9,24 +9,36 @@ #pragma once #include + namespace cuda { template T* memAlloc(const size_t &elements); + void *memAllocUser(const size_t &bytes); + + // Need these as 2 separate function and not a default argument + // This is because it is used as the deleter in shared pointer + // which cannot support default arguments template void memFree(T* ptr); - template void memPop(const T *ptr); - template void memPush(const T *ptr); + void memFreeUser(void* ptr); + + void memLock(const void *ptr); + void memUnlock(const void *ptr); template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); - static const unsigned MAX_BUFFERS = 100; - static const unsigned MAX_BYTES = (1 << 30); + size_t getMaxBytes(); + unsigned getMaxBuffers(); void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); void garbageCollect(); void pinnedGarbageCollect(); + void printMemInfo(const char *msg, const int device); + void setMemStepSize(size_t step_bytes); size_t getMemStepSize(void); + + bool checkMemoryLimit(); } diff --git 
a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp index 38e8a04d16..10cfdc886c 100644 --- a/src/backend/cuda/platform.cpp +++ b/src/backend/cuda/platform.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,8 @@ #include #include #include +#include +#include using namespace std; @@ -60,13 +63,13 @@ static inline int compute2cores(int major, int minor) return 0; } -// compare two cards based on (in order): -// 1. flops (theoretical) -// 2. total memory - +// Return true if greater, false if lesser. +// if equal, it continues to next comparison #define COMPARE(a,b,f) do { \ - return ((a)->f >= (b)->f); \ - } while (0); + if ((a)->f > (b)->f) return true; \ + if ((a)->f < (b)->f) return false; \ + break; \ + } while (0) static inline bool card_compare_compute(const cudaDevice_t &l, const cudaDevice_t &r) @@ -79,7 +82,7 @@ static inline bool card_compare_compute(const cudaDevice_t &l, const cudaDevice_ COMPARE(lc, rc, flops); COMPARE(lc, rc, prop.totalGlobalMem); COMPARE(lc, rc, nativeId); - return 0; + return false; } static inline bool card_compare_flops(const cudaDevice_t &l, const cudaDevice_t &r) @@ -92,7 +95,7 @@ static inline bool card_compare_flops(const cudaDevice_t &l, const cudaDevice_t COMPARE(lc, rc, prop.major); COMPARE(lc, rc, prop.minor); COMPARE(lc, rc, nativeId); - return 0; + return false; } static inline bool card_compare_mem(const cudaDevice_t &l, const cudaDevice_t &r) @@ -105,7 +108,7 @@ static inline bool card_compare_mem(const cudaDevice_t &l, const cudaDevice_t &r COMPARE(lc, rc, prop.major); COMPARE(lc, rc, prop.minor); COMPARE(lc, rc, nativeId); - return 0; + return false; } static inline bool card_compare_num(const cudaDevice_t &l, const cudaDevice_t &r) @@ -114,7 +117,7 @@ static inline bool card_compare_num(const cudaDevice_t &l, const cudaDevice_t &r const cudaDevice_t *rc = &r; COMPARE(lc, rc, nativeId); - return 0; + return false; } static const std::string get_system(void) @@ 
-147,18 +150,6 @@ int getBackend() return AF_BACKEND_CUDA; } -string getInfo() -{ - ostringstream info; - info << "ArrayFire v" << AF_VERSION - << " (CUDA, " << get_system() << ", build " << AF_REVISION << ")" << std::endl; - info << getPlatformInfo(); - for (int i = 0; i < getDeviceCount(); ++i) { - info << getDeviceInfo(i); - } - return info.str(); -} - string getDeviceInfo(int device) { cudaDeviceProp dev = getDeviceProp(device); @@ -183,6 +174,18 @@ string getDeviceInfo(int device) return info; } +string getDeviceInfo() +{ + ostringstream info; + info << "ArrayFire v" << AF_VERSION + << " (CUDA, " << get_system() << ", build " << AF_REVISION << ")" << std::endl; + info << getPlatformInfo(); + for (int i = 0; i < getDeviceCount(); ++i) { + info << getDeviceInfo(i); + } + return info.str(); +} + string getPlatformInfo() { string driverVersion = getDriverVersion(); @@ -211,7 +214,7 @@ void devprop(char* d_name, char* d_platform, char *d_toolkit, char* d_compute) cudaDeviceProp dev = getDeviceProp(getActiveDeviceId()); // Name - snprintf(d_name, 32, "%s", dev.name); + snprintf(d_name, 64, "%s", dev.name); //Platform std::string cudaRuntime = getCUDARuntimeVersion(); @@ -222,7 +225,7 @@ void devprop(char* d_name, char* d_platform, char *d_toolkit, char* d_compute) snprintf(d_compute, 10, "%d.%d", dev.major, dev.minor); // Sanitize input - for (int i = 0; i < 31; i++) { + for (int i = 0; i < 63; i++) { if (d_name[i] == ' ') { if (d_name[i + 1] == 0 || d_name[i + 1] == ' ') d_name[i] = 0; else d_name[i] = '_'; @@ -258,6 +261,23 @@ string getCUDARuntimeVersion() } +unsigned getMaxJitSize() +{ + const int MAX_JIT_LEN = 20; + + static int length = 0; + if (length == 0) { + std::string env_var = getEnvVar("AF_CUDA_MAX_JIT_LEN"); + if (!env_var.empty()) { + length = std::stoi(env_var); + } else { + length = MAX_JIT_LEN; + } + } + + return length; +} + int getDeviceCount() { return DeviceManager::getInstance().nDevices; @@ -302,6 +322,16 @@ cudaStream_t getStream(int 
device) return str; } +size_t getDeviceMemorySize(int device) +{ + return getDeviceProp(device).totalGlobalMem; +} + +size_t getHostMemorySize() +{ + return common::getHostMemorySize(); +} + int setDevice(int device) { return DeviceManager::getInstance().setActiveDevice(device); @@ -347,8 +377,8 @@ DeviceManager::DeviceManager() for(int i = 0; i < (int)MAX_DEVICES; i++) streams[i] = (cudaStream_t)0; - const char* deviceENV = getenv("AF_CUDA_DEFAULT_DEVICE"); - if(!deviceENV) { + std::string deviceENV = getEnvVar("AF_CUDA_DEFAULT_DEVICE"); + if(deviceENV.empty()) { setActiveDevice(0, cuDevices[0].nativeId); } else { stringstream s(deviceENV); @@ -368,36 +398,81 @@ void DeviceManager::sortDevices(sort_mode mode) { switch(mode) { case memory : - sort(cuDevices.begin(), cuDevices.end(), card_compare_mem); + std::stable_sort(cuDevices.begin(), cuDevices.end(), card_compare_mem); break; case flops : - sort(cuDevices.begin(), cuDevices.end(), card_compare_flops); + std::stable_sort(cuDevices.begin(), cuDevices.end(), card_compare_flops); break; case compute : - sort(cuDevices.begin(), cuDevices.end(), card_compare_compute); + std::stable_sort(cuDevices.begin(), cuDevices.end(), card_compare_compute); break; case none : default : - sort(cuDevices.begin(), cuDevices.end(), card_compare_num); + std::stable_sort(cuDevices.begin(), cuDevices.end(), card_compare_num); break; } } int DeviceManager::setActiveDevice(int device, int nId) { - if(device > (int)cuDevices.size()) { - return -1; - } else { - int old = activeDev; - if(nId == -1) nId = getDeviceNativeId(device); - CUDA_CHECK(cudaSetDevice(nId)); - activeDev = device; + static bool first = true; - if(!streams[device]) { - CUDA_CHECK(cudaStreamCreate(&streams[device])); - } + int numDevices = cuDevices.size(); + + if(device > numDevices) return -1; + + int old = activeDev; + if(nId == -1) nId = getDeviceNativeId(device); + CUDA_CHECK(cudaSetDevice(nId)); + cudaError_t err = cudaSuccess; + if(!streams[device]) + err = 
cudaStreamCreate(&streams[device]); + + activeDev = device; + + if (err == cudaSuccess) return old; + + // Comes when user sets device + // If success, return. Else throw error + if (!first) { + CUDA_CHECK(err); return old; } + + // Comes only when first is true. Set it to false + first = false; + + while(true) { + // Check for errors other than DevicesUnavailable + // If success, return. Else throw error + // If DevicesUnavailable, try other devices (while loop below) + if (err != cudaErrorDevicesUnavailable) { + CUDA_CHECK(err); + activeDev = device; + return old; + } + cudaGetLastError(); // Reset error stack +#ifndef NDEBUG + printf("Warning: Device %d is unavailable. Incrementing to next device \n", device); +#endif + + // Comes here if the device is in exclusive mode or + // otherwise fails streamCreate with this error. + // All other errors will error out + device++; + if (device >= numDevices) break; + + // Can't call getNativeId here as it will cause an infinite loop with the constructor + nId = cuDevices[device].nativeId; + + CUDA_CHECK(cudaSetDevice(nId)); + err = cudaStreamCreate(&streams[device]); + } + + // If all devices fail with DevicesUnavailable, then throw this error + CUDA_CHECK(err); + + return old; } void sync(int device) @@ -408,6 +483,11 @@ void sync(int device) setDevice(currDevice); } +bool synchronize_calls() { + static bool sync = getEnvVar("AF_SYNCHRONOUS_CALLS") == "1"; + return sync; +} + } af_err afcu_get_stream(cudaStream_t* stream, int id) diff --git a/src/backend/cuda/platform.hpp b/src/backend/cuda/platform.hpp index 7b649686dc..3fcc67ea5b 100644 --- a/src/backend/cuda/platform.hpp +++ b/src/backend/cuda/platform.hpp @@ -22,8 +22,7 @@ namespace cuda int getBackend(); -std::string getInfo(); - +std::string getDeviceInfo(); std::string getDeviceInfo(int device); std::string getPlatformInfo(); @@ -32,12 +31,12 @@ std::string getDriverVersion(); std::string getCUDARuntimeVersion(); -std::string getInfo(); - bool
isDoubleSupported(int device); void devprop(char* d_name, char* d_platform, char *d_toolkit, char* d_compute); +unsigned getMaxJitSize(); + int getDeviceCount(); int getActiveDeviceId(); @@ -46,10 +45,17 @@ int getDeviceNativeId(int device); cudaStream_t getStream(int device); +size_t getDeviceMemorySize(int device); + +size_t getHostMemorySize(); + int setDevice(int device); void sync(int device); +// Returns true if the AF_SYNCHRONOUS_CALLS environment variable is set to 1 +bool synchronize_calls(); + cudaDeviceProp getDeviceProp(int device); struct cudaDevice_t { @@ -73,7 +79,7 @@ class DeviceManager friend std::string getCUDARuntimeVersion(); - friend std::string getInfo(); + friend std::string getDeviceInfo(); friend int getDeviceCount(); diff --git a/src/backend/cuda/set.cu b/src/backend/cuda/set.cu index 63501d3f2a..4629b8b3dc 100644 --- a/src/backend/cuda/set.cu +++ b/src/backend/cuda/set.cu @@ -32,7 +32,7 @@ namespace cuda Array out = copyArray(in); thrust::device_ptr out_ptr = thrust::device_pointer_cast(out.get()); - thrust::device_ptr out_ptr_end = out_ptr + out.dims()[0]; + thrust::device_ptr out_ptr_end = out_ptr + out.elements(); if(!is_sorted) THRUST_SELECT(thrust::sort, out_ptr, out_ptr_end); thrust::device_ptr out_ptr_last; @@ -55,14 +55,14 @@ namespace cuda unique_second = setUnique(second, false); } - dim_t out_size = unique_first.dims()[0] + unique_second.dims()[0]; + dim_t out_size = unique_first.elements() + unique_second.elements(); Array out = createEmptyArray(dim4(out_size)); thrust::device_ptr first_ptr = thrust::device_pointer_cast(unique_first.get()); - thrust::device_ptr first_ptr_end = first_ptr + unique_first.dims()[0]; + thrust::device_ptr first_ptr_end = first_ptr + unique_first.elements(); thrust::device_ptr second_ptr = thrust::device_pointer_cast(unique_second.get()); - thrust::device_ptr second_ptr_end = second_ptr + unique_second.dims()[0]; + thrust::device_ptr second_ptr_end = second_ptr + unique_second.elements(); 
thrust::device_ptr out_ptr = thrust::device_pointer_cast(out.get()); @@ -87,14 +87,14 @@ namespace cuda unique_second = setUnique(second, false); } - dim_t out_size = std::max(unique_first.dims()[0], unique_second.dims()[0]); + dim_t out_size = std::max(unique_first.elements(), unique_second.elements()); Array out = createEmptyArray(dim4(out_size)); thrust::device_ptr first_ptr = thrust::device_pointer_cast(unique_first.get()); - thrust::device_ptr first_ptr_end = first_ptr + unique_first.dims()[0]; + thrust::device_ptr first_ptr_end = first_ptr + unique_first.elements(); thrust::device_ptr second_ptr = thrust::device_pointer_cast(unique_second.get()); - thrust::device_ptr second_ptr_end = second_ptr + unique_second.dims()[0]; + thrust::device_ptr second_ptr_end = second_ptr + unique_second.elements(); thrust::device_ptr out_ptr = thrust::device_pointer_cast(out.get()); diff --git a/src/backend/cuda/sift.cu b/src/backend/cuda/sift.cu index f3d36d7dfb..ad668af924 100644 --- a/src/backend/cuda/sift.cu +++ b/src/backend/cuda/sift.cu @@ -15,7 +15,7 @@ #include #include -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT #include #endif @@ -34,7 +34,7 @@ unsigned sift(Array& x, Array& y, Array& score, const float img_scale, const float feature_ratio, const bool compute_GLOH) { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT const dim4 dims = in.dims(); unsigned nfeat_out; diff --git a/src/backend/cuda/transform.cu b/src/backend/cuda/transform.cu index 853617c0a4..07c312353c 100644 --- a/src/backend/cuda/transform.cu +++ b/src/backend/cuda/transform.cu @@ -16,7 +16,7 @@ namespace cuda { template Array transform(const Array &in, const Array &transform, const af::dim4 &odims, - const af_interp_type method, const bool inverse) + const af_interp_type method, const bool inverse, const bool perspective) { const af::dim4 idims = in.dims(); @@ -24,13 +24,13 @@ namespace cuda switch(method) { case AF_INTERP_NEAREST: - kernel::transform (out, in, transform, inverse); + 
kernel::transform (out, in, transform, inverse, perspective); break; case AF_INTERP_BILINEAR: - kernel::transform(out, in, transform, inverse); + kernel::transform(out, in, transform, inverse, perspective); break; case AF_INTERP_LOWER: - kernel::transform (out, in, transform, inverse); + kernel::transform (out, in, transform, inverse, perspective); break; default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); @@ -43,7 +43,7 @@ namespace cuda #define INSTANTIATE(T) \ template Array transform(const Array &in, const Array &transform, \ const af::dim4 &odims, const af_interp_type method, \ - const bool inverse); + const bool inverse, const bool perspective); INSTANTIATE(float) INSTANTIATE(double) diff --git a/src/backend/cuda/transform.hpp b/src/backend/cuda/transform.hpp index eb3d71d097..316953d614 100644 --- a/src/backend/cuda/transform.hpp +++ b/src/backend/cuda/transform.hpp @@ -14,5 +14,6 @@ namespace cuda { template Array transform(const Array &in, const Array &tf, const af::dim4 &odims, - const af_interp_type method, const bool inverse); + const af_interp_type method, const bool inverse, + const bool perspective); } diff --git a/src/api/c/dispatch.cpp b/src/backend/dispatch.cpp similarity index 100% rename from src/api/c/dispatch.cpp rename to src/backend/dispatch.cpp diff --git a/src/api/c/dispatch.hpp b/src/backend/dispatch.hpp similarity index 100% rename from src/api/c/dispatch.hpp rename to src/backend/dispatch.hpp diff --git a/src/backend/host_memory.cpp b/src/backend/host_memory.cpp new file mode 100644 index 0000000000..9b4f1e5f54 --- /dev/null +++ b/src/backend/host_memory.cpp @@ -0,0 +1,113 @@ +/* + * Author: David Robert Nadeau + * Site: http://NadeauSoftware.com/ + * License: Creative Commons Attribution 3.0 Unported License + * http://creativecommons.org/licenses/by/3.0/deed.en_US + * Source: http://nadeausoftware.com/sites/NadeauSoftware.com/files/getMemorySize.c + */ + +#include "host_memory.hpp" + +#if defined(_WIN32) +#include + 
+#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__)) +#include +#include +#include + +#if defined(BSD) +#include +#endif + +#else +#define NOMEMORYSIZE +#endif + +namespace common +{ + +#ifdef NOMEMORYSIZE +size_t getHostMemorySize() +{ + return 0L; // Can't detect +} + +#else + +/** + * Returns the size of physical memory (RAM) in bytes. + */ +size_t getHostMemorySize() +{ +#if defined(_WIN32) && (defined(__CYGWIN__) || defined(__CYGWIN32__)) + /* Cygwin under Windows. ------------------------------------ */ + /* New 64-bit MEMORYSTATUSEX isn't available. Use old 32-bit */ + MEMORYSTATUS status; + status.dwLength = sizeof(status); + GlobalMemoryStatus( &status ); + return (size_t)status.dwTotalPhys; + +#elif defined(_WIN32) + /* Windows. ------------------------------------------------- */ + /* Use new 64-bit MEMORYSTATUSEX, not old 32-bit MEMORYSTATUS */ + MEMORYSTATUSEX status; + status.dwLength = sizeof(status); + GlobalMemoryStatusEx( &status ); + return (size_t)status.ullTotalPhys; + +#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__)) + /* UNIX variants. ------------------------------------------- */ + /* Prefer sysctl() over sysconf() except sysctl() HW_REALMEM and HW_PHYSMEM */ + +#if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64)) + int mib[2]; + mib[0] = CTL_HW; +#if defined(HW_MEMSIZE) + mib[1] = HW_MEMSIZE; /* OSX. --------------------- */ +#elif defined(HW_PHYSMEM64) + mib[1] = HW_PHYSMEM64; /* NetBSD, OpenBSD. --------- */ +#endif + int64_t size = 0; /* 64-bit */ + size_t len = sizeof( size ); + if ( sysctl( mib, 2, &size, &len, NULL, 0 ) == 0 ) + return (size_t)size; + return 0L; /* Failed? */ + +#elif defined(_SC_AIX_REALMEM) + /* AIX. 
----------------------------------------------------- */ + return (size_t)sysconf( _SC_AIX_REALMEM ) * (size_t)1024L; + +#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) + /* FreeBSD, Linux, OpenBSD, and Solaris. -------------------- */ + return (size_t)sysconf( _SC_PHYS_PAGES ) * + (size_t)sysconf( _SC_PAGESIZE ); + +#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGE_SIZE) + /* Legacy. -------------------------------------------------- */ + return (size_t)sysconf( _SC_PHYS_PAGES ) * + (size_t)sysconf( _SC_PAGE_SIZE ); + +#elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM)) + /* DragonFly BSD, FreeBSD, NetBSD, OpenBSD, and OSX. -------- */ + int mib[2]; + mib[0] = CTL_HW; +#if defined(HW_REALMEM) + mib[1] = HW_REALMEM; /* FreeBSD. ----------------- */ +#elif defined(HW_PYSMEM) + mib[1] = HW_PHYSMEM; /* Others. ------------------ */ +#endif + unsigned int size = 0; /* 32-bit */ + size_t len = sizeof( size ); + if ( sysctl( mib, 2, &size, &len, NULL, 0 ) == 0 ) + return (size_t)size; + return 0L; /* Failed? */ +#endif /* sysctl and sysconf variants */ + +#else + return 0L; /* Unknown OS. */ +#endif +} + +#endif // NOMEMORYSIZE +} // namespace common diff --git a/src/backend/host_memory.hpp b/src/backend/host_memory.hpp new file mode 100644 index 0000000000..5955cbfbd9 --- /dev/null +++ b/src/backend/host_memory.hpp @@ -0,0 +1,18 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include + +namespace common +{ + +size_t getHostMemorySize(); + +} diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp index 0860098c9f..002c1d5b82 100644 --- a/src/backend/opencl/Array.cpp +++ b/src/backend/opencl/Array.cpp @@ -16,41 +16,42 @@ #include #include #include +#include +#include +#include using af::dim4; namespace opencl { - - const int MAX_JIT_LEN = 20; using JIT::BufferNode; using JIT::Node; using JIT::Node_ptr; template Array::Array(af::dim4 dims) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(bufferAlloc(info.elements() * sizeof(T)), bufferFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) { } template Array::Array(af::dim4 dims, JIT::Node_ptr n) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(), data_dims(dims), - node(n), offset(0), ready(false), owner(true) + node(n), ready(false), owner(true) { } template Array::Array(af::dim4 dims, const T * const in_data) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(bufferAlloc(info.elements()*sizeof(T)), bufferFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) { static_assert(std::is_standard_layout>::value, "Array must be a standard layout type"); static_assert(offsetof(Array, info) == 0, "Array::info must be the first member variable of 
Array"); @@ -59,10 +60,10 @@ namespace opencl template Array::Array(af::dim4 dims, cl_mem mem, size_t src_offset, bool copy) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(copy ? bufferAlloc(info.elements() * sizeof(T)) : new cl::Buffer(mem), bufferFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) { if (copy) { clRetainMemObject(mem); @@ -74,12 +75,11 @@ namespace opencl } template - Array::Array(const Array& parent, const dim4 &dims, const dim4 &offsets, const dim4 &stride) : - info(parent.getDevId(), dims, offsets, stride, (af_dtype)dtype_traits::af_type), + Array::Array(const Array& parent, const dim4 &dims, const dim_t &offset_, const dim4 &stride) : + info(parent.getDevId(), dims, offset_, stride, (af_dtype)dtype_traits::af_type), data(parent.getData()), data_dims(parent.getDataDims()), node(), - offset(parent.getOffset() + calcOffset(parent.strides(), offsets)), ready(true), owner(false) { } @@ -87,15 +87,33 @@ namespace opencl template Array::Array(Param &tmp) : - info(getActiveDeviceId(), af::dim4(tmp.info.dims[0], tmp.info.dims[1], tmp.info.dims[2], tmp.info.dims[3]), - af::dim4(0, 0, 0, 0), - af::dim4(tmp.info.strides[0], tmp.info.strides[1], - tmp.info.strides[2], tmp.info.strides[3]), - (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), + af::dim4(tmp.info.dims[0], tmp.info.dims[1], tmp.info.dims[2], tmp.info.dims[3]), + 0, + af::dim4(tmp.info.strides[0], tmp.info.strides[1], + tmp.info.strides[2], tmp.info.strides[3]), + (af_dtype)dtype_traits::af_type), data(tmp.data, bufferFree), data_dims(af::dim4(tmp.info.dims[0], tmp.info.dims[1], tmp.info.dims[2], tmp.info.dims[3])), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) + { + } + + template + Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset_, 
+ const T * const in_data, bool is_device) : + info(getActiveDeviceId(), dims, offset_, strides, (af_dtype)dtype_traits::af_type), + data(is_device ? + (new cl::Buffer((cl_mem)in_data)) : + (bufferAlloc(info.total() * sizeof(T))), bufferFree), + data_dims(dims), + node(), + ready(true), + owner(true) { + if (!is_device) { + getQueue().enqueueWriteBuffer(*data.get(), CL_TRUE, 0, sizeof(T) * info.total(), in_data); + } } @@ -166,9 +184,9 @@ namespace opencl n->getInfo(length, buf_count, bytes); n->resetFlags(); - if (length > MAX_JIT_LEN || - buf_count >= MAX_BUFFERS || - bytes >= MAX_BYTES) { + if (length > getMaxJitSize() || + buf_count >= getMaxBuffers() || + bytes >= getMaxBytes()) { out.eval(); } @@ -185,18 +203,23 @@ namespace opencl dim4 dDims = parent.getDataDims(); dim4 pDims = parent.dims(); - dim4 dims = toDims (index, pDims); - dim4 offset = toOffset(index, dDims); - dim4 stride = toStride (index, dDims); + dim4 dims = toDims (index, pDims); + dim4 strides = toStride (index, dDims); - Array out = Array(parent, dims, offset, stride); + // Find total offsets after indexing + dim4 offsets = toOffset(index, pDims); + dim4 parent_strides = parent.strides(); + dim_t offset = parent.getOffset(); + for (int i = 0; i < 4; i++) offset += offsets[i] * parent_strides[i]; + + Array out = Array(parent, dims, offset, strides); if (!copy) return out; - if (stride[0] != 1 || - stride[1] < 0 || - stride[2] < 0 || - stride[3] < 0) { + if (strides[0] != 1 || + strides[1] < 0 || + strides[2] < 0 || + strides[3] < 0) { out = copyArray(out); } @@ -258,18 +281,12 @@ namespace opencl delete A; } - template - void evalArray(const Array &A) - { - A.eval(); - } - template void writeHostDataArray(Array &arr, const T * const data, const size_t bytes) { if (!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); + arr = copyArray(arr); } getQueue().enqueueWriteBuffer(*arr.get(), CL_TRUE, @@ -285,7 +302,7 @@ namespace opencl writeDeviceDataArray(Array &arr, const void * const data, 
const size_t bytes) { if (!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); + arr = copyArray(arr); } cl::Buffer& buf = *arr.get(); @@ -312,10 +329,13 @@ namespace opencl const std::vector &index, \ bool copy); \ template void destroyArray (Array *A); \ - template void evalArray (const Array &A); \ template Array createNodeArray (const dim4 &size, JIT::Node_ptr node); \ + template Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset, \ + const T * const in_data, \ + bool is_device); \ template Array::Array(af::dim4 dims, cl_mem mem, size_t src_offset, bool copy); \ template Array::~Array (); \ + template Node_ptr Array::getNode() const; \ template void Array::eval(); \ template void Array::eval() const; \ template void writeHostDataArray (Array &arr, const T * const data, const size_t bytes); \ diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp index 1db0ab6347..8c5bda90de 100644 --- a/src/backend/opencl/Array.hpp +++ b/src/backend/opencl/Array.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace opencl { @@ -69,9 +70,6 @@ namespace opencl const std::vector &index, bool copy=true); - template - void evalArray(const Array &A); - // Creates a new Array object on the heap and returns a reference to it. 
template void destroyArray(Array *A); @@ -80,10 +78,16 @@ namespace opencl void *getDevicePtr(const Array& arr) { cl::Buffer *buf = arr.device(); - memPop((T *)buf); + memLock((T *)buf); return (void *)((*buf)()); } + template + void *getRawPtr(const Array& arr) + { + return (void *)(arr.get()); + } + template class Array { @@ -92,12 +96,12 @@ namespace opencl af::dim4 data_dims; JIT::Node_ptr node; - dim_t offset; bool ready; bool owner; Array(af::dim4 dims); - Array(const Array& parnt, const dim4 &dims, const dim4 &offset, const dim4 &stride); + + Array(const Array& parnt, const dim4 &dims, const dim_t &offset, const dim4 &stride); Array(Param &tmp); explicit Array(af::dim4 dims, JIT::Node_ptr n); explicit Array(af::dim4 dims, const T * const in_data); @@ -105,6 +109,9 @@ namespace opencl public: + Array(af::dim4 dims, af::dim4 strides, dim_t offset, + const T * const in_data, bool is_device = false); + void resetInfo(const af::dim4& dims) { info.resetInfo(dims); } void resetDims(const af::dim4& dims) { info.resetDims(dims); } void modDims(const af::dim4 &newDims) { info.modDims(newDims); } @@ -115,7 +122,6 @@ namespace opencl RET_TYPE NAME() const { return info.NAME(); } INFO_FUNC(const af_dtype& ,getType) - INFO_FUNC(const af::dim4& ,offsets) INFO_FUNC(const af::dim4& ,strides) INFO_FUNC(size_t ,elements) INFO_FUNC(size_t ,ndims) @@ -185,7 +191,7 @@ namespace opencl const dim_t getOffset() const { - return offset; + return info.getOffset(); } Buffer_ptr getData() const @@ -200,6 +206,11 @@ namespace opencl return isOwner() ? 
dims() : data_dims; } + void setDataDims(const dim4 &new_dims) + { + data_dims = new_dims; + } + operator Param() const { KParam info = {{dims()[0], dims()[1], dims()[2], dims()[3]}, @@ -212,6 +223,35 @@ namespace opencl JIT::Node_ptr getNode() const; + public: + std::shared_ptr getMappedPtr() const + { + auto func = [=] (void* ptr) { + try { + if(ptr != nullptr) + getQueue().enqueueUnmapMemObject(*data, ptr); + ptr = nullptr; + } catch(cl::Error err) { + CL_TO_AF_ERROR(err); + } + }; + + T *ptr = nullptr; + try { + if(ptr == nullptr) { + ptr = (T*)getQueue().enqueueMapBuffer(*const_cast(get()), + true, CL_MAP_READ|CL_MAP_WRITE, + getOffset(), + (getDataDims().elements() - getOffset()) + * sizeof(T)); + } + } catch(cl::Error err) { + CL_TO_AF_ERROR(err); + } + + return std::shared_ptr(ptr, func); + } + friend Array createValueArray(const af::dim4 &size, const T& value); friend Array createHostDataArray(const af::dim4 &size, const T * const data); friend Array createDeviceDataArray(const af::dim4 &size, const void *data); @@ -226,8 +266,8 @@ namespace opencl bool copy); friend void destroyArray(Array *arr); - friend void evalArray(const Array &arr); friend void *getDevicePtr(const Array& arr); + friend void *getRawPtr(const Array& arr); }; } diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt index 86ba1b2aad..bbe430df15 100644 --- a/src/backend/opencl/CMakeLists.txt +++ b/src/backend/opencl/CMakeLists.txt @@ -10,13 +10,26 @@ IF(USE_OPENCL_F77_BLAS) ADD_DEFINITIONS(-DUSE_F77_BLAS) ENDIF() -IF(USE_OPENCL_MKL) - MESSAGE("Using MKL") +IF(USE_OPENCL_MKL) # Manual MKL Setup + MESSAGE("OpenCL Backend Using MKL") ADD_DEFINITIONS(-DUSE_MKL) +ELSE(USE_OPENCL_MKL) + IF(${MKL_FOUND}) # Automatic MKL Setup from BLAS + MESSAGE("OpenCL Backend Using MKL RT") + ADD_DEFINITIONS(-DUSE_MKL) + ENDIF() ENDIF() IF(APPLE) - FIND_PACKAGE(LAPACK) + FIND_PACKAGE(LAPACKE QUIET) # For finding MKL + IF(NOT LAPACK_FOUND) + # UNSET THE VARIABLES FROM LAPACKE + 
UNSET(LAPACKE_LIB CACHE) + UNSET(LAPACK_LIB CACHE) + UNSET(LAPACKE_INCLUDES CACHE) + UNSET(LAPACKE_ROOT_DIR CACHE) + FIND_PACKAGE(LAPACK) + ENDIF() ELSE(APPLE) # Linux and Windows FIND_PACKAGE(LAPACKE) ENDIF(APPLE) @@ -123,6 +136,12 @@ FILE(GLOB conv_ker_headers FILE(GLOB conv_ker_sources "kernel/convolve/*.cpp") +FILE(GLOB cpu_headers + "cpu/*.hpp") + +FILE(GLOB cpu_sources + "cpu/*.cpp") + source_group(backend\\opencl\\Headers FILES ${opencl_headers}) source_group(backend\\opencl\\Sources FILES ${opencl_sources}) source_group(backend\\opencl\\JIT FILES ${jit_sources}) @@ -131,6 +150,8 @@ source_group(backend\\opencl\\kernel\\cl FILES ${opencl_kernels}) source_group(backend\\opencl\\kernel\\Sources FILES ${kernel_sources}) source_group(backend\\opencl\\kernel\\convolve\\Headers FILES ${conv_ker_headers}) source_group(backend\\opencl\\kernel\\convolve\\Sources FILES ${conv_ker_sources}) +source_group(backend\\opencl\\cpu\\Headers FILES ${cpu_headers}) +source_group(backend\\opencl\\cpu\\Sources FILES ${cpu_sources}) IF(LAPACK_FOUND) FILE(GLOB magma_sources @@ -189,10 +210,6 @@ CL_KERNEL_TO_H( # OS Definitions IF(UNIX) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -pthread -Wno-comment") -ELSE(${UNIX}) #Windows - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") - SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /bigobj") ENDIF() IF(DEFINED BLAS_SYM_FILE) @@ -206,6 +223,8 @@ IF(DEFINED BLAS_SYM_FILE) ${kernel_sources} ${conv_ker_headers} ${conv_ker_sources} + ${cpu_headers} + ${cpu_sources} ${backend_headers} ${backend_sources} ${magma_sources} @@ -244,6 +263,8 @@ ELSE(DEFINED BLAS_SYM_FILE) ${kernel_sources} ${conv_ker_headers} ${conv_ker_sources} + ${cpu_sources} + ${cpu_sources} ${backend_headers} ${backend_sources} ${c_headers} diff --git a/src/backend/opencl/JIT/BinaryNode.hpp b/src/backend/opencl/JIT/BinaryNode.hpp index f087760b87..b1f6d112b7 100644 --- 
a/src/backend/opencl/JIT/BinaryNode.hpp +++ b/src/backend/opencl/JIT/BinaryNode.hpp @@ -51,6 +51,9 @@ namespace JIT int setArgs(cl::Kernel &ker, int id) { + if (m_set_arg) return id; + m_set_arg = true; + id = m_lhs->setArgs(ker, id); id = m_rhs->setArgs(ker, id); return id; @@ -120,10 +123,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; + resetCommonFlags(); m_lhs->resetFlags(); m_rhs->resetFlags(); } diff --git a/src/backend/opencl/JIT/BufferNode.hpp b/src/backend/opencl/JIT/BufferNode.hpp index 71723b99df..9306d59ef5 100644 --- a/src/backend/opencl/JIT/BufferNode.hpp +++ b/src/backend/opencl/JIT/BufferNode.hpp @@ -24,7 +24,6 @@ namespace JIT const std::shared_ptr m_data; const Param m_param; const unsigned m_bytes; - bool m_set_arg; bool m_linear; public: @@ -39,7 +38,6 @@ namespace JIT m_data(data), m_param(param), m_bytes(bytes), - m_set_arg(false), m_linear(is_linear) {} @@ -140,12 +138,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; - m_gen_name = false; - m_set_arg = false; + resetCommonFlags(); } }; diff --git a/src/backend/opencl/JIT/Node.hpp b/src/backend/opencl/JIT/Node.hpp index fedf7fb9bd..fc34c09c19 100644 --- a/src/backend/opencl/JIT/Node.hpp +++ b/src/backend/opencl/JIT/Node.hpp @@ -32,8 +32,20 @@ namespace JIT bool m_gen_func; bool m_gen_param; bool m_gen_offset; + bool m_set_arg; bool m_gen_name; + protected: + void resetCommonFlags() + { + m_set_id = false; + m_gen_func = false; + m_gen_param = false; + m_gen_offset = false; + m_set_arg = false; + m_gen_name = false; + } + public: Node(const char *type_str, const char *name_str) @@ -44,6 +56,7 @@ namespace JIT m_gen_func(false), m_gen_param(false), m_gen_offset(false), + m_set_arg(false), m_gen_name(false) {} @@ -64,7 +77,10 @@ namespace JIT } - virtual void resetFlags() {} + virtual void resetFlags() + { + resetCommonFlags(); + } virtual 
bool isLinear(dim_t dims[4]) { return true; } diff --git a/src/backend/opencl/JIT/ScalarNode.hpp b/src/backend/opencl/JIT/ScalarNode.hpp index 9eaa544134..0bba7a2fc9 100644 --- a/src/backend/opencl/JIT/ScalarNode.hpp +++ b/src/backend/opencl/JIT/ScalarNode.hpp @@ -24,14 +24,12 @@ namespace JIT { private: const T m_val; - bool m_set_arg; public: ScalarNode(T val) : Node(dtype_traits::getName(), shortname(false)), - m_val(val), - m_set_arg(false) + m_val(val) { } @@ -101,12 +99,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; - m_gen_name = false; - m_set_arg = false; + resetCommonFlags(); } }; diff --git a/src/backend/opencl/JIT/UnaryNode.hpp b/src/backend/opencl/JIT/UnaryNode.hpp index 78fda23e92..e1f32ded8f 100644 --- a/src/backend/opencl/JIT/UnaryNode.hpp +++ b/src/backend/opencl/JIT/UnaryNode.hpp @@ -49,6 +49,8 @@ namespace JIT int setArgs(cl::Kernel &ker, int id) { + if (m_set_arg) return id; + m_set_arg = true; return m_child->setArgs(ker, id); } @@ -108,10 +110,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; + resetCommonFlags(); m_child->resetFlags(); } }; diff --git a/src/backend/opencl/binary.hpp b/src/backend/opencl/binary.hpp index 4f58cb49e6..11493a5966 100644 --- a/src/backend/opencl/binary.hpp +++ b/src/backend/opencl/binary.hpp @@ -22,7 +22,7 @@ namespace opencl { const char *name() { - return "noop"; + return "__invalid"; } }; diff --git a/src/backend/opencl/blas.cpp b/src/backend/opencl/blas.cpp index 6173a684ea..77531154e5 100644 --- a/src/backend/opencl/blas.cpp +++ b/src/backend/opencl/blas.cpp @@ -19,6 +19,13 @@ #include #include #include +#include +#include +#include + +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) +#include +#endif namespace opencl { @@ -113,6 +120,12 @@ template Array matmul(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { +#if 
defined(WITH_OPENCL_LINEAR_ALGEBRA) + if(OpenCLCPUOffload(false)) { // Do not force offload gemm on OSX Intel devices + return cpu::matmul(lhs, rhs, optLhs, optRhs); + } +#endif + initBlas(); clblasTranspose lOpts = toClblasTranspose(optLhs); clblasTranspose rOpts = toClblasTranspose(optRhs); @@ -168,45 +181,15 @@ Array matmul(const Array &lhs, const Array &rhs, return out; } -template -Array dot_(const Array &lhs, const Array &rhs, - af_mat_prop optLhs, af_mat_prop optRhs) -{ - initBlas(); - - int N = lhs.dims()[0]; - dot_func dot; - cl::Event event; - Array out = createEmptyArray(af::dim4(1)); - cl::Buffer scratch(getContext(), CL_MEM_READ_WRITE, sizeof(T) * N); - CLBLAS_CHECK( - dot(N, - (*out.get())(), out.getOffset(), - (*lhs.get())(), lhs.getOffset(), lhs.strides()[0], - (*rhs.get())(), rhs.getOffset(), rhs.strides()[0], - scratch(), - 1, &getQueue()(), 0, nullptr, &event()) - ); - - if(both_conjugate) - transpose_inplace(out, true); - - return out; -} - template Array dot(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { - if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) { - return dot_(lhs, rhs, optLhs, optRhs); - } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) { - return dot_(lhs, rhs, optLhs, optRhs); - } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) { - return dot_(rhs, lhs, optRhs, optLhs); - } else { - return dot_(lhs, rhs, optLhs, optRhs); - } + const Array lhs_ = (optLhs == AF_MAT_NONE ? lhs : conj(lhs)); + const Array rhs_ = (optRhs == AF_MAT_NONE ? 
rhs : conj(rhs)); + + const Array temp = arithOp(lhs_, rhs_, lhs_.dims()); + return reduce(temp, 0, false, 0); } #define INSTANTIATE_BLAS(TYPE) \ diff --git a/src/backend/opencl/cholesky.cpp b/src/backend/opencl/cholesky.cpp index 78fe999645..a2034a331a 100644 --- a/src/backend/opencl/cholesky.cpp +++ b/src/backend/opencl/cholesky.cpp @@ -8,14 +8,16 @@ ********************************************************/ #include -#include #include -#include #include +#include +#include #if defined(WITH_OPENCL_LINEAR_ALGEBRA) #include #include +#include +#include namespace opencl { @@ -24,6 +26,10 @@ template int cholesky_inplace(Array &in, const bool is_upper) { try { + if(OpenCLCPUOffload()) { + return cpu::cholesky_inplace(in, is_upper); + } + initBlas(); dim4 iDims = in.dims(); @@ -46,6 +52,9 @@ template Array cholesky(int *info, const Array &in, const bool is_upper) { try { + if(OpenCLCPUOffload()) { + return cpu::cholesky(info, in, is_upper); + } Array out = copyArray(in); *info = cholesky_inplace(out, is_upper); diff --git a/src/backend/opencl/copy.cpp b/src/backend/opencl/copy.cpp index 39cbf4b59d..e1716f1632 100644 --- a/src/backend/opencl/copy.cpp +++ b/src/backend/opencl/copy.cpp @@ -29,7 +29,7 @@ namespace opencl cl::Buffer buf; Array out = A; - if (A.isOwner() || // No offsets, No strides + if (A.isLinear() || // No offsets, No strides A.ndims() == 1 // Simple offset, no strides. ) { buf = *A.get(); diff --git a/src/backend/opencl/cpu/cpu_blas.cpp b/src/backend/opencl/cpu/cpu_blas.cpp new file mode 100644 index 0000000000..724c6bb1e9 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_blas.cpp @@ -0,0 +1,210 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) +#include +#include +#include + +namespace opencl +{ +namespace cpu +{ + +using std::add_const; +using std::add_pointer; +using std::enable_if; +using std::is_floating_point; +using std::remove_const; +using std::conditional; + +// Some implementations of BLAS require void* for complex pointers while others use float*/double* +// +// Sample cgemm API +// OpenBLAS +// void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, +// OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, +// OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, +// OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, +// float *C, OPENBLAS_CONST blasint ldc); +// +// MKL +// void cblas_cgemm(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, +// const MKL_INT M, const MKL_INT N, const MKL_INT K, +// const void *alpha, const void *A, const MKL_INT lda, +// const void *B, const MKL_INT ldb, const void *beta, +// void *C, const MKL_INT ldc); +// atlas cblas +// void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, +// const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, +// const void *alpha, const void *A, const int lda, +// const void *B, const int ldb, const void *beta, +// void *C, const int ldc); +// +// LAPACKE +// void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, +// const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, +// const void *alpha, const void *A, const int lda, +// const void *B, const int ldb, const void *beta, +// void *C, const int ldc); +#if defined(IS_OPENBLAS) + static 
const bool cplx_void_ptr = false; +#else + static const bool cplx_void_ptr = true; +#endif + +template +struct blas_base { + using type = typename dtype_traits::base_type; +}; + +template +struct blas_base ::value && cplx_void_ptr>::type> { + using type = void; +}; + + +template +using cptr_type = typename conditional< is_complex::value, + const typename blas_base::type *, + const T*>::type; +template +using ptr_type = typename conditional< is_complex::value, + typename blas_base::type *, + T*>::type; +template +using scale_type = typename conditional< is_complex::value, + const typename blas_base::type *, + const T>::type; + +template +using gemm_func_def = void (*)( const CBLAS_ORDER, const CBLAS_TRANSPOSE, const CBLAS_TRANSPOSE, + const blasint, const blasint, const blasint, + scale_type, cptr_type, const blasint, + cptr_type, const blasint, + scale_type, ptr_type, const blasint); + +template +using gemv_func_def = void (*)( const CBLAS_ORDER, const CBLAS_TRANSPOSE, + const blasint, const blasint, + scale_type, cptr_type, const blasint, + cptr_type, const blasint, + scale_type, ptr_type, const blasint); + +#define BLAS_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + +#define BLAS_FUNC( FUNC, TYPE, PREFIX ) \ + template<> FUNC##_func_def FUNC##_func() \ +{ return &cblas_##PREFIX##FUNC; } + +BLAS_FUNC_DEF( gemm ) +BLAS_FUNC(gemm , float , s) +BLAS_FUNC(gemm , double , d) +BLAS_FUNC(gemm , cfloat , c) +BLAS_FUNC(gemm , cdouble , z) + +BLAS_FUNC_DEF(gemv) +BLAS_FUNC(gemv , float , s) +BLAS_FUNC(gemv , double , d) +BLAS_FUNC(gemv , cfloat , c) +BLAS_FUNC(gemv , cdouble , z) + +template +typename enable_if::value, scale_type>::type +getScale() { return T(value); } + +template +typename enable_if::value, scale_type>::type +getScale() +{ + static T val = scalar(value); + return (const typename blas_base::type *)&val; +} + +CBLAS_TRANSPOSE +toCblasTranspose(af_mat_prop opt) +{ + CBLAS_TRANSPOSE out = CblasNoTrans; + switch(opt) { + case AF_MAT_NONE : out = 
CblasNoTrans; break; + case AF_MAT_TRANS : out = CblasTrans; break; + case AF_MAT_CTRANS : out = CblasConjTrans; break; + default : AF_ERROR("INVALID af_mat_prop", AF_ERR_ARG); + } + return out; +} + +template +Array matmul(const Array &lhs, const Array &rhs, + af_mat_prop optLhs, af_mat_prop optRhs) +{ + CBLAS_TRANSPOSE lOpts = toCblasTranspose(optLhs); + CBLAS_TRANSPOSE rOpts = toCblasTranspose(optRhs); + + int aRowDim = (lOpts == CblasNoTrans) ? 0 : 1; + int aColDim = (lOpts == CblasNoTrans) ? 1 : 0; + int bColDim = (rOpts == CblasNoTrans) ? 1 : 0; + + dim4 lDims = lhs.dims(); + dim4 rDims = rhs.dims(); + int M = lDims[aRowDim]; + int N = rDims[bColDim]; + int K = lDims[aColDim]; + + //FIXME: Leaks on errors. + Array out = createValueArray(af::dim4(M, N, 1, 1), scalar(0)); + auto alpha = getScale(); + auto beta = getScale(); + + dim4 lStrides = lhs.strides(); + dim4 rStrides = rhs.strides(); + using BT = typename blas_base::type; + + // get host pointers from mapped memory + auto lPtr = lhs.getMappedPtr(); + auto rPtr = rhs.getMappedPtr(); + auto oPtr = out.getMappedPtr(); + + if(rDims[bColDim] == 1) { + N = lDims[aColDim]; + gemv_func()( + CblasColMajor, lOpts, + lDims[0], lDims[1], + alpha, + (BT*)lPtr.get(), lStrides[1], + (BT*)rPtr.get(), rStrides[0], + beta, + (BT*)oPtr.get(), 1); + } else { + gemm_func()( + CblasColMajor, lOpts, rOpts, + M, N, K, + alpha, + (BT*)lPtr.get(), lStrides[1], + (BT*)rPtr.get(), rStrides[1], + beta, + (BT*)oPtr.get(), out.dims()[0]); + } + + return out; +} + +#define INSTANTIATE_BLAS(TYPE) \ + template Array matmul(const Array &lhs, const Array &rhs, \ + af_mat_prop optLhs, af_mat_prop optRhs); + +INSTANTIATE_BLAS(float) +INSTANTIATE_BLAS(cfloat) +INSTANTIATE_BLAS(double) +INSTANTIATE_BLAS(cdouble) + +} +} +#endif diff --git a/src/backend/opencl/cpu/cpu_blas.hpp b/src/backend/opencl/cpu/cpu_blas.hpp new file mode 100644 index 0000000000..908742471d --- /dev/null +++ b/src/backend/opencl/cpu/cpu_blas.hpp @@ -0,0 +1,20 @@ 
+/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + Array matmul(const Array &lhs, const Array &rhs, + af_mat_prop optLhs, af_mat_prop optRhs); +} +} diff --git a/src/backend/opencl/cpu/cpu_cholesky.cpp b/src/backend/opencl/cpu/cpu_cholesky.cpp new file mode 100644 index 0000000000..9acbcc4fad --- /dev/null +++ b/src/backend/opencl/cpu/cpu_cholesky.cpp @@ -0,0 +1,84 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) +#include +#include +#include +#include + +namespace opencl +{ +namespace cpu +{ + +template +using potrf_func_def = int (*)(ORDER_TYPE, char, + int, + T*, int); + +#define CH_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + + +#define CH_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ +{ return & LAPACK_NAME(PREFIX##FUNC); } + +CH_FUNC_DEF( potrf ) +CH_FUNC(potrf , float , s) +CH_FUNC(potrf , double , d) +CH_FUNC(potrf , cfloat , c) +CH_FUNC(potrf , cdouble, z) + +template +Array cholesky(int *info, const Array &in, const bool is_upper) +{ + Array out = copyArray(in); + *info = cholesky_inplace(out, is_upper); + + std::shared_ptr oPtr = out.getMappedPtr(); + + if (is_upper) triangle(oPtr.get(), oPtr.get(), out.dims(), out.strides(), out.strides()); + else triangle(oPtr.get(), oPtr.get(), out.dims(), out.strides(), out.strides()); + + 
return out; +} + +template +int cholesky_inplace(Array &in, const bool is_upper) +{ + dim4 iDims = in.dims(); + int N = iDims[0]; + + char uplo = 'L'; + if(is_upper) + uplo = 'U'; + + std::shared_ptr inPtr = in.getMappedPtr(); + + int info = potrf_func()(AF_LAPACK_COL_MAJOR, uplo, + N, inPtr.get(), in.strides()[1]); + + return info; +} + +#define INSTANTIATE_CH(T) \ + template int cholesky_inplace(Array &in, const bool is_upper); \ + template Array cholesky (int *info, const Array &in, const bool is_upper); \ + + +INSTANTIATE_CH(float) +INSTANTIATE_CH(cfloat) +INSTANTIATE_CH(double) +INSTANTIATE_CH(cdouble) + +} +} +#endif diff --git a/src/backend/opencl/cpu/cpu_cholesky.hpp b/src/backend/opencl/cpu/cpu_cholesky.hpp new file mode 100644 index 0000000000..041e93980e --- /dev/null +++ b/src/backend/opencl/cpu/cpu_cholesky.hpp @@ -0,0 +1,22 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + Array cholesky(int *info, const Array &in, const bool is_upper); + + template + int cholesky_inplace(Array &in, const bool is_upper); +} +} diff --git a/src/backend/opencl/cpu/cpu_helper.hpp b/src/backend/opencl/cpu/cpu_helper.hpp new file mode 100644 index 0000000000..f7f690322c --- /dev/null +++ b/src/backend/opencl/cpu/cpu_helper.hpp @@ -0,0 +1,74 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#ifndef AF_OPENCL_CPU +#define AF_OPENCL_CPU + +#include +#include +#include +#include +#include +#include + +//********************************************************/ +// LAPACK +//********************************************************/ +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) + +#define lapack_complex_float opencl::cfloat +#define lapack_complex_double opencl::cdouble +#define LAPACK_PREFIX LAPACKE_ +#define ORDER_TYPE int +#define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR +#define LAPACK_NAME(fn) LAPACKE_##fn + +#ifdef USE_MKL + #include +#else + #ifdef __APPLE__ + #include + #include + #undef AF_LAPACK_COL_MAJOR + #define AF_LAPACK_COL_MAJOR 0 + #else // NETLIB LAPACKE + #include + #endif +#endif + +#endif // WITH_OPENCL_LINEAR_ALGEBRA + +//********************************************************/ +// BLAS +//********************************************************/ +#ifdef USE_MKL + #include +#else + #ifdef __APPLE__ + #include + #else + extern "C" { + #include + } + #endif +#endif + +// TODO: Ask upstream for a more official way to detect it +#ifdef OPENBLAS_CONST +#define IS_OPENBLAS +#endif + +// Make sure we get the correct type signature for OpenBLAS +// OpenBLAS defines blasint as its index type. Emulate this +// if we're not dealing with openblas and use it where applicable +#ifndef IS_OPENBLAS +typedef int blasint; +#endif + +#endif diff --git a/src/backend/opencl/cpu/cpu_inverse.cpp b/src/backend/opencl/cpu/cpu_inverse.cpp new file mode 100644 index 0000000000..4f73a80707 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_inverse.cpp @@ -0,0 +1,76 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) +#include +#include +#include +#include + +namespace opencl +{ +namespace cpu +{ + +template +using getri_func_def = int (*)(ORDER_TYPE, int, + T *, int, + const int *); + +#define INV_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + +#define INV_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ +{ return & LAPACK_NAME(PREFIX##FUNC); } + +INV_FUNC_DEF( getri ) +INV_FUNC(getri , float , s) +INV_FUNC(getri , double , d) +INV_FUNC(getri , cfloat , c) +INV_FUNC(getri , cdouble, z) + +template +Array inverse(const Array &in) +{ + int M = in.dims()[0]; + //int N = in.dims()[1]; + + // This condition is already handled in opencl/inverse.cpp + //if (M != N) { + //Array I = identity(in.dims()); + //return solve(in, I); + //} + + Array A = copyArray(in); + + Array pivot = cpu::lu_inplace(A, false); + + + std::shared_ptr aPtr = A.getMappedPtr(); + std::shared_ptr pPtr = pivot.getMappedPtr(); + + getri_func()(AF_LAPACK_COL_MAJOR, M, + aPtr.get(), A.strides()[1], + pPtr.get()); + + return A; +} + +#define INSTANTIATE(T) \ + template Array inverse (const Array &in); + +INSTANTIATE(float) +INSTANTIATE(cfloat) +INSTANTIATE(double) +INSTANTIATE(cdouble) + +} +} +#endif diff --git a/src/backend/opencl/cpu/cpu_inverse.hpp b/src/backend/opencl/cpu/cpu_inverse.hpp new file mode 100644 index 0000000000..38581a1906 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_inverse.hpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + Array inverse(const Array &in); +} +} diff --git a/src/backend/opencl/cpu/cpu_lu.cpp b/src/backend/opencl/cpu/cpu_lu.cpp new file mode 100644 index 0000000000..e0234fb7de --- /dev/null +++ b/src/backend/opencl/cpu/cpu_lu.cpp @@ -0,0 +1,178 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) +#include +#include +#include +#include +#include + +namespace opencl +{ +namespace cpu +{ + +template +using getrf_func_def = int (*)(ORDER_TYPE, int, int, + T*, int, + int*); + +#define LU_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + + +#define LU_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ +{ return & LAPACK_NAME(PREFIX##FUNC); } + +LU_FUNC_DEF( getrf ) +LU_FUNC(getrf , float , s) +LU_FUNC(getrf , double , d) +LU_FUNC(getrf , cfloat , c) +LU_FUNC(getrf , cdouble, z) + +template +void lu_split(Array &lower, Array &upper, const Array &in) +{ + std::shared_ptr ls = lower.getMappedPtr(); + std::shared_ptr us = upper.getMappedPtr(); + std::shared_ptr is = in.getMappedPtr(); + + T *l = ls.get(); + T *u = us.get(); + T *i = is.get(); + + dim4 ldm = lower.dims(); + dim4 udm = upper.dims(); + dim4 idm = in.dims(); + + dim4 lst = lower.strides(); + dim4 ust = upper.strides(); + dim4 ist = in.strides(); + + for(dim_t ow = 0; ow < idm[3]; ow++) { + const dim_t lW = ow * lst[3]; + const dim_t uW = ow * ust[3]; + const dim_t iW = ow * ist[3]; + + for(dim_t oz = 0; oz < idm[2]; oz++) { + const 
dim_t lZW = lW + oz * lst[2]; + const dim_t uZW = uW + oz * ust[2]; + const dim_t iZW = iW + oz * ist[2]; + + for(dim_t oy = 0; oy < idm[1]; oy++) { + const dim_t lYZW = lZW + oy * lst[1]; + const dim_t uYZW = uZW + oy * ust[1]; + const dim_t iYZW = iZW + oy * ist[1]; + + for(dim_t ox = 0; ox < idm[0]; ox++) { + const dim_t lMem = lYZW + ox; + const dim_t uMem = uYZW + ox; + const dim_t iMem = iYZW + ox; + if(ox > oy) { + if(oy < ldm[1]) + l[lMem] = i[iMem]; + if(ox < udm[0]) + u[uMem] = scalar(0); + } else if (oy > ox) { + if(oy < ldm[1]) + l[lMem] = scalar(0); + if(ox < udm[0]) + u[uMem] = i[iMem]; + } else if(ox == oy) { + if(oy < ldm[1]) + l[lMem] = scalar(1.0); + if(ox < udm[0]) + u[uMem] = i[iMem]; + } + } + } + } + } +} + +void convertPivot(Array &pivot, int out_sz) +{ + Array p = range(dim4(out_sz), 0); // Runs opencl + + std::shared_ptr pi = pivot.getMappedPtr(); + std::shared_ptr po = p.getMappedPtr(); + + int *d_pi = pi.get(); + int *d_po = po.get(); + + dim_t d0 = pivot.dims()[0]; + + for(int j = 0; j < (int)d0; j++) { + // 1 indexed in pivot + std::swap(d_po[j], d_po[d_pi[j] - 1]); + } + + pi.reset(); + po.reset(); + + pivot = p; +} + +template +void lu(Array &lower, Array &upper, Array &pivot, const Array &in) +{ + dim4 iDims = in.dims(); + int M = iDims[0]; + int N = iDims[1]; + + Array in_copy = copyArray(in); + pivot = lu_inplace(in_copy); + + // SPLIT into lower and upper + dim4 ldims(M, min(M, N)); + dim4 udims(min(M, N), N); + lower = createEmptyArray(ldims); + upper = createEmptyArray(udims); + + lu_split(lower, upper, in_copy); +} + +template +Array lu_inplace(Array &in, const bool convert_pivot) +{ + dim4 iDims = in.dims(); + int M = iDims[0]; + int N = iDims[1]; + + Array pivot = createEmptyArray(af::dim4(min(M, N), 1, 1, 1)); + + std::shared_ptr inPtr = in.getMappedPtr(); + std::shared_ptr piPtr = pivot.getMappedPtr(); + + getrf_func()(AF_LAPACK_COL_MAJOR, M, N, + inPtr.get(), in.strides()[1], + piPtr.get()); + + inPtr.reset(); + 
piPtr.reset(); + + if(convert_pivot) convertPivot(pivot, M); + + return pivot; +} + +#define INSTANTIATE_LU(T) \ + template Array lu_inplace(Array &in, const bool convert_pivot); \ + template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); + +INSTANTIATE_LU(float) +INSTANTIATE_LU(cfloat) +INSTANTIATE_LU(double) +INSTANTIATE_LU(cdouble) + +} +} +#endif diff --git a/src/backend/opencl/cpu/cpu_lu.hpp b/src/backend/opencl/cpu/cpu_lu.hpp new file mode 100644 index 0000000000..6c038f20c7 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_lu.hpp @@ -0,0 +1,22 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + void lu(Array &lower, Array &upper, Array &pivot, const Array &in); + + template + Array lu_inplace(Array &in, const bool convert_pivot = true); +} +} diff --git a/src/backend/opencl/cpu/cpu_qr.cpp b/src/backend/opencl/cpu/cpu_qr.cpp new file mode 100644 index 0000000000..737a7aec2f --- /dev/null +++ b/src/backend/opencl/cpu/cpu_qr.cpp @@ -0,0 +1,118 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) +#include +#include +#include +#include + +namespace opencl +{ +namespace cpu +{ + +template +using geqrf_func_def = int (*)(ORDER_TYPE, int, int, + T*, int, + T*); + +template +using gqr_func_def = int (*)(ORDER_TYPE, int, int, int, + T*, int, + const T*); + +#define QR_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + + +#define QR_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ +{ return & LAPACK_NAME(PREFIX##FUNC); } + +QR_FUNC_DEF( geqrf ) +QR_FUNC(geqrf , float , s) +QR_FUNC(geqrf , double , d) +QR_FUNC(geqrf , cfloat , c) +QR_FUNC(geqrf , cdouble, z) + +#define GQR_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + +#define GQR_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ +{ return & LAPACK_NAME(PREFIX); } + +GQR_FUNC_DEF( gqr ) +GQR_FUNC(gqr , float , sorgqr) +GQR_FUNC(gqr , double , dorgqr) +GQR_FUNC(gqr , cfloat , cungqr) +GQR_FUNC(gqr , cdouble, zungqr) + +template +void qr(Array &q, Array &r, Array &t, const Array &in) +{ + dim4 iDims = in.dims(); + int M = iDims[0]; + int N = iDims[1]; + + dim4 padDims(M, max(M, N)); + q = padArray(in, padDims, scalar(0)); + q.resetDims(iDims); + t = qr_inplace(q); + + // SPLIT into q and r + dim4 rdims(M, N); + r = createEmptyArray(rdims); + + std::shared_ptr qPtr = q.getMappedPtr(); + std::shared_ptr rPtr = r.getMappedPtr(); + std::shared_ptr tPtr = t.getMappedPtr(); + + triangle(rPtr.get(), qPtr.get(), rdims, r.strides(), q.strides()); + + gqr_func()(AF_LAPACK_COL_MAJOR, + M, M, min(M, N), + qPtr.get(), q.strides()[1], + tPtr.get()); + + q.resetDims(dim4(M, M)); +} + +template +Array qr_inplace(Array &in) +{ + dim4 iDims = in.dims(); + int M = iDims[0]; + int N = iDims[1]; + + Array t = createEmptyArray(af::dim4(min(M, N), 1, 1, 1)); + + 
std::shared_ptr iPtr = in.getMappedPtr(); + std::shared_ptr tPtr = t.getMappedPtr(); + + geqrf_func()(AF_LAPACK_COL_MAJOR, M, N, + iPtr.get(), in.strides()[1], + tPtr.get()); + + return t; +} + +#define INSTANTIATE_QR(T) \ + template Array qr_inplace(Array &in); \ + template void qr(Array &q, Array &r, Array &t, const Array &in); + +INSTANTIATE_QR(float) +INSTANTIATE_QR(cfloat) +INSTANTIATE_QR(double) +INSTANTIATE_QR(cdouble) + +} +} +#endif diff --git a/src/backend/opencl/cpu/cpu_qr.hpp b/src/backend/opencl/cpu/cpu_qr.hpp new file mode 100644 index 0000000000..c499b9d03b --- /dev/null +++ b/src/backend/opencl/cpu/cpu_qr.hpp @@ -0,0 +1,22 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + void qr(Array &q, Array &r, Array &t, const Array &in); + + template + Array qr_inplace(Array &in); +} +} diff --git a/src/backend/opencl/cpu/cpu_solve.cpp b/src/backend/opencl/cpu/cpu_solve.cpp new file mode 100644 index 0000000000..1bb72f8768 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_solve.cpp @@ -0,0 +1,176 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) +#include +#include +#include +#include + +namespace opencl +{ +namespace cpu +{ + +template +using gesv_func_def = int (*)(ORDER_TYPE, int, int, + T *, int, + int *, + T *, int); + +template +using gels_func_def = int (*)(ORDER_TYPE, char, + int, int, int, + T *, int, + T *, int); + +template +using getrs_func_def = int (*)(ORDER_TYPE, char, + int, int, + const T *, int, + const int *, + T *, int); + +template +using trtrs_func_def = int (*)(ORDER_TYPE, + char, char, char, + int, int, + const T *, int, + T *, int); + + +#define SOLVE_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + + +#define SOLVE_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ +{ return & LAPACK_NAME(PREFIX##FUNC); } + +SOLVE_FUNC_DEF( gesv ) +SOLVE_FUNC(gesv , float , s) +SOLVE_FUNC(gesv , double , d) +SOLVE_FUNC(gesv , cfloat , c) +SOLVE_FUNC(gesv , cdouble, z) + +SOLVE_FUNC_DEF( gels ) +SOLVE_FUNC(gels , float , s) +SOLVE_FUNC(gels , double , d) +SOLVE_FUNC(gels , cfloat , c) +SOLVE_FUNC(gels , cdouble, z) + +SOLVE_FUNC_DEF( getrs ) +SOLVE_FUNC(getrs , float , s) +SOLVE_FUNC(getrs , double , d) +SOLVE_FUNC(getrs , cfloat , c) +SOLVE_FUNC(getrs , cdouble, z) + +SOLVE_FUNC_DEF( trtrs ) +SOLVE_FUNC(trtrs , float , s) +SOLVE_FUNC(trtrs , double , d) +SOLVE_FUNC(trtrs , cfloat , c) +SOLVE_FUNC(trtrs , cdouble, z) + +template +Array solveLU(const Array &A, const Array &pivot, + const Array &b, const af_mat_prop options) +{ + int N = A.dims()[0]; + int NRHS = b.dims()[1]; + + Array B = copyArray(b); + + std::shared_ptr aPtr = A.getMappedPtr(); + std::shared_ptr bPtr = B.getMappedPtr(); + std::shared_ptr pPtr = pivot.getMappedPtr(); + + getrs_func()(AF_LAPACK_COL_MAJOR, 'N', + N, NRHS, + aPtr.get(), A.strides()[1], + pPtr.get(), + bPtr.get(), B.strides()[1]); + 
+ return B; +} + +template +Array triangleSolve(const Array &A, const Array &b, const af_mat_prop options) +{ + Array B = copyArray(b); + int N = B.dims()[0]; + int NRHS = B.dims()[1]; + + std::shared_ptr aPtr = A.getMappedPtr(); + std::shared_ptr bPtr = B.getMappedPtr(); + + trtrs_func()(AF_LAPACK_COL_MAJOR, + options & AF_MAT_UPPER ? 'U' : 'L', + 'N', // transpose flag + options & AF_MAT_DIAG_UNIT ? 'U' : 'N', + N, NRHS, + aPtr.get(), A.strides()[1], + bPtr.get(), B.strides()[1]); + + return B; +} + + +template +Array solve(const Array &a, const Array &b, const af_mat_prop options) +{ + + if (options & AF_MAT_UPPER || + options & AF_MAT_LOWER) { + return triangleSolve(a, b, options); + } + + int M = a.dims()[0]; + int N = a.dims()[1]; + int K = b.dims()[1]; + + Array A = copyArray(a); + Array B = padArray(b, dim4(max(M, N), K), scalar(0)); + + std::shared_ptr aPtr = A.getMappedPtr(); + std::shared_ptr bPtr = B.getMappedPtr(); + + if(M == N) { + std::vector pivot(N); + gesv_func()(AF_LAPACK_COL_MAJOR, N, K, + aPtr.get(), A.strides()[1], + &pivot.front(), + bPtr.get(), B.strides()[1]); + } else { + int sM = a.strides()[1]; + int sN = a.strides()[2] / sM; + + gels_func()(AF_LAPACK_COL_MAJOR, 'N', + M, N, K, + aPtr.get(), A.strides()[1], + bPtr.get(), max(sM, sN)); + B.resetDims(dim4(N, K)); + } + + return B; +} + +#define INSTANTIATE_SOLVE(T) \ + template Array solve(const Array &a, const Array &b, \ + const af_mat_prop options); \ + template Array solveLU(const Array &A, const Array &pivot, \ + const Array &b, const af_mat_prop options); \ + +INSTANTIATE_SOLVE(float) +INSTANTIATE_SOLVE(cfloat) +INSTANTIATE_SOLVE(double) +INSTANTIATE_SOLVE(cdouble) + +} +} +#endif diff --git a/src/backend/opencl/cpu/cpu_solve.hpp b/src/backend/opencl/cpu/cpu_solve.hpp new file mode 100644 index 0000000000..6c3de642ad --- /dev/null +++ b/src/backend/opencl/cpu/cpu_solve.hpp @@ -0,0 +1,23 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + 
* All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + Array solve(const Array &a, const Array &b, const af_mat_prop options = AF_MAT_NONE); + + template + Array solveLU(const Array &a, const Array &pivot, + const Array &b, const af_mat_prop options = AF_MAT_NONE); +} +} diff --git a/src/backend/opencl/cpu/cpu_svd.cpp b/src/backend/opencl/cpu/cpu_svd.cpp new file mode 100644 index 0000000000..3608bf69ce --- /dev/null +++ b/src/backend/opencl/cpu/cpu_svd.cpp @@ -0,0 +1,112 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) +#include +#include +#include + +namespace opencl +{ +namespace cpu +{ + +#define SVD_FUNC_DEF( FUNC ) \ + template svd_func_def svd_func(); + +#define SVD_FUNC( FUNC, T, Tr, PREFIX ) \ + template<> svd_func_def svd_func() \ + { return & LAPACK_NAME(PREFIX##FUNC); } + +#if defined(USE_MKL) || defined(__APPLE__) + + template + using svd_func_def = int (*)(ORDER_TYPE, + char jobz, + int m, int n, + T* in, int ldin, + Tr* s, + T* u, int ldu, + T* vt, int ldvt); + + SVD_FUNC_DEF( gesdd ) + SVD_FUNC(gesdd, float , float , s) + SVD_FUNC(gesdd, double , double, d) + SVD_FUNC(gesdd, cfloat , float , c) + SVD_FUNC(gesdd, cdouble, double, z) + +#else // Atlas causes memory freeing issues with using gesdd + + template + using svd_func_def = int (*)(ORDER_TYPE, + char jobu, char jobvt, + int m, int n, + T* in, int ldin, + Tr* s, + T* u, int ldu, + T* vt, int ldvt, + Tr *superb); 
+ + SVD_FUNC_DEF( gesvd ) + SVD_FUNC(gesvd, float , float , s) + SVD_FUNC(gesvd, double , double, d) + SVD_FUNC(gesvd, cfloat , float , c) + SVD_FUNC(gesvd, cdouble, double, z) + +#endif + + template + void svdInPlace(Array &s, Array &u, Array &vt, Array &in) + { + dim4 iDims = in.dims(); + int M = iDims[0]; + int N = iDims[1]; + + std::shared_ptr sPtr = s.getMappedPtr(); + std::shared_ptr uPtr = u.getMappedPtr(); + std::shared_ptr vPtr = vt.getMappedPtr(); + std::shared_ptr iPtr = in.getMappedPtr(); + +#if defined(USE_MKL) || defined(__APPLE__) + svd_func()(AF_LAPACK_COL_MAJOR, 'A', + M, N, + iPtr.get(), in.strides()[1], + sPtr.get(), + uPtr.get(), u.strides()[1], + vPtr.get(), vt.strides()[1]); +#else + std::vector superb(std::min(M, N)); + svd_func()(AF_LAPACK_COL_MAJOR, 'A', 'A', + M, N, + iPtr.get(), in.strides()[1], + sPtr.get(), + uPtr.get(), u.strides()[1], + vPtr.get(), vt.strides()[1], + &superb[0]); +#endif + } + + template + void svd(Array &s, Array &u, Array &vt, const Array &in) + { + Array in_copy = copyArray(in); + svdInPlace(s, u, vt, in_copy); + } + +#define INSTANTIATE_SVD(T, Tr) \ + template void svd(Array & s, Array & u, Array & vt, const Array &in); \ + template void svdInPlace(Array & s, Array & u, Array & vt, Array &in); + + INSTANTIATE_SVD(float , float ) + INSTANTIATE_SVD(double , double) + INSTANTIATE_SVD(cfloat , float ) + INSTANTIATE_SVD(cdouble, double) +} +} +#endif diff --git a/src/backend/opencl/cpu/cpu_svd.hpp b/src/backend/opencl/cpu/cpu_svd.hpp new file mode 100644 index 0000000000..4f271af8b9 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_svd.hpp @@ -0,0 +1,22 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + void svd(Array &s, Array &u, Array &vt, const Array &in); + + template + void svdInPlace(Array &s, Array &u, Array &vt, Array &in); +} +} diff --git a/src/backend/opencl/cpu/cpu_triangle.hpp b/src/backend/opencl/cpu/cpu_triangle.hpp new file mode 100644 index 0000000000..e705420582 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_triangle.hpp @@ -0,0 +1,57 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) +#ifndef CPU_LAPACK_TRIANGLE +#define CPU_LAPACK_TRIANGLE + +#include + +namespace opencl +{ +namespace cpu +{ + +template +void triangle(T *o, const T *i, const dim4 odm, const dim4 ost, const dim4 ist) +{ + for(dim_t ow = 0; ow < odm[3]; ow++) { + const dim_t oW = ow * ost[3]; + const dim_t iW = ow * ist[3]; + + for(dim_t oz = 0; oz < odm[2]; oz++) { + const dim_t oZW = oW + oz * ost[2]; + const dim_t iZW = iW + oz * ist[2]; + + for(dim_t oy = 0; oy < odm[1]; oy++) { + const dim_t oYZW = oZW + oy * ost[1]; + const dim_t iYZW = iZW + oy * ist[1]; + + for(dim_t ox = 0; ox < odm[0]; ox++) { + const dim_t oMem = oYZW + ox; + const dim_t iMem = iYZW + ox; + + bool cond = is_upper ? (oy >= ox) : (oy <= ox); + bool do_unit_diag = (is_unit_diag && ox == oy); + if(cond) { + o[oMem] = do_unit_diag ? 
scalar(1) : i[iMem]; + } else { + o[oMem] = scalar(0); + } + } + } + } + } +} + +} +} + +#endif +#endif diff --git a/src/backend/opencl/debug_opencl.hpp b/src/backend/opencl/debug_opencl.hpp index 74b3f7cf59..b4126f9abe 100644 --- a/src/backend/opencl/debug_opencl.hpp +++ b/src/backend/opencl/debug_opencl.hpp @@ -16,5 +16,10 @@ #include #define CL_DEBUG_FINISH(Q) Q.finish() #else -#define CL_DEBUG_FINISH(Q) +#define CL_DEBUG_FINISH(Q) \ + do { \ + if(synchronize_calls()) { \ + Q.finish(); \ + } \ + } while (false); #endif diff --git a/src/backend/opencl/err_opencl.hpp b/src/backend/opencl/err_opencl.hpp index 15855f3b08..955275203a 100644 --- a/src/backend/opencl/err_opencl.hpp +++ b/src/backend/opencl/err_opencl.hpp @@ -23,8 +23,8 @@ char opencl_err_msg[1024]; \ snprintf(opencl_err_msg, \ sizeof(opencl_err_msg), \ - "OpenCL Error: %s when calling %s", \ - getErrorMessage(ERR.err()).c_str(), \ + "OpenCL Error (%d): %s when calling %s", \ + ERR.err(), getErrorMessage(ERR.err()).c_str(), \ ERR.what()); \ if (ERR.err() == CL_MEM_OBJECT_ALLOCATION_FAILURE) { \ AF_ERROR(opencl_err_msg, AF_ERR_NO_MEM); \ diff --git a/src/backend/opencl/inverse.cpp b/src/backend/opencl/inverse.cpp index eb8348edd4..df955547ba 100644 --- a/src/backend/opencl/inverse.cpp +++ b/src/backend/opencl/inverse.cpp @@ -12,6 +12,8 @@ #include #if defined(WITH_OPENCL_LINEAR_ALGEBRA) +#include +#include namespace opencl { @@ -19,6 +21,10 @@ namespace opencl template Array inverse(const Array &in) { + if(OpenCLCPUOffload()) { + if (in.dims()[0] == in.dims()[1]) + return cpu::inverse(in); + } Array I = identity(in.dims()); return solve(in, I); } diff --git a/src/backend/opencl/jit.cpp b/src/backend/opencl/jit.cpp index 66c7c1e9f7..d6ab240fd6 100644 --- a/src/backend/opencl/jit.cpp +++ b/src/backend/opencl/jit.cpp @@ -19,6 +19,7 @@ #include #include #include +#include namespace opencl { @@ -180,13 +181,16 @@ void evalNodes(Param &out, Node *node) uint groups_1 = 1; uint num_odims = 4; + // CPUs seem to 
perform better with work group size 1024 + const int work_group_size = (getActiveDeviceType() == AFCL_DEVICE_TYPE_CPU) ? 1024 : 256; + while (num_odims >= 1) { if (out.info.dims[num_odims - 1] == 1) num_odims--; else break; } if (is_linear) { - local_0 = 256; + local_0 = work_group_size; uint out_elements = out.info.dims[3] * out.info.strides[3]; uint groups = divup(out_elements, local_0); @@ -194,8 +198,8 @@ void evalNodes(Param &out, Node *node) global_0 = divup(groups, global_1) * local_0; } else { - local_0 = 64; local_1 = 4; + local_0 = work_group_size / local_1; groups_0 = divup(out.info.dims[0], local_0); groups_1 = divup(out.info.dims[1], local_1); diff --git a/src/backend/opencl/kernel/convolve.hpp b/src/backend/opencl/kernel/convolve.hpp index 035f4c23aa..6d1d7de7ee 100644 --- a/src/backend/opencl/kernel/convolve.hpp +++ b/src/backend/opencl/kernel/convolve.hpp @@ -52,6 +52,7 @@ void convolve_nd(Param out, const Param signal, const Param filter, ConvolveBatc case 3: conv3(param, out, signal, filter); break; } + CL_DEBUG_FINISH(getQueue()); bufferFree(param.impulse); } diff --git a/src/backend/opencl/kernel/ireduce.hpp b/src/backend/opencl/kernel/ireduce.hpp index 0adc0c8e47..17fc460970 100644 --- a/src/backend/opencl/kernel/ireduce.hpp +++ b/src/backend/opencl/kernel/ireduce.hpp @@ -281,6 +281,14 @@ namespace kernel } } +#if defined(__GNUC__) || defined(__GNUG__) + /* GCC/G++, Clang/LLVM, Intel ICC */ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-function" +#else + /* Other */ +#endif + template double cabs(const T in) { return (double)in; } static double cabs(const cfloat in) { return (double)abs(in); } static double cabs(const cdouble in) { return (double)abs(in); } @@ -327,6 +335,12 @@ namespace kernel } }; +#if defined(__GNUC__) || defined(__GNUG__) + /* GCC/G++, Clang/LLVM, Intel ICC */ + #pragma GCC diagnostic pop +#else + /* Other */ +#endif template T ireduce_all(uint *loc, Param in) diff --git 
a/src/backend/opencl/kernel/jit.cl b/src/backend/opencl/kernel/jit.cl index b34bbcddd8..3092449418 100644 --- a/src/backend/opencl/kernel/jit.cl +++ b/src/backend/opencl/kernel/jit.cl @@ -8,6 +8,7 @@ ********************************************************/ #define sign(in) signbit((in)) +#define __noop(a) (a) #define __add(lhs, rhs) (lhs) + (rhs) #define __sub(lhs, rhs) (lhs) - (rhs) #define __mul(lhs, rhs) (lhs) * (rhs) diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp index 871370d63b..69c1176210 100644 --- a/src/backend/opencl/kernel/orb.hpp +++ b/src/backend/opencl/kernel/orb.hpp @@ -29,8 +29,24 @@ using cl::LocalSpaceArg; using cl::NDRange; using std::vector; -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#if defined(__clang__) + /* Clang/LLVM */ + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wsometimes-uninitialized" +#elif defined(__ICC) || defined(__INTEL_COMPILER) + /* Intel ICC/ICPC */ + // Fix the warning code here, if any +#elif defined(__GNUC__) || defined(__GNUG__) + /* GNU GCC/G++ */ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#elif defined(_MSC_VER) + /* Microsoft Visual Studio */ + #pragma warning( push ) + #pragma warning( disable : 4700 ) +#else + /* Other */ +#endif namespace opencl { @@ -505,4 +521,19 @@ void orb(unsigned* out_feat, } //namespace kernel } //namespace opencl -#pragma GCC diagnostic pop + +#if defined(__clang__) + /* Clang/LLVM */ + #pragma clang diagnostic pop +#elif defined(__ICC) || defined(__INTEL_COMPILER) + /* Intel ICC/ICPC */ + // Fix the warning code here, if any +#elif defined(__GNUC__) || defined(__GNUG__) + /* GNU GCC/G++ */ + #pragma GCC diagnostic pop +#elif defined(_MSC_VER) + /* Microsoft Visual Studio */ + #pragma warning( pop ) +#else + /* Other */ +#endif diff --git a/src/backend/opencl/kernel/select.cl b/src/backend/opencl/kernel/select.cl index 94a36031c3..03248be1b9 
100644 --- a/src/backend/opencl/kernel/select.cl +++ b/src/backend/opencl/kernel/select.cl @@ -41,7 +41,7 @@ void select_kernel(__global T *optr, KParam oinfo, const int idw = get_group_id(1) / groups_1; const int group_id_0 = get_group_id(0) - idz * groups_0; - const int group_id_1 = get_group_id(1) - idz * groups_1; + const int group_id_1 = get_group_id(1) - idw * groups_1; const int idx = group_id_0 * get_local_size(0) + get_local_id(0); const int idy = group_id_1 * get_local_size(1) + get_local_id(1); @@ -80,7 +80,7 @@ void select_scalar_kernel(__global T *optr, KParam oinfo, const int idw = get_group_id(1) / groups_1; const int group_id_0 = get_group_id(0) - idz * groups_0; - const int group_id_1 = get_group_id(1) - idz * groups_1; + const int group_id_1 = get_group_id(1) - idw * groups_1; const int idx = group_id_0 * get_local_size(0) + get_local_id(0); const int idy = group_id_1 * get_local_size(1) + get_local_id(1); diff --git a/src/backend/opencl/kernel/transform.cl b/src/backend/opencl/kernel/transform.cl index 824f50cc5d..c44c18457a 100644 --- a/src/backend/opencl/kernel/transform.cl +++ b/src/backend/opencl/kernel/transform.cl @@ -11,8 +11,27 @@ #define BILINEAR transform_b #define LOWER transform_l -void calc_affine_inverse(float* txo, __global const float* txi) +void calc_transf_inverse(float* txo, __global const float* txi) { +#if PERSPECTIVE + txo[0] = txi[4]*txi[8] - txi[5]*txi[7]; + txo[1] = -(txi[1]*txi[8] - txi[2]*txi[7]); + txo[2] = txi[1]*txi[5] - txi[2]*txi[4]; + + txo[3] = -(txi[3]*txi[8] - txi[5]*txi[6]); + txo[4] = txi[0]*txi[8] - txi[2]*txi[6]; + txo[5] = -(txi[0]*txi[5] - txi[2]*txi[3]); + + txo[6] = txi[3]*txi[7] - txi[4]*txi[6]; + txo[7] = -(txi[0]*txi[7] - txi[1]*txi[6]); + txo[8] = txi[0]*txi[4] - txi[1]*txi[3]; + + float det = txi[0]*txo[0] + txi[1]*txo[3] + txi[2]*txo[6]; + + txo[0] /= det; txo[1] /= det; txo[2] /= det; + txo[3] /= det; txo[4] /= det; txo[5] /= det; + txo[6] /= det; txo[7] /= det; txo[8] /= det; +#else float det = 
txi[0]*txi[4] - txi[1]*txi[3]; txo[0] = txi[4] / det; @@ -22,6 +41,7 @@ void calc_affine_inverse(float* txo, __global const float* txi) txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; +#endif } __kernel @@ -59,17 +79,17 @@ void transform_kernel(__global T *d_out, const KParam out, // Transform is in global memory. // Needs offset to correct transform being processed. - __global const float *tmat_ptr = c_tmat + t_idx * 6; - float tmat[6]; + __global const float *tmat_ptr = c_tmat + t_idx * TRANSF_LEN; + float tmat[TRANSF_LEN]; // We expect a inverse transform matrix by default // If it is an forward transform, then we need its inverse if(INVERSE == 1) { - #pragma unroll - for(int i = 0; i < 6; i++) + #pragma unroll 3 + for(int i = 0; i < TRANSF_LEN; i++) tmat[i] = tmat_ptr[i]; } else { - calc_affine_inverse(tmat, tmat_ptr); + calc_transf_inverse(tmat, tmat_ptr); } if (xido >= out.dims[0] && yido >= out.dims[1]) return; diff --git a/src/backend/opencl/kernel/transform.hpp b/src/backend/opencl/kernel/transform.hpp index 677acc31fe..f78c7b0ebe 100644 --- a/src/backend/opencl/kernel/transform.hpp +++ b/src/backend/opencl/kernel/transform.hpp @@ -50,7 +50,7 @@ namespace opencl >::type; - template + template void transform(Param out, const Param in, const Param tf) { try { @@ -64,11 +64,13 @@ namespace opencl std::call_once( compileFlags[device], [device] () { ToNum toNum; std::ostringstream options; - options << " -D T=" << dtype_traits::getName() - << " -D INVERSE=" << (isInverse ? 1 : 0) - << " -D ZERO=" << toNum(scalar(0)); - options << " -D VT=" << dtype_traits>::getName(); - options << " -D WT=" << dtype_traits>::getName(); + options << " -D T=" << dtype_traits::getName() + << " -D INVERSE=" << (isInverse ? 1 : 0) + << " -D PERSPECTIVE=" << (isPerspective ? 1 : 0) + << " -D TRANSF_LEN=" << (isPerspective ? 
9 : 6) + << " -D ZERO=" << toNum(scalar(0)); + options << " -D VT=" << dtype_traits>::getName(); + options << " -D WT=" << dtype_traits>::getName(); if((af_dtype) dtype_traits::af_type == c32 || (af_dtype) dtype_traits::af_type == c64) { diff --git a/src/backend/opencl/kernel/transform_interp.cl b/src/backend/opencl/kernel/transform_interp.cl index 1d82951b9d..a083df0ff6 100644 --- a/src/backend/opencl/kernel/transform_interp.cl +++ b/src/backend/opencl/kernel/transform_interp.cl @@ -25,12 +25,23 @@ void transform_n(__global T *d_out, const KParam out, __global const T *d_in, co const float *tmat, const int xido, const int yido, const int nimages) { // Compute input index - const int xidi = round(xido * tmat[0] - + yido * tmat[1] - + tmat[2]); - const int yidi = round(xido * tmat[3] - + yido * tmat[4] - + tmat[5]); + int xidi = 0, yidi = 0; +#if PERSPECTIVE + const float W = xido * tmat[6] + yido * tmat[7] + tmat[8]; + xidi = round((xido * tmat[0] + + yido * tmat[1] + + tmat[2]) / W); + yidi = round((xido * tmat[3] + + yido * tmat[4] + + tmat[5]) / W); +#else + xidi = round(xido * tmat[0] + + yido * tmat[1] + + tmat[2]); + yidi = round(xido * tmat[3] + + yido * tmat[4] + + tmat[5]); +#endif // Compute memory location of indices const int loci = yidi * in.strides[1] + xidi; @@ -54,12 +65,23 @@ void transform_b(__global T *d_out, const KParam out, __global const T *d_in, co const int loco = (yido * out.strides[1] + xido); // Compute input index - const float xid = xido * tmat[0] - + yido * tmat[1] - + tmat[2]; - const float yid = xido * tmat[3] - + yido * tmat[4] - + tmat[5]; + float xid = 0.0f, yid = 0.0f; +#if PERSPECTIVE + const float W = xido * tmat[6] + yido * tmat[7] + tmat[8]; + xid = (xido * tmat[0] + + yido * tmat[1] + + tmat[2]) / W; + yid = (xido * tmat[3] + + yido * tmat[4] + + tmat[5]) / W; +#else + xid = xido * tmat[0] + + yido * tmat[1] + + tmat[2]; + yid = xido * tmat[3] + + yido * tmat[4] + + tmat[5]; +#endif T zero = ZERO; if (xid < -0.001 || yid < 
-0.001 || in.dims[0] < xid || in.dims[1] < yid) { @@ -104,12 +126,23 @@ void transform_l(__global T *d_out, const KParam out, __global const T *d_in, co const float *tmat, const int xido, const int yido, const int nimages) { // Compute input index - const int xidi = floor(xido * tmat[0] - + yido * tmat[1] - + tmat[2]); - const int yidi = floor(xido * tmat[3] - + yido * tmat[4] - + tmat[5]); + int xidi = 0, yidi = 0; +#if PERSPECTIVE + const float W = xido * tmat[6] + yido * tmat[7] + tmat[8]; + xidi = floor((xido * tmat[0] + + yido * tmat[1] + + tmat[2]) / W); + yidi = floor((xido * tmat[3] + + yido * tmat[4] + + tmat[5]) / W); +#else + xidi = floor(xido * tmat[0] + + yido * tmat[1] + + tmat[2]); + yidi = floor(xido * tmat[3] + + yido * tmat[4] + + tmat[5]); +#endif // Compute memory location of indices const int loci = yidi * in.strides[1] + xidi; diff --git a/src/backend/opencl/kernel/where.hpp b/src/backend/opencl/kernel/where.hpp index 2cbf8c1019..2b1308fcec 100644 --- a/src/backend/opencl/kernel/where.hpp +++ b/src/backend/opencl/kernel/where.hpp @@ -159,7 +159,9 @@ namespace kernel out.info.strides[k] = total; } - get_out_idx(out.data, otmp, rtmp, in, threads_x, groups_x, groups_y); + if (total > 0) { + get_out_idx(out.data, otmp, rtmp, in, threads_x, groups_x, groups_y); + } bufferFree(rtmp.data); bufferFree(otmp.data); diff --git a/src/backend/opencl/lu.cpp b/src/backend/opencl/lu.cpp index ee76f47201..0bc6bd5283 100644 --- a/src/backend/opencl/lu.cpp +++ b/src/backend/opencl/lu.cpp @@ -14,7 +14,9 @@ #include #include #include +#include #include +#include namespace opencl { @@ -41,8 +43,11 @@ Array convertPivot(int *ipiv, int in_sz, int out_sz) template void lu(Array &lower, Array &upper, Array &pivot, const Array &in) { - try { + if(OpenCLCPUOffload()) { + return cpu::lu(lower, upper, pivot, in); + } + dim4 iDims = in.dims(); int M = iDims[0]; int N = iDims[1]; @@ -67,6 +72,10 @@ template Array lu_inplace(Array &in, const bool convert_pivot) { try { + 
if(OpenCLCPUOffload()) { + return cpu::lu_inplace(in, convert_pivot); + } + initBlas(); dim4 iDims = in.dims(); int M = iDims[0]; @@ -88,6 +97,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) } } +bool isLAPACKAvailable() +{ + return true; +} + #define INSTANTIATE_LU(T) \ template Array lu_inplace(Array &in, const bool convert_pivot); \ template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); @@ -116,6 +130,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) AF_ERROR("Linear Algebra is disabled on OpenCL", AF_ERR_NOT_CONFIGURED); } +bool isLAPACKAvailable() +{ + return false; +} + #define INSTANTIATE_LU(T) \ template Array lu_inplace(Array &in, const bool convert_pivot); \ template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); diff --git a/src/backend/opencl/lu.hpp b/src/backend/opencl/lu.hpp index af43f24614..b44eca8c60 100644 --- a/src/backend/opencl/lu.hpp +++ b/src/backend/opencl/lu.hpp @@ -17,4 +17,6 @@ namespace opencl template Array lu_inplace(Array &in, const bool convert_pivot = true); + + bool isLAPACKAvailable(); } diff --git a/src/backend/opencl/magma/getrs.cpp b/src/backend/opencl/magma/getrs.cpp index 1dc106c0c5..eb28a5175a 100644 --- a/src/backend/opencl/magma/getrs.cpp +++ b/src/backend/opencl/magma/getrs.cpp @@ -61,6 +61,7 @@ #include #include #include +#include template magma_int_t magma_getrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs, @@ -168,8 +169,7 @@ magma_getrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs, clblasTranspose cltrans =(trans == MagmaNoTrans) ? clblasNoTrans : (trans == MagmaTrans ? 
clblasTrans : clblasConjTrans); - std::string pName = opencl::getPlatformName(opencl::getDevice()); - bool cond = pName.find("NVIDIA") != std::string::npos; + bool cond = opencl::getActivePlatform() == AFCL_PLATFORM_NVIDIA; cl_mem dAT = 0; if (nrhs > 1 && cond) { magma_malloc(&dAT, n * n); diff --git a/src/backend/opencl/magma/magma_cpu_blas.h b/src/backend/opencl/magma/magma_cpu_blas.h index b3cba096b5..6661aad657 100644 --- a/src/backend/opencl/magma/magma_cpu_blas.h +++ b/src/backend/opencl/magma/magma_cpu_blas.h @@ -13,16 +13,16 @@ #include #include "magma_types.h" -#ifdef __APPLE__ -#include -#else #ifdef USE_MKL -#include + #include #else -extern "C" { -#include -} -#endif + #ifdef __APPLE__ + #include + #else + extern "C" { + #include + } + #endif #endif // Todo: Ask upstream for a more official way to detect it diff --git a/src/backend/opencl/magma/magma_cpu_lapack.h b/src/backend/opencl/magma/magma_cpu_lapack.h index 5974dab8a9..54c26ae0e9 100644 --- a/src/backend/opencl/magma/magma_cpu_lapack.h +++ b/src/backend/opencl/magma/magma_cpu_lapack.h @@ -39,16 +39,20 @@ int LAPACKE_dlacgv_work(Args... 
args) { return 0; } #define ORDER_TYPE int #define LAPACK_NAME(fn) LAPACKE_##fn -#if defined(__APPLE__) - #define LAPACK_COL_MAJOR 102 - #include "../../lapacke.hpp" +#ifdef USE_MKL + #include #else - #ifdef USE_MKL - #include + #ifdef __APPLE__ + #include + #include + #undef LAPACK_COL_MAJOR + #define LAPACK_COL_MAJOR 102 + #undef AF_LAPACK_COL_MAJOR + #define AF_LAPACK_COL_MAJOR 0 #else // NETLIB LAPACKE #include - #endif // MKL/NETLIB -#endif //APPLE + #endif +#endif #define LAPACKE_CHECK(fn) do { \ int __info = fn; \ diff --git a/src/backend/opencl/magma/magma_helper.cpp b/src/backend/opencl/magma/magma_helper.cpp index 584a412191..481f08c346 100644 --- a/src/backend/opencl/magma/magma_helper.cpp +++ b/src/backend/opencl/magma/magma_helper.cpp @@ -159,6 +159,14 @@ magma_int_t magma_get_geqrf_nb( magma_int_t m ) else return 128; } +#if defined(__GNUC__) || defined(__GNUG__) + /* GCC/G++, Clang/LLVM, Intel ICC */ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wmissing-braces" +#else + /* Other */ +#endif + template T magma_make(double r, double i) { return (T) r; } template float magma_make(double r, double i); template double magma_make(double r, double i); @@ -172,3 +180,10 @@ template<> magmaDoubleComplex magma_make(double r, double i) magmaDoubleComplex tmp = {r, i}; return tmp; } + +#if defined(__GNUC__) || defined(__GNUG__) + /* GCC/G++, Clang/LLVM, Intel ICC */ + #pragma GCC diagnostic pop +#else + /* Other */ +#endif diff --git a/src/backend/opencl/magma/potrf.cpp b/src/backend/opencl/magma/potrf.cpp index d048ed4dac..4f9984f325 100644 --- a/src/backend/opencl/magma/potrf.cpp +++ b/src/backend/opencl/magma/potrf.cpp @@ -199,7 +199,7 @@ magma_int_t magma_potrf_gpu( magma_getmatrix_async(jb, jb, dA(j,j), ldda, work, jb, queue, &event); // apply all previous updates to block row right of diagonal block - if (j+jb < n) { + if (j+jb < n && j > 0) { CLBLAS_CHECK(gpu_blas_gemm( transType, clblasNoTrans, jb, n-j-jb, j, @@ -259,7 +259,7 @@ 
magma_int_t magma_potrf_gpu( magma_getmatrix_async(jb, jb, dA(j,j), ldda, work, jb, queue, &event); // apply all previous updates to block column below diagonal block - if (j+jb < n) { + if (j+jb < n && j > 0) { CLBLAS_CHECK(gpu_blas_gemm( clblasNoTrans, transType, n-j-jb, jb, j, diff --git a/src/backend/opencl/math.hpp b/src/backend/opencl/math.hpp index 9292d398a0..f090062b03 100644 --- a/src/backend/opencl/math.hpp +++ b/src/backend/opencl/math.hpp @@ -17,6 +17,14 @@ #include "backend.hpp" #include "types.hpp" +#if defined(__GNUC__) || defined(__GNUG__) + /* GCC/G++, Clang/LLVM, Intel ICC */ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-function" +#else + /* Other */ +#endif + namespace opencl { @@ -123,3 +131,10 @@ namespace opencl cfloat operator *(cfloat a, cfloat b); cdouble operator *(cdouble a, cdouble b); } + +#if defined(__GNUC__) || defined(__GNUG__) + /* GCC/G++, Clang/LLVM, Intel ICC */ + #pragma GCC diagnostic pop +#else + /* Other */ +#endif diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index f4c740482e..5df64d6d86 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -10,344 +10,265 @@ #include #include #include +#include +#include +#include #include +#include -namespace opencl -{ - static size_t memory_resolution = 1024; //1KB - - void setMemStepSize(size_t step_bytes) - { - memory_resolution = step_bytes; - } +#include - size_t getMemStepSize(void) - { - return memory_resolution; - } - - // Manager Class - // Dummy used to call garbage collection at the end of the program - class Manager - { - public: - static bool initialized; - Manager() - { - initialized = true; - } +#ifndef AF_MEM_DEBUG +#define AF_MEM_DEBUG 0 +#endif - ~Manager() - { - for(int i = 0; i < (int)getDeviceCount(); i++) { - setDevice(i); - garbageCollect(); - pinnedGarbageCollect(); - } - } - }; +#ifndef AF_OPENCL_MEM_DEBUG +#define AF_OPENCL_MEM_DEBUG 0 +#endif - bool Manager::initialized = 
false; +namespace opencl +{ - static void managerInit() +class MemoryManager : public common::MemoryManager +{ + int getActiveDeviceId(); + size_t getMaxMemorySize(int id); +public: + MemoryManager(); + void *nativeAlloc(const size_t bytes); + void nativeFree(void *ptr); + ~MemoryManager() { - if(Manager::initialized == false) - static Manager pm = Manager(); + common::lock_guard_t lock(this->memory_mutex); + for (int n = 0; n < getDeviceCount(); n++) { + opencl::setDevice(n); + this->garbageCollect(); + } } +}; - typedef struct - { - bool is_free; - bool is_unlinked; - size_t bytes; - } mem_info; +class MemoryManagerPinned : public common::MemoryManager +{ + std::vector< + std::map + > pinned_maps; + int getActiveDeviceId(); + size_t getMaxMemorySize(int id); - static size_t used_bytes[DeviceManager::MAX_DEVICES] = {0}; - static size_t used_buffers[DeviceManager::MAX_DEVICES] = {0}; - static size_t total_bytes[DeviceManager::MAX_DEVICES] = {0}; +public: - typedef std::map mem_t; - typedef mem_t::iterator mem_iter; - mem_t memory_maps[DeviceManager::MAX_DEVICES]; + MemoryManagerPinned(); - static void destroy(cl::Buffer *ptr) - { - delete ptr; - } + void *nativeAlloc(const size_t bytes); + void nativeFree(void *ptr); - void garbageCollect() + ~MemoryManagerPinned() { - int n = getActiveDeviceId(); - for(mem_iter iter = memory_maps[n].begin(); - iter != memory_maps[n].end(); ++iter) { - - if ((iter->second).is_free) { - - if (!(iter->second).is_unlinked) { - destroy(iter->first); - total_bytes[n] -= iter->second.bytes; - } - } - } - - mem_iter memory_curr = memory_maps[n].begin(); - mem_iter memory_end = memory_maps[n].end(); - - while(memory_curr != memory_end) { - if (memory_curr->second.is_free && !memory_curr->second.is_unlinked) { - memory_curr = memory_maps[n].erase(memory_curr); - } else { - ++memory_curr; + common::lock_guard_t lock(this->memory_mutex); + for (int n = 0; n < getDeviceCount(); n++) { + opencl::setDevice(n); + this->garbageCollect(); + auto 
pinned_curr_iter = pinned_maps[n].begin(); + auto pinned_end_iter = pinned_maps[n].end(); + while (pinned_curr_iter != pinned_end_iter) { + pinned_maps[n].erase(pinned_curr_iter++); } } } +}; - cl::Buffer *bufferAlloc(const size_t &bytes) - { - int n = getActiveDeviceId(); - cl::Buffer *ptr = NULL; - size_t alloc_bytes = divup(bytes, memory_resolution) * memory_resolution; - - if (bytes > 0) { - - // FIXME: Add better checks for garbage collection - // Perhaps look at total memory available as a metric - if (memory_maps[n].size() >= MAX_BUFFERS || used_bytes[n] >= MAX_BYTES) { - garbageCollect(); - } - - for(mem_iter iter = memory_maps[n].begin(); - iter != memory_maps[n].end(); ++iter) { - - mem_info info = iter->second; - - if ( info.is_free && - !info.is_unlinked && - info.bytes == alloc_bytes) { +int MemoryManager::getActiveDeviceId() +{ + return opencl::getActiveDeviceId(); +} - iter->second.is_free = false; - used_bytes[n] += alloc_bytes; - used_buffers[n]++; - return iter->first; - } - } +size_t MemoryManager::getMaxMemorySize(int id) +{ + return opencl::getDeviceMemorySize(id); +} - try { - ptr = new cl::Buffer(getContext(), CL_MEM_READ_WRITE, alloc_bytes); - } catch(...) 
{ - garbageCollect(); - ptr = new cl::Buffer(getContext(), CL_MEM_READ_WRITE, alloc_bytes); - } +MemoryManager::MemoryManager() : + common::MemoryManager(getDeviceCount(), common::MAX_BUFFERS, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG) +{ + this->setMaxMemorySize(); +} - mem_info info = {false, false, alloc_bytes}; - memory_maps[n][ptr] = info; - used_bytes[n] += alloc_bytes; - used_buffers[n]++; - total_bytes[n] += alloc_bytes; - } - return ptr; +void *MemoryManager::nativeAlloc(const size_t bytes) +{ + try { + return (void *)(new cl::Buffer(getContext(), CL_MEM_READ_WRITE, bytes)); + } catch(cl::Error err) { + CL_TO_AF_ERROR(err); } +} - void bufferFree(cl::Buffer *ptr) - { - int n = getActiveDeviceId(); - mem_iter iter = memory_maps[n].find(ptr); - - if (iter != memory_maps[n].end()) { - - iter->second.is_free = true; - if ((iter->second).is_unlinked) return; - - used_bytes[n] -= iter->second.bytes; - used_buffers[n]--; - } else { - destroy(ptr); // Free it because we are not sure what the size is - } +void MemoryManager::nativeFree(void *ptr) +{ + try { + delete (cl::Buffer *)ptr; + } catch(cl::Error err) { + CL_TO_AF_ERROR(err); } +} - void bufferPop(cl::Buffer *ptr) - { - int n = getActiveDeviceId(); - mem_iter iter = memory_maps[n].find(ptr); - - if (iter != memory_maps[n].end()) { - iter->second.is_unlinked = true; - } else { - - mem_info info = { false, - false, - 100 }; //This number is not relevant +static MemoryManager &getMemoryManager() +{ + static MemoryManager instance; + return instance; +} - memory_maps[n][ptr] = info; - } - } +int MemoryManagerPinned::getActiveDeviceId() +{ + return opencl::getActiveDeviceId(); +} - void bufferPush(cl::Buffer *ptr) - { - int n = getActiveDeviceId(); - mem_iter iter = memory_maps[n].find(ptr); +size_t MemoryManagerPinned::getMaxMemorySize(int id) +{ + return opencl::getDeviceMemorySize(id); +} - if (iter != memory_maps[n].end()) { - iter->second.is_unlinked = false; - } - } +MemoryManagerPinned::MemoryManagerPinned() : 
+ common::MemoryManager(getDeviceCount(), common::MAX_BUFFERS, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG), + pinned_maps(getDeviceCount()) +{ + this->setMaxMemorySize(); +} - void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, - size_t *lock_bytes, size_t *lock_buffers) - { - int n = getActiveDeviceId(); - if (alloc_bytes ) *alloc_bytes = total_bytes[n]; - if (alloc_buffers ) *alloc_buffers = memory_maps[n].size(); - if (lock_bytes ) *lock_bytes = used_bytes[n]; - if (lock_buffers ) *lock_buffers = used_buffers[n]; +void *MemoryManagerPinned::nativeAlloc(const size_t bytes) +{ + void *ptr = NULL; + try { + cl::Buffer buf= cl::Buffer(getContext(), CL_MEM_ALLOC_HOST_PTR, bytes); + ptr = getQueue().enqueueMapBuffer(buf, true, CL_MAP_READ | CL_MAP_WRITE, 0, bytes); + pinned_maps[opencl::getActiveDeviceId()][ptr] = buf; + } catch(cl::Error err) { + CL_TO_AF_ERROR(err); } + return ptr; +} - template - T *memAlloc(const size_t &elements) - { - managerInit(); - return (T *)bufferAlloc(elements * sizeof(T)); - } +void MemoryManagerPinned::nativeFree(void *ptr) +{ + try { + int n = opencl::getActiveDeviceId(); + auto iter = pinned_maps[n].find(ptr); - template - void memFree(T *ptr) - { - return bufferFree((cl::Buffer *)ptr); - } + if (iter != pinned_maps[n].end()) { + getQueue().enqueueUnmapMemObject(pinned_maps[n][ptr], ptr); + pinned_maps[n].erase(iter); + } - template - void memPop(const T *ptr) - { - return bufferPop((cl::Buffer *)ptr); + } catch(cl::Error err) { + CL_TO_AF_ERROR(err); } +} - template - void memPush(const T *ptr) - { - return bufferPush((cl::Buffer *)ptr); - } +static MemoryManagerPinned &getMemoryManagerPinned() +{ + static MemoryManagerPinned instance; + return instance; +} - // pinned memory manager - typedef struct { - cl::Buffer *buf; - mem_info info; - } pinned_info; +void setMemStepSize(size_t step_bytes) +{ + getMemoryManager().setMemStepSize(step_bytes); +} - typedef std::map pinned_t; - typedef pinned_t::iterator pinned_iter; - 
pinned_t pinned_maps[DeviceManager::MAX_DEVICES]; - static size_t pinned_used_bytes = 0; +size_t getMemStepSize(void) +{ + return getMemoryManager().getMemStepSize(); +} - static void pinnedDestroy(cl::Buffer *buf, void *ptr) - { - getQueue().enqueueUnmapMemObject(*buf, (void *)ptr); - destroy(buf); - } +size_t getMaxBytes() +{ + return getMemoryManager().getMaxBytes(); +} - void pinnedGarbageCollect() - { - int n = getActiveDeviceId(); - for(auto &iter : pinned_maps[n]) { - if ((iter.second).info.is_free) { - pinnedDestroy(iter.second.buf, iter.first); - } - } +unsigned getMaxBuffers() +{ + return getMemoryManager().getMaxBuffers(); +} - pinned_iter memory_curr = pinned_maps[n].begin(); - pinned_iter memory_end = pinned_maps[n].end(); +void garbageCollect() +{ + getMemoryManager().garbageCollect(); +} - while(memory_curr != memory_end) { - if (memory_curr->second.info.is_free) { - memory_curr = pinned_maps[n].erase(memory_curr); - } else { - ++memory_curr; - } - } +void printMemInfo(const char *msg, const int device) +{ + getMemoryManager().printInfo(msg, device); +} - } +template +T* memAlloc(const size_t &elements) +{ + return (T *)getMemoryManager().alloc(elements * sizeof(T), false); +} - void *pinnedBufferAlloc(const size_t &bytes) - { - void *ptr = NULL; - int n = getActiveDeviceId(); - // Allocate the higher megabyte. Overhead of creating pinned memory is - // more so we want more resuable memory. 
- size_t alloc_bytes = divup(bytes, 1048576) * 1048576; - - if (bytes > 0) { - cl::Buffer *buf = NULL; - - // FIXME: Add better checks for garbage collection - // Perhaps look at total memory available as a metric - if (pinned_maps[n].size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) { - pinnedGarbageCollect(); - } +void* memAllocUser(const size_t &bytes) +{ + return getMemoryManager().alloc(bytes, true); +} +template +void memFree(T *ptr) +{ + return getMemoryManager().unlock((void *)ptr, false); +} - for(pinned_iter iter = pinned_maps[n].begin(); - iter != pinned_maps[n].end(); ++iter) { +void memFreeUser(void *ptr) +{ + getMemoryManager().unlock((void *)ptr, true); +} - mem_info info = iter->second.info; - if (info.is_free && info.bytes == alloc_bytes) { - iter->second.info.is_free = false; - pinned_used_bytes += alloc_bytes; - return iter->first; - } - } +cl::Buffer *bufferAlloc(const size_t &bytes) +{ + return (cl::Buffer *)getMemoryManager().alloc(bytes, false); +} - try { - buf = new cl::Buffer(getContext(), CL_MEM_ALLOC_HOST_PTR, alloc_bytes); +void bufferFree(cl::Buffer *buf) +{ + return getMemoryManager().unlock((void *)buf, false); +} - ptr = getQueue().enqueueMapBuffer(*buf, true, CL_MAP_READ|CL_MAP_WRITE, - 0, alloc_bytes); - } catch(...) 
{ - pinnedGarbageCollect(); - buf = new cl::Buffer(getContext(), CL_MEM_ALLOC_HOST_PTR, alloc_bytes); +void memLock(const void *ptr) +{ + getMemoryManager().userLock((void *)ptr); +} - ptr = getQueue().enqueueMapBuffer(*buf, true, CL_MAP_READ|CL_MAP_WRITE, - 0, alloc_bytes); - } - mem_info info = {false, false, alloc_bytes}; - pinned_info pt = {buf, info}; - pinned_maps[n][ptr] = pt; - pinned_used_bytes += alloc_bytes; - } - return ptr; - } +void memUnlock(const void *ptr) +{ + getMemoryManager().userUnlock((void *)ptr); +} - void pinnedBufferFree(void *ptr) - { - int n = getActiveDeviceId(); - pinned_iter iter = pinned_maps[n].find(ptr); +void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, + size_t *lock_bytes, size_t *lock_buffers) +{ + getMemoryManager().bufferInfo(alloc_bytes, alloc_buffers, + lock_bytes, lock_buffers); +} - if (iter != pinned_maps[n].end()) { - iter->second.info.is_free = true; - pinned_used_bytes -= iter->second.info.bytes; - } else { - pinnedDestroy(iter->second.buf, ptr); // Free it because we are not sure what the size is - pinned_maps[n].erase(iter); - } - } +template +T* pinnedAlloc(const size_t &elements) +{ + return (T *)getMemoryManagerPinned().alloc(elements * sizeof(T), false); +} - template - T* pinnedAlloc(const size_t &elements) - { - managerInit(); - return (T *)pinnedBufferAlloc(elements * sizeof(T)); - } +template +void pinnedFree(T* ptr) +{ + return getMemoryManagerPinned().unlock((void *)ptr, false); +} - template - void pinnedFree(T* ptr) - { - return pinnedBufferFree((void *) ptr); - } +bool checkMemoryLimit() +{ + return getMemoryManager().checkMemoryLimit(); +} -#define INSTANTIATE(T) \ - template T* memAlloc(const size_t &elements); \ - template void memFree(T* ptr); \ - template void memPop(const T* ptr); \ - template void memPush(const T* ptr); \ - template T* pinnedAlloc(const size_t &elements); \ - template void pinnedFree(T* ptr); \ +#define INSTANTIATE(T) \ + template T* memAlloc(const size_t 
&elements); \ + template void memFree(T* ptr); \ + template T* pinnedAlloc(const size_t &elements); \ + template void pinnedFree(T* ptr); \ INSTANTIATE(float) INSTANTIATE(cfloat) diff --git a/src/backend/opencl/memory.hpp b/src/backend/opencl/memory.hpp index c315a9a2f6..a02d387591 100644 --- a/src/backend/opencl/memory.hpp +++ b/src/backend/opencl/memory.hpp @@ -17,22 +17,32 @@ namespace opencl cl::Buffer *bufferAlloc(const size_t &bytes); void bufferFree(cl::Buffer *buf); - template T *memAlloc(const size_t &elements); - template void memFree(T *ptr); - template void memPop(const T *ptr); - template void memPush(const T *ptr); + template T* memAlloc(const size_t &elements); + void *memAllocUser(const size_t &bytes); + + // Need these as 2 separate function and not a default argument + // This is because it is used as the deleter in shared pointer + // which cannot support default arguments + template void memFree(T* ptr); + void memFreeUser(void* ptr); + + void memLock(const void *ptr); + void memUnlock(const void *ptr); template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); - static const unsigned MAX_BUFFERS = 100; - static const unsigned MAX_BYTES = (1 << 30); + size_t getMaxBytes(); + unsigned getMaxBuffers(); void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); void garbageCollect(); void pinnedGarbageCollect(); + void printMemInfo(const char *msg, const int device); + void setMemStepSize(size_t step_bytes); size_t getMemStepSize(void); + bool checkMemoryLimit(); } diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 16fb3e0d34..dc8ab4ea65 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,8 @@ #include #include #include +#include +#include using std::string; using std::vector; @@ -99,7 +102,6 @@ 
DeviceManager::~DeviceManager() for (auto q: mQueues) delete q; for (auto d : mDevices) delete d; for (auto c : mContexts) delete c; - for (auto p : mPlatforms) delete p; #endif } @@ -109,56 +111,182 @@ void DeviceManager::setContext(int device) mActiveCtxId = device; } +static inline bool verify_present(std::string pname, const char *ref) +{ + return pname.find(ref) != std::string::npos; +} + +static inline bool compare_default(const Device *ldev, const Device *rdev) +{ + const cl_device_type device_types[] = {CL_DEVICE_TYPE_GPU, + CL_DEVICE_TYPE_ACCELERATOR}; + + auto l_dev_type = ldev->getInfo(); + auto r_dev_type = rdev->getInfo(); + + // This ensures GPU > ACCELERATOR > CPU + for (auto current_type : device_types) { + auto is_l_curr_type = l_dev_type == current_type; + auto is_r_curr_type = r_dev_type == current_type; + + if ( is_l_curr_type && !is_r_curr_type) return true; + if (!is_l_curr_type && is_r_curr_type) return false; + } + + // For GPUs, this ensures discreet > integrated + auto is_l_integrared = ldev->getInfo(); + auto is_r_integrared = rdev->getInfo(); + + if (!is_l_integrared && is_r_integrared) return true; + if ( is_l_integrared && !is_r_integrared) return false; + + // At this point, the devices are of same type. 
+ // Sort based on emperical evidence of preferred platforms + + // Prefer AMD first + std::string lPlatName = getPlatformName(*ldev); + std::string rPlatName = getPlatformName(*rdev); + + if (l_dev_type == CL_DEVICE_TYPE_GPU && + r_dev_type == CL_DEVICE_TYPE_GPU ) { + // If GPU, prefer AMD > NVIDIA > Beignet / Intel > APPLE + const char *platforms[] = {"AMD", "NVIDIA", "APPLE", "INTEL", "BEIGNET"}; + + for (auto ref_name : platforms) { + if ( verify_present(lPlatName, ref_name) && + !verify_present(rPlatName, ref_name)) return true; + + if (!verify_present(lPlatName, ref_name) && + verify_present(rPlatName, ref_name)) return false; + } + + // Intel falls back to compare based on memory + } else { + // If CPU, prefer Intel > AMD > POCL > APPLE + const char *platforms[] = {"INTEL", "AMD", "POCL", "APPLE"}; + + for (auto ref_name : platforms) { + if ( verify_present(lPlatName, ref_name) && + !verify_present(rPlatName, ref_name)) return true; + + if (!verify_present(lPlatName, ref_name) && + verify_present(rPlatName, ref_name)) return false; + } + } + + + // Compare device compute versions + + { + // Check Device OpenCL Version + auto lversion = ldev->getInfo(); + auto rversion = rdev->getInfo(); + + bool lres = (lversion[7] > rversion[7]) || + ((lversion[7] == rversion[7]) && (lversion[9] > rversion[9])); + + bool rres = (lversion[7] < rversion[7]) || + ((lversion[7] == rversion[7]) && (lversion[9] < rversion[9])); + + if (lres) return true; + if (rres) return false; + } + + // Default crietria, sort based on memory + // Sort based on memory + auto l_mem = ldev->getInfo(); + auto r_mem = rdev->getInfo(); + return l_mem >= r_mem; +} + +static afcl::deviceType getDeviceTypeEnum(cl::Device dev) +{ + return (afcl::deviceType)dev.getInfo(); +} + + +static afcl::platform getPlatformEnum(cl::Device dev) +{ + std::string pname = getPlatformName(dev); + if (verify_present(pname, "AMD")) return AFCL_PLATFORM_AMD; + if (verify_present(pname, "NVIDIA")) return 
AFCL_PLATFORM_NVIDIA; + if (verify_present(pname, "INTEL")) return AFCL_PLATFORM_INTEL; + if (verify_present(pname, "APPLE")) return AFCL_PLATFORM_APPLE; + if (verify_present(pname, "BEIGNET")) return AFCL_PLATFORM_BEIGNET; + if (verify_present(pname, "POCL")) return AFCL_PLATFORM_POCL; + return AFCL_PLATFORM_UNKNOWN; +} + + DeviceManager::DeviceManager() - : mActiveCtxId(0), mActiveQId(0) + : mUserDeviceOffset(0), mActiveCtxId(0), mActiveQId(0) { try { std::vector platforms; Platform::get(&platforms); - cl_device_type DEVC_TYPES[] = { - CL_DEVICE_TYPE_GPU, -#ifndef OS_MAC - CL_DEVICE_TYPE_ACCELERATOR, - CL_DEVICE_TYPE_CPU + // This is all we need because the sort takes care of the order of devices +#ifdef OS_MAC + cl_device_type DEVICE_TYPES = CL_DEVICE_TYPE_GPU; +#else + cl_device_type DEVICE_TYPES = CL_DEVICE_TYPE_ALL; #endif - }; - - for (auto &platform : platforms) - mPlatforms.push_back(new Platform(platform)); - - unsigned nDevices = 0; - for (auto devType : DEVC_TYPES) { - for (auto &platform : platforms) { - - cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, - (cl_context_properties)(platform()), - 0}; - - std::vector devs; - try { - platform.getDevices(devType, &devs); - } catch(const cl::Error &err) { - if (err.err() != CL_DEVICE_NOT_FOUND) { - throw; - } - } - for (auto dev : devs) { - nDevices++; - Context *ctx = new Context(dev, cps); - CommandQueue *cq = new CommandQueue(*ctx, dev); - mDevices.push_back(new Device(dev)); - mContexts.push_back(ctx); - mQueues.push_back(cq); - mCtxOffsets.push_back(nDevices); - mIsGLSharingOn.push_back(false); + std::string deviceENV = getEnvVar("AF_OPENCL_DEVICE_TYPE"); + + if (deviceENV.compare("GPU") == 0) { + DEVICE_TYPES = CL_DEVICE_TYPE_GPU; + } else if (deviceENV.compare("CPU") == 0) { + DEVICE_TYPES = CL_DEVICE_TYPE_CPU; + } else if (deviceENV.compare("ACC") >= 0) { + DEVICE_TYPES = CL_DEVICE_TYPE_ACCELERATOR; + } + + + + // Iterate through platforms, get all available devices and store them + for (auto 
&platform : platforms) { + std::vector current_devices; + + try { + platform.getDevices(DEVICE_TYPES, ¤t_devices); + } catch(const cl::Error &err) { + if (err.err() != CL_DEVICE_NOT_FOUND) { + throw; } } + + for (auto dev : current_devices) { + mDevices.push_back(new Device(dev)); + } + } + + int nDevices = mDevices.size(); + + if (nDevices == 0) AF_ERROR("No OpenCL devices found", AF_ERR_RUNTIME); + + // Sort OpenCL devices based on default criteria + std::stable_sort(mDevices.begin(), mDevices.end(), compare_default); + + // Create contexts and queues once the sort is done + for (int i = 0; i < nDevices; i++) { + cl_platform_id device_platform = mDevices[i]->getInfo(); + cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)(device_platform), + 0}; + + Context *ctx = new Context(*mDevices[i], cps); + CommandQueue *cq = new CommandQueue(*ctx, *mDevices[i]); + mContexts.push_back(ctx); + mQueues.push_back(cq); + mIsGLSharingOn.push_back(false); + mDeviceTypes.push_back(getDeviceTypeEnum(*mDevices[i])); + mPlatforms.push_back(getPlatformEnum(*mDevices[i])); } - const char* deviceENV = getenv("AF_OPENCL_DEFAULT_DEVICE"); - if(deviceENV) { + bool default_device_set = false; + deviceENV = getEnvVar("AF_OPENCL_DEFAULT_DEVICE"); + if(!deviceENV.empty()) { std::stringstream s(deviceENV); int def_device = -1; s >> def_device; @@ -167,18 +295,48 @@ DeviceManager::DeviceManager() printf("Setting default device as 0\n"); } else { setContext(def_device); + default_device_set = true; } } + + deviceENV = getEnvVar("AF_OPENCL_DEFAULT_DEVICE_TYPE"); + if (!default_device_set && !deviceENV.empty()) + { + cl_device_type default_device_type = CL_DEVICE_TYPE_GPU; + if (deviceENV.compare("CPU") == 0) { + default_device_type = CL_DEVICE_TYPE_CPU; + } else if (deviceENV.compare("ACC") >= 0) { + default_device_type = CL_DEVICE_TYPE_ACCELERATOR; + } + + bool default_device_set = false; + for (int i = 0; i < nDevices; i++) { + if (mDevices[i]->getInfo() == 
default_device_type) { + default_device_set = true; + setContext(i); + break; + } + } + + if (!default_device_set) { + printf("WARNING: AF_OPENCL_DEFAULT_DEVICE_TYPE=%s is not available\n", + deviceENV.c_str()); + printf("Using default device as 0\n"); + } + } + } catch (const cl::Error &error) { CL_TO_AF_ERROR(error); } - /* loop over devices and replace contexts with - * OpenGL shared contexts whereever applicable */ + + #if defined(WITH_GRAPHICS) // Define AF_DISABLE_GRAPHICS with any value to disable initialization - const char* noGraphicsENV = getenv("AF_DISABLE_GRAPHICS"); - if(!noGraphicsENV) { // If AF_DISABLE_GRAPHICS is not defined + std::string noGraphicsENV = getEnvVar("AF_DISABLE_GRAPHICS"); + if(noGraphicsENV.empty()) { // If AF_DISABLE_GRAPHICS is not defined try { + /* loop over devices and replace contexts with + * OpenGL shared contexts whereever applicable */ int devCount = mDevices.size(); fg::Window* wHandle = graphics::ForgeManager::getInstance().getMainWindow(); for(int i=0; i strmap_t; static strmap_t platMap; if (isFirst) { - platMap["NVIDIA CUDA"] = "NVIDIA "; - platMap["Intel(R) OpenCL"] = "INTEL "; + platMap["NVIDIA CUDA"] = "NVIDIA "; + platMap["Intel(R) OpenCL"] = "INTEL "; platMap["AMD Accelerated Parallel Processing"] = "AMD "; - platMap["Intel Gen OCL Driver"] = "BEIGNET "; - platMap["Apple"] = "APPLE "; + platMap["Intel Gen OCL Driver"] = "BEIGNET "; + platMap["Apple"] = "APPLE "; + platMap["Portable Computing Language"] = "POCL "; isFirst = false; } @@ -223,45 +383,48 @@ static std::string platformMap(std::string &platStr) } } -std::string getInfo() +std::string getDeviceInfo() { ostringstream info; info << "ArrayFire v" << AF_VERSION << " (OpenCL, " << get_system() << ", build " << AF_REVISION << ")" << std::endl; unsigned nDevices = 0; - for (auto context : DeviceManager::getInstance().mContexts) { - vector devices = context->getInfo(); + for(auto &device: DeviceManager::getInstance().mDevices) { + const Platform 
platform(device->getInfo()); - for(auto &device:devices) { - const Platform platform(device.getInfo()); + string dstr = device->getInfo(); - string platStr = platform.getInfo(); - string dstr = device.getInfo(); + // Remove null termination character from the strings + dstr.pop_back(); - // Remove null termination character from the strings - platStr.pop_back(); - dstr.pop_back(); + bool show_braces = ((unsigned)getActiveDeviceId() == nDevices); - bool show_braces = ((unsigned)getActiveDeviceId() == nDevices); - string id = (show_braces ? string("[") : "-") + std::to_string(nDevices) + - (show_braces ? string("]") : "-"); - info << id << " " << platformMap(platStr) << ": " << ltrim(dstr) << " "; + string id = + (show_braces ? string("[") : "-") + + std::to_string(nDevices) + + (show_braces ? string("]") : "-"); + + size_t msize = device->getInfo(); + info << id << " " << getPlatformName(*device) << ": " << ltrim(dstr) + << ", " << msize / 1048576 << " MB"; #ifndef NDEBUG - string devVersion = device.getInfo(); - string driVersion = device.getInfo(); - devVersion.pop_back(); - driVersion.pop_back(); - info << devVersion; - info << " Device driver " << driVersion; - info << " FP64 Support(" - << (device.getInfo()>0 ? "True" : "False") - << ")"; + info << " -- "; + string devVersion = device->getInfo(); + string driVersion = device->getInfo(); + devVersion.pop_back(); + driVersion.pop_back(); + info << devVersion; + info << " -- Device driver " << driVersion; + info << " -- FP64 Support: " + << (device->getInfo()>0 ? "True" : "False"); + info << " -- Unified Memory (" + << (isHostUnifiedMemory(*device) ? 
"True" : "False") + << ")"; #endif - info << std::endl; + info << std::endl; - nDevices++; - } + nDevices++; } return info.str(); } @@ -270,6 +433,8 @@ std::string getPlatformName(const cl::Device &device) { const Platform platform(device.getInfo()); std::string platStr = platform.getInfo(); + // Remove null termination character from the strings + platStr.pop_back(); return platformMap(platStr); } @@ -295,6 +460,17 @@ int getDeviceIdFromNativeId(cl_device_id id) return devId; } +int getActiveDeviceType() +{ + DeviceManager &instance = DeviceManager::getInstance(); + return instance.mDeviceTypes[instance.mActiveQId]; +} + +int getActivePlatform() +{ + DeviceManager &instance = DeviceManager::getInstance(); + return instance.mPlatforms[instance.mActiveQId]; +} const Context& getContext() { DeviceManager& devMngr = DeviceManager::getInstance(); @@ -307,10 +483,54 @@ CommandQueue& getQueue() return *(devMngr.mQueues[devMngr.mActiveQId]); } -const cl::Device& getDevice() +const cl::Device& getDevice(int id) { DeviceManager& devMngr = DeviceManager::getInstance(); - return *(devMngr.mDevices[devMngr.mActiveQId]); + if(id == -1) id = devMngr.mActiveQId; + return *(devMngr.mDevices[id]); +} + +size_t getDeviceMemorySize(int device) +{ + const cl::Device& dev = getDevice(device); + size_t msize = dev.getInfo(); + return msize; +} + +size_t getHostMemorySize() +{ + return common::getHostMemorySize(); +} + +cl_device_type getDeviceType() +{ + cl::Device device = getDevice(); + cl_device_type type = device.getInfo(); + return type; +} + +bool isHostUnifiedMemory(const cl::Device &device) +{ + return device.getInfo(); +} + +bool OpenCLCPUOffload(bool forceOffloadOSX) +{ + static const bool offloadEnv = getEnvVar("AF_OPENCL_CPU_OFFLOAD") == "1"; + bool offload = false; + if(offloadEnv) offload = isHostUnifiedMemory(getDevice()); +#if OS_MAC + // FORCED OFFLOAD FOR LAPACK FUNCTIONS ON OSX UNIFIED MEMORY DEVICES + // + // On OSX Unified Memory devices (Intel), always offload 
LAPACK but not GEMM + // irrespective of the AF_OPENCL_CPU_OFFLOAD value + // From GEMM, OpenCLCPUOffload(false) is called which will render the + // variable inconsequential to the returned result. + // + // Issue https://github.com/arrayfire/arrayfire/issues/662 + offload = offload || forceOffloadOSX; +#endif + return offload; } bool isGLSharingSupported() @@ -478,10 +698,133 @@ void DeviceManager::markDeviceForInterop(const int device, const fg::Window* wHa } #endif +void addDeviceContext(cl_device_id dev, cl_context ctx, cl_command_queue que) +{ + try { + DeviceManager& devMngr = DeviceManager::getInstance(); + cl::Device* tDevice = new cl::Device(dev); + cl::Context* tContext = new cl::Context(ctx); + cl::CommandQueue* tQueue = (que==NULL ? + new cl::CommandQueue(*tContext, *tDevice) : new cl::CommandQueue(que)); + devMngr.mDevices.push_back(tDevice); + devMngr.mContexts.push_back(tContext); + devMngr.mQueues.push_back(tQueue); + devMngr.mPlatforms.push_back(getPlatformEnum(*tDevice)); + // FIXME: add OpenGL Interop for user provided contexts later + devMngr.mIsGLSharingOn.push_back(false); + } catch (const cl::Error &ex) { + CL_TO_AF_ERROR(ex); + } +} + +void setDeviceContext(cl_device_id dev, cl_context ctx) +{ + // FIXME: add OpenGL Interop for user provided contexts later + try { + DeviceManager& devMngr = DeviceManager::getInstance(); + const int dCount = devMngr.mDevices.size(); + for (int i=0; ioperator()()==dev && + devMngr.mContexts[i]->operator()()==ctx) { + setDevice(i); + return; + } + } + } catch (const cl::Error &ex) { + CL_TO_AF_ERROR(ex); + } + AF_ERROR("No matching device found", AF_ERR_ARG); +} + +void removeDeviceContext(cl_device_id dev, cl_context ctx) +{ + try { + if (getDevice()() == dev && getContext()()==ctx) { + AF_ERROR("Cannot pop the device currently in use", AF_ERR_ARG); + } + + DeviceManager& devMngr = DeviceManager::getInstance(); + const int dCount = devMngr.mDevices.size(); + int deleteIdx = -1; + for (int i = 0; 
ioperator()()==dev && + devMngr.mContexts[i]->operator()()==ctx) { + deleteIdx = i; + break; + } + } + if (deleteIdx < (int)devMngr.mUserDeviceOffset) { + AF_ERROR("Cannot pop ArrayFire internal devices", AF_ERR_ARG); + } else if (deleteIdx == -1) { + AF_ERROR("No matching device found", AF_ERR_ARG); + } else { + // FIXME: this case can potentially cause issues due to the + // modification of the device pool stl containers. + + // IF the current active device is enumerated at a position + // that lies ahead of the device that has been requested + // to be removed. We just pop the entries from pool since it + // has no side effects. + devMngr.mDevices.erase(devMngr.mDevices.begin()+deleteIdx); + devMngr.mContexts.erase(devMngr.mContexts.begin()+deleteIdx); + devMngr.mQueues.erase(devMngr.mQueues.begin()+deleteIdx); + devMngr.mPlatforms.erase(devMngr.mPlatforms.begin()+deleteIdx); + // FIXME: add OpenGL Interop for user provided contexts later + devMngr.mIsGLSharingOn.erase(devMngr.mIsGLSharingOn.begin()+deleteIdx); + // OTHERWISE, update(decrement) the `mActive*Id` variables + if (deleteIdx < (int)devMngr.mActiveCtxId) { + --devMngr.mActiveCtxId; + --devMngr.mActiveQId; + } + } + } catch (const cl::Error &ex) { + CL_TO_AF_ERROR(ex); + } +} + +bool synchronize_calls() { + static bool sync = getEnvVar("AF_SYNCHRONOUS_CALLS") == "1"; + return sync; +} + + +unsigned getMaxJitSize() +{ + const int MAX_JIT_LEN = 20; + const int MAX_JIT_LEN_AMD = 16; //FIXME: Change this when bug is fixed + + static int length = 0; + if (length == 0) { + std::string env_var = getEnvVar("AF_OPENCL_MAX_JIT_LEN"); + if (!env_var.empty()) { + length = std::stoi(env_var); + } else { + length = MAX_JIT_LEN; + } + } + + if (getActivePlatform() == AFCL_PLATFORM_AMD) { + return std::min(length, MAX_JIT_LEN_AMD); + } + return length; +} + } using namespace opencl; +af_err afcl_get_device_type(afcl_device_type *res) +{ + *res = (afcl_device_type)getActiveDeviceType(); + return AF_SUCCESS; +} + 
+af_err afcl_get_platform(afcl_platform *res) +{ + *res = (afcl_platform)getActivePlatform(); + return AF_SUCCESS; +} + af_err afcl_get_context(cl_context *ctx, const bool retain) { *ctx = getContext()(); @@ -508,3 +851,21 @@ af_err afcl_set_device_id(cl_device_id id) setDevice(getDeviceIdFromNativeId(id)); return AF_SUCCESS; } + +af_err afcl_add_device_context(cl_device_id dev, cl_context ctx, cl_command_queue que) +{ + addDeviceContext(dev, ctx, que); + return AF_SUCCESS; +} + +af_err afcl_set_device_context(cl_device_id dev, cl_context ctx) +{ + setDeviceContext(dev, ctx); + return AF_SUCCESS; +} + +af_err afcl_delete_device_context(cl_device_id dev, cl_context ctx) +{ + removeDeviceContext(dev, ctx); + return AF_SUCCESS; +} diff --git a/src/backend/opencl/platform.hpp b/src/backend/opencl/platform.hpp index 90f57aed39..42579f89d1 100644 --- a/src/backend/opencl/platform.hpp +++ b/src/backend/opencl/platform.hpp @@ -21,7 +21,7 @@ namespace opencl class DeviceManager { - friend std::string getInfo(); + friend std::string getDeviceInfo(); friend int getDeviceCount(); @@ -33,7 +33,9 @@ class DeviceManager friend cl::CommandQueue& getQueue(); - friend const cl::Device& getDevice(); + friend const cl::Device& getDevice(int id); + + friend size_t getDeviceMemorySize(int device); friend bool isGLSharingSupported(); @@ -43,8 +45,17 @@ class DeviceManager friend int setDevice(int device); + friend void addDeviceContext(cl_device_id dev, cl_context cxt, cl_command_queue que); + + friend void setDeviceContext(cl_device_id dev, cl_context cxt); + + friend void removeDeviceContext(cl_device_id dev, cl_context ctx); + + friend int getActiveDeviceType(); + friend int getActivePlatform(); + public: - static const unsigned MAX_DEVICES = 16; + static const unsigned MAX_DEVICES = 32; static DeviceManager& getInstance(); @@ -67,12 +78,13 @@ class DeviceManager private: // Attributes - std::vector mQueues; std::vector mDevices; std::vector mContexts; - std::vector mPlatforms; - 
std::vector mCtxOffsets; + std::vector mQueues; std::vector mIsGLSharingOn; + std::vector mDeviceTypes; + std::vector mPlatforms; + unsigned mUserDeviceOffset; unsigned mActiveCtxId; unsigned mActiveQId; @@ -80,17 +92,29 @@ class DeviceManager int getBackend(); -std::string getInfo(); +std::string getDeviceInfo(); int getDeviceCount(); int getActiveDeviceId(); +unsigned getMaxJitSize(); + const cl::Context& getContext(); cl::CommandQueue& getQueue(); -const cl::Device& getDevice(); +const cl::Device& getDevice(int id = -1); + +size_t getDeviceMemorySize(int device); + +size_t getHostMemorySize(); + +cl_device_type getDeviceType(); + +bool isHostUnifiedMemory(const cl::Device &device); + +bool OpenCLCPUOffload(bool forceOffloadOSX = true); bool isGLSharingSupported(); @@ -102,6 +126,17 @@ std::string getPlatformName(const cl::Device &device); int setDevice(int device); +void addDeviceContext(cl_device_id dev, cl_context cxt, cl_command_queue que); + +void setDeviceContext(cl_device_id dev, cl_context cxt); + +void removeDeviceContext(cl_device_id dev, cl_context ctx); + void sync(int device); +bool synchronize_calls(); + +int getActiveDeviceType(); +int getActivePlatform(); + } diff --git a/src/backend/opencl/program.hpp b/src/backend/opencl/program.hpp index 1b76a75ce8..6a2af45131 100644 --- a/src/backend/opencl/program.hpp +++ b/src/backend/opencl/program.hpp @@ -9,6 +9,7 @@ #pragma once #include +#include #include #include @@ -35,8 +36,8 @@ using std::string; #if defined(NDEBUG) #define SHOW_BUILD_INFO(PROG) do { \ - const char *info = getenv("AF_OPENCL_SHOW_BUILD_INFO"); \ - if (info != nullptr && std::strncmp(info,"0", 1) != 0) { \ + std::string info = getEnvVar("AF_OPENCL_SHOW_BUILD_INFO"); \ + if (!info.empty() && info != "0") { \ SHOW_DEBUG_BUILD_INFO(prog); \ } \ } while(0) diff --git a/src/backend/opencl/qr.cpp b/src/backend/opencl/qr.cpp index 9e30b43435..56101a8b97 100644 --- a/src/backend/opencl/qr.cpp +++ b/src/backend/opencl/qr.cpp @@ -9,16 +9,19 @@ 
#include #include +#include #include #include -#include -#include + +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) + #include #include #include #include - -#if defined(WITH_OPENCL_LINEAR_ALGEBRA) +#include +#include +#include namespace opencl { @@ -27,6 +30,10 @@ template void qr(Array &q, Array &r, Array &t, const Array &orig) { try { + if(OpenCLCPUOffload()) { + return cpu::qr(q, r, t, orig); + } + initBlas(); dim4 iDims = orig.dims(); int M = iDims[0]; @@ -81,6 +88,10 @@ template Array qr_inplace(Array &in) { try { + if(OpenCLCPUOffload()) { + return cpu::qr_inplace(in); + } + initBlas(); dim4 iDims = in.dims(); int M = iDims[0]; diff --git a/src/backend/opencl/set.cpp b/src/backend/opencl/set.cpp index 5604ff4ad9..c37b7c4c4e 100644 --- a/src/backend/opencl/set.cpp +++ b/src/backend/opencl/set.cpp @@ -53,7 +53,7 @@ namespace opencl compute::buffer out_data((*out.get())()); compute::buffer_iterator< type_t > begin(out_data, 0); - compute::buffer_iterator< type_t > end(out_data, out.dims()[0]); + compute::buffer_iterator< type_t > end(out_data, out.elements()); if (!is_sorted) { compute::sort(begin, end, queue); @@ -83,7 +83,7 @@ namespace opencl unique_second = setUnique(second, false); } - size_t out_size = unique_first.dims()[0] + unique_second.dims()[0]; + size_t out_size = unique_first.elements() + unique_second.elements(); Array out = createEmptyArray(dim4(out_size, 1, 1, 1)); compute::command_queue queue(getQueue()()); @@ -93,9 +93,9 @@ namespace opencl compute::buffer out_data((*out.get())()); compute::buffer_iterator< type_t > first_begin(first_data, 0); - compute::buffer_iterator< type_t > first_end(first_data, unique_first.dims()[0]); + compute::buffer_iterator< type_t > first_end(first_data, unique_first.elements()); compute::buffer_iterator< type_t > second_begin(second_data, 0); - compute::buffer_iterator< type_t > second_end(second_data, unique_second.dims()[0]); + compute::buffer_iterator< type_t > second_end(second_data, unique_second.elements()); 
compute::buffer_iterator< type_t > out_begin(out_data, 0); compute::buffer_iterator< type_t > out_end = compute::set_union( @@ -124,7 +124,7 @@ namespace opencl unique_second = setUnique(second, false); } - size_t out_size = std::max(unique_first.dims()[0], unique_second.dims()[0]); + size_t out_size = std::max(unique_first.elements(), unique_second.elements()); Array out = createEmptyArray(dim4(out_size, 1, 1, 1)); compute::command_queue queue(getQueue()()); @@ -134,9 +134,9 @@ namespace opencl compute::buffer out_data((*out.get())()); compute::buffer_iterator< type_t > first_begin(first_data, 0); - compute::buffer_iterator< type_t > first_end(first_data, unique_first.dims()[0]); + compute::buffer_iterator< type_t > first_end(first_data, unique_first.elements()); compute::buffer_iterator< type_t > second_begin(second_data, 0); - compute::buffer_iterator< type_t > second_end(second_data, unique_second.dims()[0]); + compute::buffer_iterator< type_t > second_end(second_data, unique_second.elements()); compute::buffer_iterator< type_t > out_begin(out_data, 0); compute::buffer_iterator< type_t > out_end = compute::set_intersection( diff --git a/src/backend/opencl/sift.cpp b/src/backend/opencl/sift.cpp index 5bd940d127..632647ca19 100644 --- a/src/backend/opencl/sift.cpp +++ b/src/backend/opencl/sift.cpp @@ -15,7 +15,7 @@ #include #include -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT #include #endif @@ -34,7 +34,7 @@ unsigned sift(Array& x_out, Array& y_out, Array& score_out, const float img_scale, const float feature_ratio, const bool compute_GLOH) { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT unsigned nfeat_out; unsigned desc_len; diff --git a/src/backend/opencl/solve.cpp b/src/backend/opencl/solve.cpp index 6d2bea4b4e..93176752b5 100644 --- a/src/backend/opencl/solve.cpp +++ b/src/backend/opencl/solve.cpp @@ -21,10 +21,14 @@ #include #include #include +#include #include #include +#include +#include + namespace opencl { @@ -32,6 +36,10 @@ template Array 
solveLU(const Array &A, const Array &pivot, const Array &b, const af_mat_prop options) { + if(OpenCLCPUOffload()) { + return cpu::solveLU(A, pivot, b, options); + } + int N = A.dims()[0]; int NRHS = b.dims()[1]; @@ -219,9 +227,7 @@ Array leastSquares(const Array &a, const Array &b) (*dT)(), tmp.getOffset() + NB * MN, NB, 0, queue); - - std::string pName = getPlatformName(getDevice()); - if(pName.find("NVIDIA") != std::string::npos) + if(getActivePlatform() == AFCL_PLATFORM_NVIDIA) { Array AT = transpose(A, true); cl::Buffer* AT_buf = AT.get(); @@ -261,8 +267,7 @@ Array triangleSolve(const Array &A, const Array &b, const af_mat_prop o cl_event event = 0; cl_command_queue queue = getQueue()(); - std::string pName = getPlatformName(getDevice()); - if(pName.find("NVIDIA") != std::string::npos && (options & AF_MAT_UPPER)) + if(getActivePlatform() == AFCL_PLATFORM_NVIDIA && (options & AF_MAT_UPPER)) { Array AT = transpose(A, true); @@ -296,6 +301,10 @@ template Array solve(const Array &a, const Array &b, const af_mat_prop options) { try { + if(OpenCLCPUOffload()) { + return cpu::solve(a, b, options); + } + initBlas(); if (options & AF_MAT_UPPER || diff --git a/src/backend/opencl/svd.cpp b/src/backend/opencl/svd.cpp index 77f7c8aa37..61da27bdcd 100644 --- a/src/backend/opencl/svd.cpp +++ b/src/backend/opencl/svd.cpp @@ -20,6 +20,8 @@ #include #include #include +#include +#include namespace opencl { @@ -196,6 +198,10 @@ void svd(Array &arrU, template void svdInPlace(Array &s, Array &u, Array &vt, Array &in) { + if(OpenCLCPUOffload()) { + return cpu::svdInPlace(s, u, vt, in); + } + initBlas(); svd(u, s, vt, in, true); } @@ -203,6 +209,10 @@ void svdInPlace(Array &s, Array &u, Array &vt, Array &in) template void svd(Array &s, Array &u, Array &vt, const Array &in) { + if(OpenCLCPUOffload()) { + return cpu::svd(s, u, vt, in); + } + dim4 iDims = in.dims(); int M = iDims[0]; int N = iDims[1]; diff --git a/src/backend/opencl/transform.cpp b/src/backend/opencl/transform.cpp index 
c8e2b69a8b..379fd2a5b7 100644 --- a/src/backend/opencl/transform.cpp +++ b/src/backend/opencl/transform.cpp @@ -18,46 +18,86 @@ namespace opencl { template Array transform(const Array &in, const Array &transform, - const af::dim4 &odims, - const af_interp_type method, const bool inverse) + const af::dim4 &odims, const af_interp_type method, + const bool inverse, const bool perspective) { Array out = createEmptyArray(odims); if(inverse) { - switch(method) { - case AF_INTERP_NEAREST: - kernel::transform - (out, in, transform); - break; - case AF_INTERP_BILINEAR: - kernel::transform - (out, in, transform); - break; - case AF_INTERP_LOWER: - kernel::transform - (out, in, transform); - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; + if (perspective) { + switch(method) { + case AF_INTERP_NEAREST: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_BILINEAR: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_LOWER: + kernel::transform + (out, in, transform); + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; + } + } else { + switch(method) { + case AF_INTERP_NEAREST: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_BILINEAR: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_LOWER: + kernel::transform + (out, in, transform); + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; + } } } else { - switch(method) { - case AF_INTERP_NEAREST: - kernel::transform - (out, in, transform); - break; - case AF_INTERP_BILINEAR: - kernel::transform - (out, in, transform); - break; - case AF_INTERP_LOWER: - kernel::transform - (out, in, transform); - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; + if (perspective) { + switch(method) { + case AF_INTERP_NEAREST: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_BILINEAR: + kernel::transform + (out, 
in, transform); + break; + case AF_INTERP_LOWER: + kernel::transform + (out, in, transform); + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; + } + } else { + switch(method) { + case AF_INTERP_NEAREST: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_BILINEAR: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_LOWER: + kernel::transform + (out, in, transform); + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; + } } } @@ -68,7 +108,7 @@ namespace opencl #define INSTANTIATE(T) \ template Array transform(const Array &in, const Array &transform, \ const af::dim4 &odims, const af_interp_type method, \ - const bool inverse); + const bool inverse, const bool perspective); INSTANTIATE(float) INSTANTIATE(double) diff --git a/src/backend/opencl/transform.hpp b/src/backend/opencl/transform.hpp index f0b4d4c955..064817a537 100644 --- a/src/backend/opencl/transform.hpp +++ b/src/backend/opencl/transform.hpp @@ -14,5 +14,5 @@ namespace opencl { template Array transform(const Array &in, const Array &tf, const af::dim4 &odims, - const af_interp_type method, const bool inverse); + const af_interp_type method, const bool inverse, const bool perspective); } diff --git a/src/backend/opencl/unary.hpp b/src/backend/opencl/unary.hpp index 5a2cc9e33f..1e363d7dcb 100644 --- a/src/backend/opencl/unary.hpp +++ b/src/backend/opencl/unary.hpp @@ -16,7 +16,7 @@ namespace opencl { template -static const char *unaryName() { return "noop"; } +static const char *unaryName() { return "__noop"; } #define UNARY_DECL(OP, FNAME) \ template<> STATIC_ \ diff --git a/src/backend/util.cpp b/src/backend/util.cpp new file mode 100644 index 0000000000..7c4cd2e614 --- /dev/null +++ b/src/backend/util.cpp @@ -0,0 +1,37 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +/// This file contains platform independent utility functions +#include +#include + +#if defined(OS_WIN) +#include +#endif + +using std::string; + +string getEnvVar(const std::string &key) +{ +#if defined(OS_WIN) + DWORD bufSize = 32767; // limit according to GetEnvironment Variable documentation + string retVal; + retVal.resize(bufSize); + bufSize = GetEnvironmentVariable(key.c_str(), &retVal[0], bufSize); + if (!bufSize) { + return string(""); + } else { + retVal.resize(bufSize); + return retVal; + } +#else + char * str = getenv(key.c_str()); + return str==NULL ? string("") : string(str); +#endif +} diff --git a/src/backend/util.hpp b/src/backend/util.hpp new file mode 100644 index 0000000000..e1cd85a69c --- /dev/null +++ b/src/backend/util.hpp @@ -0,0 +1,16 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +/// This file contains platform independent utility functions + +#include + +#pragma once + +std::string getEnvVar(const std::string &key); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 44192eda3a..5db23714d3 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -6,6 +6,8 @@ SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules") FIND_PACKAGE(CUDA QUIET) FIND_PACKAGE(OpenCL QUIET) +OPTION(BUILD_SINGLE_TEST_FILE "Build tests in a single file" OFF) + # If the tests are not being built at the same time as ArrayFire, # we need to first find the ArrayFire library IF(TARGET afcpu OR TARGET afcuda OR TARGET afopencl OR TARGET af) @@ -18,10 +20,28 @@ ELSE() FIND_PACKAGE(ArrayFire REQUIRED) INCLUDE_DIRECTORIES(${ArrayFire_INCLUDE_DIRS}) OPTION(BUILD_NONFREE "Build Tests for nonfree algorithms" OFF) - IF(${BUILD_NONFREE}) # Add definition. Not required when building with AF - ADD_DEFINITIONS(-DAF_BUILD_SIFT) + + IF(${BUILD_NONFREE}) + MESSAGE(WARNING "Building With NONFREE ON requires the following patents") + SET(BUILD_NONFREE_SIFT ON CACHE BOOL "Build ArrayFire with SIFT") + ELSE(${BUILD_NONFREE}) + UNSET(BUILD_NONFREE_SIFT CACHE) # BUILD_NONFREE_SIFT cannot be built without BUILD_NONFREE ENDIF(${BUILD_NONFREE}) + IF(${BUILD_NONFREE_SIFT}) + ADD_DEFINITIONS(-DAF_BUILD_NONFREE_SIFT) + + MESSAGE(WARNING "Building with SIFT requires the following patents") + + MESSAGE("Method and apparatus for identifying scale invariant features" + "in an image and use of same for locating an object in an image,\" David" + "G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application" + "filed March 8, 1999. Asignee: The University of British Columbia. 
For" + "further details, contact David Lowe (lowe@cs.ubc.ca) or the" + "University-Industry Liaison Office of the University of British" + "Columbia.") + ENDIF(${BUILD_NONFREE_SIFT}) + # ENABLE_TESTING is required when building only tests # When building from source, enable_testing is picked from from the main # CMakeLists.txt @@ -40,14 +60,36 @@ MACRO(CREATE_TESTS BACKEND AFLIBNAME GTEST_LIBS OTHER_LIBS) SET(TEST_FILES ${FILES}) ENDIF(${BACKEND} STREQUAL "unified") - FOREACH(FILE ${TEST_FILES}) + IF (${BUILD_SINGLE_TEST_FILE}) + SET(TEST_NAME test_${BACKEND}) + SET(TEST_NAME_BASIC test_basic_${BACKEND}) + ADD_EXECUTABLE(${TEST_NAME} ${CPP_FILES}) + ADD_EXECUTABLE(${TEST_NAME_BASIC} basic_c.c) + + TARGET_LINK_LIBRARIES(${TEST_NAME} PRIVATE ${AFLIBNAME} + PRIVATE ${THREAD_LIB_FLAG} + PRIVATE ${GTEST_LIBS} + PRIVATE ${OTHER_LIBS}) + + TARGET_LINK_LIBRARIES(${TEST_NAME_BASIC} PRIVATE ${AFLIBNAME} + PRIVATE ${THREAD_LIB_FLAG} + PRIVATE ${GTEST_LIBS} + PRIVATE ${OTHER_LIBS}) + + SET_TARGET_PROPERTIES(${TEST_NAME_BASIC} + PROPERTIES + COMPILE_FLAGS -DAF_${DEF_NAME} + FOLDER "Tests/${BACKEND}") + + ELSE() + FOREACH(FILE ${TEST_FILES}) GET_FILENAME_COMPONENT(FNAME ${FILE} NAME_WE) SET(TEST_NAME ${FNAME}_${BACKEND}) IF(NOT ${BUILD_NONFREE} AND "${FILE}" MATCHES ".nonfree.") - MESSAGE(STATUS "Removing ${FILE} from ctest") + MESSAGE(STATUS "Removing ${FILE} from ctest") ELSEIF("${FILE}" MATCHES ".manual.") - MESSAGE(STATUS "Removing ${FILE} from ctest") + MESSAGE(STATUS "Removing ${FILE} from ctest") ELSE() ADD_TEST(Test_${TEST_NAME} ${TEST_NAME}) ENDIF() @@ -55,18 +97,27 @@ MACRO(CREATE_TESTS BACKEND AFLIBNAME GTEST_LIBS OTHER_LIBS) FILE(GLOB TEST_FILE "${FNAME}.cpp" "${FNAME}.c") ADD_EXECUTABLE(${TEST_NAME} ${TEST_FILE}) TARGET_LINK_LIBRARIES(${TEST_NAME} PRIVATE ${AFLIBNAME} - PRIVATE ${THREAD_LIB_FLAG} - PRIVATE ${GTEST_LIBS} - PRIVATE ${OTHER_LIBS}) + PRIVATE ${THREAD_LIB_FLAG} + PRIVATE ${GTEST_LIBS} + PRIVATE ${OTHER_LIBS}) SET_TARGET_PROPERTIES(${TEST_NAME} - 
PROPERTIES - COMPILE_FLAGS -DAF_${DEF_NAME} - FOLDER "Tests/${BACKEND}") - ENDFOREACH() + PROPERTIES + COMPILE_FLAGS -DAF_${DEF_NAME} + FOLDER "Tests/${BACKEND}") + ENDFOREACH() + ENDIF() ENDMACRO(CREATE_TESTS) +MACRO(CHECK_AND_CREATE_TESTS BACKEND AFLIBNAME GTEST_LIBS OTHER_LIBS) + STRING(TOUPPER ${BACKEND} BACKEND_NAME_UPPER) + MESSAGE(STATUS "TESTS: ${BACKEND_NAME_UPPER} backend is ${BUILD_${BACKEND_NAME_UPPER}}.") + IF(${BUILD_${BACKEND_NAME_UPPER}}) + CREATE_TESTS(${BACKEND} ${AFLIBNAME} "${GTEST_LIBS}" "${OTHER_LIBS}") + ENDIF() +ENDMACRO(CHECK_AND_CREATE_TESTS) + FIND_PACKAGE(Threads REQUIRED) IF(CMAKE_USE_PTHREADS_INIT AND NOT "${APPLE}") SET(THREAD_LIB_FLAG "-pthread") @@ -118,19 +169,19 @@ INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIRS}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) FILE(GLOB FILES "*.cpp" "*.c") +FILE(GLOB CPP_FILES "*.cpp") LIST(SORT FILES) # Tests execute in alphabetical order -# We only build info.cpp and backend.cpp for Unified backend -SET(UNIFIED_FILES "backend.cpp;info.cpp") +# We only build backend.cpp for Unified backend +SET(UNIFIED_FILES "backend.cpp;main.cpp") LIST(SORT UNIFIED_FILES) # Tests execute in alphabetical order # Next we build each example using every backend. IF(${ArrayFire_CPU_FOUND}) # variable defined by FIND(ArrayFire ...) - MESSAGE(STATUS "TESTS: CPU backend is ON.") - CREATE_TESTS(cpu ${ArrayFire_CPU_LIBRARIES} "${GTEST_LIBRARIES}" "") + OPTION(BUILD_CPU "Build ArrayFire Tests for CPU backend" ON) + CHECK_AND_CREATE_TESTS(cpu ${ArrayFire_CPU_LIBRARIES} "${GTEST_LIBRARIES}" "") ELSEIF(TARGET afcpu) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "TESTS: CPU backend is ON.") - CREATE_TESTS(cpu afcpu "${GTEST_LIBRARIES}" "") + CHECK_AND_CREATE_TESTS(cpu afcpu "${GTEST_LIBRARIES}" "") ELSE() MESSAGE(STATUS "TESTS: CPU backend is OFF. 
afcpu was not found.") ENDIF() @@ -144,10 +195,11 @@ IF (${CUDA_FOUND}) PATHS ${CUDA_TOOLKIT_ROOT_DIR} DOC "CUDA NVVM Library" ) - MESSAGE(STATUS "TESTS: CUDA backend is ON.") + MARK_AS_ADVANCED(CUDA_NVVM_LIBRARY) # If OSX && CLANG && CUDA < 7 IF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7) - CREATE_TESTS(cuda ${ArrayFire_CUDA_LIBRARIES} "${GTEST_LIBRARIES_STDLIB}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") + OPTION(BUILD_CUDA "Build ArrayFire Tests for CUDA backend" ON) + CHECK_AND_CREATE_TESTS(cuda ${ArrayFire_CUDA_LIBRARIES} "${GTEST_LIBRARIES_STDLIB}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") FOREACH(FILE ${FILES}) GET_FILENAME_COMPONENT(FNAME ${FILE} NAME_WE) @@ -158,15 +210,15 @@ IF (${CUDA_FOUND}) # ELSE OSX && CLANG && CUDA < 7 ELSE("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7) - CREATE_TESTS(cuda ${ArrayFire_CUDA_LIBRARIES} "${GTEST_LIBRARIES}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") + OPTION(BUILD_CUDA "Build ArrayFire Tests for CUDA backend" ON) + CHECK_AND_CREATE_TESTS(cuda ${ArrayFire_CUDA_LIBRARIES} "${GTEST_LIBRARIES}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") ENDIF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7) ELSEIF(TARGET afcuda) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "TESTS: CUDA backend is ON.") # If OSX && CLANG && CUDA < 7 IF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7) - CREATE_TESTS(cuda afcuda "${GTEST_LIBRARIES_STDLIB}" 
"${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") + CHECK_AND_CREATE_TESTS(cuda afcuda "${GTEST_LIBRARIES_STDLIB}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") FOREACH(FILE ${FILES}) GET_FILENAME_COMPONENT(FNAME ${FILE} NAME_WE) @@ -177,7 +229,7 @@ IF (${CUDA_FOUND}) # ELSE OSX && CLANG && CUDA < 7 ELSE("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7) - CREATE_TESTS(cuda afcuda "${GTEST_LIBRARIES}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") + CHECK_AND_CREATE_TESTS(cuda afcuda "${GTEST_LIBRARIES}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") ENDIF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7) ELSE() @@ -189,12 +241,13 @@ ENDIF() # OpenCL Backend IF (${OpenCL_FOUND}) + INCLUDE_DIRECTORIES(${OpenCL_INCLUDE_DIRS}) IF(${ArrayFire_OpenCL_FOUND}) # variable defined by FIND(ArrayFire ...) - MESSAGE(STATUS "TESTS: OpenCL backend is ON.") - CREATE_TESTS(opencl ${ArrayFire_OpenCL_LIBRARIES} "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}") + OPTION(BUILD_OPENCL "Build ArrayFire Tests for OpenCL backend" ON) + MESSAGE(${OpenCL_LIBRARIES}) + CHECK_AND_CREATE_TESTS(opencl ${ArrayFire_OpenCL_LIBRARIES} "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}") ELSEIF(TARGET afopencl) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "TESTS: OpenCL backend is ON.") - CREATE_TESTS(opencl afopencl "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}") + CHECK_AND_CREATE_TESTS(opencl afopencl "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}") ELSE() MESSAGE(STATUS "TESTS: OpenCL backend is OFF. 
afopencl was not found") ENDIF() @@ -204,11 +257,10 @@ ENDIF() # Unified Backend IF(${ArrayFire_Unified_FOUND}) # variable defined by FIND(ArrayFire ...) - MESSAGE(STATUS "TESTS: UNIFIED backend is ON.") - CREATE_TESTS(unified ${ArrayFire_Unified_LIBRARIES} "${GTEST_LIBRARIES}" "${CMAKE_DL_LIBS}") + OPTION(BUILD_UNIFIED "Build ArrayFire Tests for Unified backend" ON) + CHECK_AND_CREATE_TESTS(unified ${ArrayFire_Unified_LIBRARIES} "${GTEST_LIBRARIES}" "${CMAKE_DL_LIBS}") ELSEIF(TARGET af) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "TESTS: UNIFIED backend is ON.") - CREATE_TESTS(unified af "${GTEST_LIBRARIES}" "${CMAKE_DL_LIBS}") + CHECK_AND_CREATE_TESTS(unified af "${GTEST_LIBRARIES}" "${CMAKE_DL_LIBS}") ELSE() MESSAGE(STATUS "TESTS: UNIFIED backend is OFF. af was not found.") ENDIF() diff --git a/test/approx1.cpp b/test/approx1.cpp index 7a6b66fce8..e7ea94e51e 100644 --- a/test/approx1.cpp +++ b/test/approx1.cpp @@ -23,6 +23,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/approx2.cpp b/test/approx2.cpp index f1a1accc51..75a650631b 100644 --- a/test/approx2.cpp +++ b/test/approx2.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/array.cpp b/test/array.cpp index 6c1f511410..293b888a8f 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -454,3 +454,34 @@ TEST(Device, unequal) ASSERT_EQ(ptr, b.device()); } } + +TEST(DeviceId, Same) +{ + array a = randu(5,5); + ASSERT_EQ(getDevice(), getDeviceId(a)); +} + +TEST(DeviceId, Different) +{ + int ndevices = getDeviceCount(); + if (ndevices < 2) return; + + int id0 = getDevice(); + int id1 = (id0 + 1) % ndevices; + + array a = randu(5,5); + ASSERT_EQ(getDeviceId(a), id0); + setDevice(id1); + + array b = randu(5,5); + + ASSERT_EQ(getDeviceId(a), id0); + ASSERT_EQ(getDeviceId(b), id1); + 
ASSERT_NE(getDevice(), getDeviceId(a)); + ASSERT_EQ(getDevice(), getDeviceId(b)); + + af_array c; + af_err err = af_matmul(&c, a.get(), b.get(), AF_MAT_NONE, AF_MAT_NONE); + ASSERT_EQ(err, AF_ERR_DEVICE); + setDevice(id0); +} diff --git a/test/backend.cpp b/test/backend.cpp index 59b8fd5129..78b64309db 100644 --- a/test/backend.cpp +++ b/test/backend.cpp @@ -21,14 +21,35 @@ using std::string; using std::vector; +const char *getActiveBackendString(af_backend active) +{ + switch(active) { + case AF_BACKEND_CPU : return "AF_BACKEND_CPU"; + case AF_BACKEND_CUDA : return "AF_BACKEND_CUDA"; + case AF_BACKEND_OPENCL: return "AF_BACKEND_OPENCL"; + default : return "AF_BACKEND_DEFAULT"; + } +} + template void testFunction() { af_info(); + af_backend activeBackend = (af_backend)0; + af_get_active_backend(&activeBackend); + + printf("Active Backend Enum = %s\n", getActiveBackendString(activeBackend)); + af_array outArray = 0; dim_t dims[] = {32, 32}; ASSERT_EQ(AF_SUCCESS, af_randu(&outArray, 2, dims, (af_dtype) af::dtype_traits::af_type)); + + // Verify backends returned by array and by function are the same + af_backend arrayBackend = (af_backend)0; + af_get_backend_id(&arrayBackend, outArray); + ASSERT_EQ(arrayBackend, activeBackend); + // cleanup if(outArray != 0) ASSERT_EQ(AF_SUCCESS, af_release_array(outArray)); } @@ -37,10 +58,15 @@ void backendTest() { int backends = af::getAvailableBackends(); + ASSERT_NE(backends, 0); + bool cpu = backends & AF_BACKEND_CPU; bool cuda = backends & AF_BACKEND_CUDA; bool opencl = backends & AF_BACKEND_OPENCL; + printf("\nRunning Default Backend...\n"); + testFunction(); + if(cpu) { printf("\nRunning CPU Backend...\n"); af::setBackend(AF_BACKEND_CPU); diff --git a/test/basic_c.c b/test/basic_c.c index f6c731092a..aac34e142d 100644 --- a/test/basic_c.c +++ b/test/basic_c.c @@ -9,9 +9,11 @@ #include -int main() { +int main() +{ af_array out = 0; dim_t s[] = {10, 10, 1, 1}; af_err e = af_randu(&out, 4, s, f32); + if(out != 0) 
af_release_array(out); return (AF_SUCCESS != e); } diff --git a/test/bilateral.cpp b/test/bilateral.cpp index f0825e4893..cde330dca4 100644 --- a/test/bilateral.cpp +++ b/test/bilateral.cpp @@ -18,6 +18,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; template diff --git a/test/binary.cpp b/test/binary.cpp index 477748792f..91ebcbc8b2 100644 --- a/test/binary.cpp +++ b/test/binary.cpp @@ -14,6 +14,7 @@ #include using namespace std; +using std::abs; using namespace af; const int num = 10000; diff --git a/test/cholesky_dense.cpp b/test/cholesky_dense.cpp index 70548d898c..7fd238d215 100644 --- a/test/cholesky_dense.cpp +++ b/test/cholesky_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/convolve.cpp b/test/convolve.cpp index f3ff9fd6ef..fff5ebffea 100644 --- a/test/convolve.cpp +++ b/test/convolve.cpp @@ -17,6 +17,7 @@ using std::vector; using std::string; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/data b/test/data index db4f6e8062..cec85080f1 160000 --- a/test/data +++ b/test/data @@ -1 +1 @@ -Subproject commit db4f6e80629fb41580ab93208db6b8be958871df +Subproject commit cec85080f12c25486d025d1fb1cf69e1beb03e58 diff --git a/test/diagonal.cpp b/test/diagonal.cpp index c88f0fbeb1..c4becab2dc 100644 --- a/test/diagonal.cpp +++ b/test/diagonal.cpp @@ -14,6 +14,7 @@ using namespace af; using std::vector; +using std::abs; template class Diagonal : public ::testing::Test diff --git a/test/dot.cpp b/test/dot.cpp index a25f59f27e..58cfbb2ed6 100644 --- a/test/dot.cpp +++ b/test/dot.cpp @@ -18,6 +18,7 @@ using std::vector; using std::string; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/fast.cpp b/test/fast.cpp index a114a8fdc6..8cb90574a6 100644 --- a/test/fast.cpp +++ b/test/fast.cpp @@ -20,6 +20,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; 
typedef struct @@ -27,7 +28,7 @@ typedef struct float f[5]; } feat_t; -bool feat_cmp(feat_t i, feat_t j) +static bool feat_cmp(feat_t i, feat_t j) { for (int k = 0; k < 5; k++) if (i.f[k] != j.f[k]) @@ -36,7 +37,7 @@ bool feat_cmp(feat_t i, feat_t j) return false; } -void array_to_feat(vector& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat) +static void array_to_feat(vector& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat) { feat.resize(nfeat); for (unsigned i = 0; i < feat.size(); i++) { diff --git a/test/fft.cpp b/test/fft.cpp index 84f0e2382e..48ff865d2a 100644 --- a/test/fft.cpp +++ b/test/fft.cpp @@ -18,6 +18,7 @@ using std::string; using std::vector; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/fft_real.cpp b/test/fft_real.cpp index c8d9a55ff0..8cd6612712 100644 --- a/test/fft_real.cpp +++ b/test/fft_real.cpp @@ -18,6 +18,7 @@ using std::string; using std::vector; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/fftconvolve.cpp b/test/fftconvolve.cpp index cd82ab20d9..ec6a3f3279 100644 --- a/test/fftconvolve.cpp +++ b/test/fftconvolve.cpp @@ -17,6 +17,7 @@ using std::vector; using std::string; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/getting_started.cpp b/test/getting_started.cpp index 12d0b6b1de..9d77af2b30 100644 --- a/test/getting_started.cpp +++ b/test/getting_started.cpp @@ -15,6 +15,7 @@ using namespace af; using std::vector; +using std::abs; TEST(GettingStarted, SNIPPET_getting_started_gen) { diff --git a/test/gloh_nonfree.cpp b/test/gloh_nonfree.cpp index a65d52ad43..5794051152 100644 --- a/test/gloh_nonfree.cpp +++ b/test/gloh_nonfree.cpp @@ -20,6 +20,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; typedef struct @@ -38,7 +39,8 @@ typedef struct float d[272]; } desc_t; -bool feat_cmp(feat_desc_t i, feat_desc_t j) +#ifdef AF_BUILD_NONFREE_SIFT +static bool 
feat_cmp(feat_desc_t i, feat_desc_t j) { for (int k = 0; k < 5; k++) if (round(i.f[k]*1e1f) != round(j.f[k]*1e1f)) @@ -47,7 +49,7 @@ bool feat_cmp(feat_desc_t i, feat_desc_t j) return true; } -void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat) +static void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat) { feat.resize(nfeat); for (size_t i = 0; i < feat.size(); i++) { @@ -61,7 +63,7 @@ void array_to_feat_desc(vector& feat, float* x, float* y, float* sc } } -void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, vector >& desc, unsigned nfeat) +static void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, vector >& desc, unsigned nfeat) { feat.resize(nfeat); for (size_t i = 0; i < feat.size(); i++) { @@ -75,7 +77,7 @@ void array_to_feat_desc(vector& feat, float* x, float* y, float* sc } } -void array_to_feat(vector& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat) +static void array_to_feat(vector& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat) { feat.resize(nfeat); for (unsigned i = 0; i < feat.size(); i++) { @@ -87,7 +89,7 @@ void array_to_feat(vector& feat, float *x, float *y, float *score, float } } -void split_feat_desc(vector& fd, vector& f, vector& d) +static void split_feat_desc(vector& fd, vector& f, vector& d) { f.resize(fd.size()); d.resize(fd.size()); @@ -102,7 +104,7 @@ void split_feat_desc(vector& fd, vector& f, vector& } } -unsigned popcount(unsigned x) +static unsigned popcount(unsigned x) { x = x - ((x >> 1) & 0x55555555); x = (x & 0x33333333) + ((x >> 2) & 0x33333333); @@ -112,7 +114,7 @@ unsigned popcount(unsigned x) return x & 0x0000003F; } -bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 
1.f) +static bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f) { bool ret = true; float sum = 0.0f; @@ -142,6 +144,7 @@ bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float return ret; } +#endif template class GLOH : public ::testing::Test @@ -157,7 +160,7 @@ TYPED_TEST_CASE(GLOH, TestTypes); template void glohTest(string pTestFile) { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT if (noDoubleTests()) return; if (noImageIOTests()) return; @@ -269,7 +272,7 @@ void glohTest(string pTestFile) // TEST(GLOH, CPP) { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT if (noDoubleTests()) return; if (noImageIOTests()) return; diff --git a/test/harris.cpp b/test/harris.cpp index 276a3e357f..0adde6f95d 100644 --- a/test/harris.cpp +++ b/test/harris.cpp @@ -20,6 +20,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; typedef struct @@ -27,7 +28,7 @@ typedef struct float f[5]; } feat_t; -bool feat_cmp(feat_t i, feat_t j) +static bool feat_cmp(feat_t i, feat_t j) { for (int k = 0; k < 5; k++) if (i.f[k] != j.f[k]) @@ -36,7 +37,7 @@ bool feat_cmp(feat_t i, feat_t j) return false; } -void array_to_feat(vector& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat) +static void array_to_feat(vector& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat) { feat.resize(nfeat); for (unsigned i = 0; i < feat.size(); i++) { diff --git a/test/histogram.cpp b/test/histogram.cpp index f1d7af51b9..c83ba0464f 100644 --- a/test/histogram.cpp +++ b/test/histogram.cpp @@ -18,6 +18,7 @@ using std::string; using std::vector; +using std::abs; template class Histogram : public ::testing::Test diff --git a/test/homography.cpp b/test/homography.cpp index 662b7a2a56..1bd24425be 100644 --- a/test/homography.cpp +++ b/test/homography.cpp @@ -20,6 +20,7 @@ using std::string; using std::vector; +using std::abs; 
using af::dim4; template diff --git a/test/imageio.cpp b/test/imageio.cpp index d19aac346c..4029de5a1b 100644 --- a/test/imageio.cpp +++ b/test/imageio.cpp @@ -36,8 +36,6 @@ typedef ::testing::Types TestTypes; // register the type list TYPED_TEST_CASE(ImageIO, TestTypes); -// Disable tests if FreeImage is not found -#if defined(WITH_FREEIMAGE) void loadImageTest(string pTestFile, string pImageFile, const bool isColor) { if (noDoubleTests()) return; @@ -251,4 +249,140 @@ TEST(ImageMem, SaveMemBMP) af::deleteImageMem(savedMem); } -#endif // WITH_FREEIMAGE +TEST(ImageIO, LoadImage16CPP) +{ + if (noImageIOTests()) return; + + vector numDims; + + vector > in; + vector > tests; + readTests(string(TEST_DIR"/imageio/color_seq_16.test"),numDims,in,tests); + + af::dim4 dims = numDims[0]; + + af::array img = af::loadImage(string(TEST_DIR"/imageio/color_seq_16.png").c_str(), true); + ASSERT_EQ(img.type(), f32); // loadImage should always return float + + // Get result + float *imgData = new float[dims.elements()]; + img.host((void*)imgData); + + // Compare result + size_t nElems = in[0].size(); + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(in[0][elIter], imgData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] imgData; +} + +TEST(ImageIO, SaveImage16CPP) +{ + if (noImageIOTests()) return; + + af::dim4 dims(16, 24, 3); + + af::array input = af::randu(dims, u16); + af::array input_255 = (input / 257).as(u16); + + af::saveImage("saveImage16CPP.png", input); + + af::array img = af::loadImage("saveImage16CPP.png", true); + ASSERT_EQ(img.type(), f32); // loadImage should always return float + + ASSERT_FALSE(af::anyTrue(abs(img - input_255))); +} + +//////////////////////////////////////////////////////////////////////////////// +// Image IO Native Tests +//////////////////////////////////////////////////////////////////////////////// + +template +void loadImageNativeCPPTest(string pTestFile, string pImageFile) +{ + if (noImageIOTests()) 
return; + + vector numDims; + + vector > in; + vector > tests; + readTests(pTestFile,numDims,in,tests); + + af::dim4 dims = numDims[0]; + af::array img = af::loadImageNative(pImageFile.c_str()); + ASSERT_EQ(img.type(), (af_dtype)af::dtype_traits::af_type); + + // Get result + T *imgData = new T[dims.elements()]; + img.host((void*)imgData); + + // Compare result + size_t nElems = in[0].size(); + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(in[0][elIter], imgData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] imgData; +} + +TEST(ImageIONative, LoadImageNative8CPP) +{ + loadImageNativeCPPTest(string(TEST_DIR"/imageio/color_small.test"), + string(TEST_DIR"/imageio/color_small.png")); +} + +TEST(ImageIONative, LoadImageNative16SmallCPP) +{ + loadImageNativeCPPTest(string(TEST_DIR"/imageio/color_small_16.test"), + string(TEST_DIR"/imageio/color_small_16.png")); +} + +TEST(ImageIONative, LoadImageNative16ColorCPP) +{ + loadImageNativeCPPTest(string(TEST_DIR"/imageio/color_seq_16.test"), + string(TEST_DIR"/imageio/color_seq_16.png")); +} + +TEST(ImageIONative, LoadImageNative16GrayCPP) +{ + loadImageNativeCPPTest(string(TEST_DIR"/imageio/gray_seq_16.test"), + string(TEST_DIR"/imageio/gray_seq_16.png")); +} + +template +void saveLoadImageNativeCPPTest(af::dim4 dims) +{ + if (noImageIOTests()) return; + + af::array input = af::randu(dims, (af_dtype)af::dtype_traits::af_type); + + af::saveImageNative("saveImageNative.png", input); + + af::array loaded = af::loadImageNative("saveImageNative.png"); + ASSERT_EQ(loaded.type(), input.type()); + + ASSERT_FALSE(af::anyTrue(input - loaded)); +} + +TEST(ImageIONative, SaveLoadImageNative8CPP) +{ + saveLoadImageNativeCPPTest(af::dim4(480, 720, 3, 1)); +} + +TEST(ImageIONative, SaveLoadImageNative16SmallCPP) +{ + saveLoadImageNativeCPPTest(af::dim4(8, 12, 3, 1)); +} + +TEST(ImageIONative, SaveLoadImageNative16ColorCPP) +{ + saveLoadImageNativeCPPTest(af::dim4(480, 720, 3, 1)); +} + 
+TEST(ImageIONative, SaveLoadImageNative16GrayCPP) +{ + saveLoadImageNativeCPPTest(af::dim4(24, 32, 1, 1)); +} diff --git a/test/internal.cpp b/test/internal.cpp new file mode 100644 index 0000000000..75fa54fdb9 --- /dev/null +++ b/test/internal.cpp @@ -0,0 +1,124 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +TEST(Internal, CreateStrided) +{ + float ha[] = {1, + 101, 102, 103, 104, 105, + 201, 202, 203, 204, 205, + 301, 302, 303, 304, 305, + 401, 402, 403, 404, 405, + + 1010, 1020, 1030, 1040, 1050, + 2010, 2020, 2030, 2040, 2050, + 3010, 3020, 3030, 3040, 3050, + 4010, 4020, 4030, 4040, 4050}; + + dim_t offset = 1; + unsigned ndims = 3; + dim_t dims[] = {3, 3, 2}; + dim_t strides[] = {1, 5, 20}; + af::array a = createStridedArray((void *)ha, + offset, + af::dim4(ndims, dims), + af::dim4(ndims, strides), + f32, + afHost); + + af::dim4 astrides = getStrides(a); + af::dim4 adims = a.dims(); + + ASSERT_EQ(offset, getOffset(a)); + for (int i = 0; i < (int)ndims; i++) { + ASSERT_EQ(strides[i], astrides[i]); + ASSERT_EQ(dims[i], adims[i]); + } + + std::vector va(a.elements()); + a.host(&va[0]); + + int o = offset; + for (int k = 0; k < dims[2]; k++) { + for (int j = 0; j < dims[1]; j++) { + for (int i = 0; i < dims[0]; i++) { + ASSERT_EQ(va[i + j * dims[0] + k * dims[0] * dims[1]], + ha[i * strides[0] + j * strides[1] + k * strides[2] + o]) + << "at (" + << i << "," + << j << "," + << k << ")"; + } + } + } +} + +TEST(Internal, CheckInfo) +{ + int xdim = 10; + int ydim = 8; + + int xoff = 1; + int yoff = 2; + + int xnum = 5; + int ynum = 3; + + af::array a = af::randu(10, 8); + + af::array b = 
a(af::seq(xoff, xoff + xnum - 1), + af::seq(yoff, yoff + ynum - 1)); + + af::dim4 strides = getStrides(b); + af::dim4 dims = b.dims(); + + dim_t offset = xoff + yoff * xdim; + + ASSERT_EQ(dims[0], xnum); + ASSERT_EQ(dims[1], ynum); + ASSERT_EQ(isOwner(a), true); + ASSERT_EQ(isOwner(b), false); + + ASSERT_EQ(getOffset(b), offset); + ASSERT_EQ(strides[0], 1); + ASSERT_EQ(strides[1], xdim); + ASSERT_EQ(strides[2], xdim * ydim); + ASSERT_EQ(getRawPtr(a), getRawPtr(b)); +} + +TEST(Internal, Linear) +{ + af::array c; + { + af::array a = af::randu(10, 8); + + // b is just pointing to same underlying data + // b is an owner; + af::array b = a; + ASSERT_EQ(isOwner(b), true); + + // C is considered sub array + // C will not be an owner + c = a(af::span); + ASSERT_EQ(isOwner(c), false); + } + + // Even though a and b are out of scope, c is still not an owner + { + ASSERT_EQ(isOwner(c), false); + } +} diff --git a/test/inverse_dense.cpp b/test/inverse_dense.cpp index b0568ebbdb..1b990b6900 100644 --- a/test/inverse_dense.cpp +++ b/test/inverse_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/jit.cpp b/test/jit.cpp index 3c2308d5eb..a20b0f4b19 100644 --- a/test/jit.cpp +++ b/test/jit.cpp @@ -65,3 +65,53 @@ TEST(JIT, CPP_JIT_HASH) delete[] hF2; } } + +TEST(JIT, CPP_JIT_Reset_Binary) +{ + using af::array; + + af::array a = af::constant(2, 5,5); + af::array b = af::constant(1, 5,5); + af::array c = a + b; + af::array d = a - b; + af::array e = c * d; + e.eval(); + af::array f = c - d; + f.eval(); + af::array g = d - c; + g.eval(); + + std::vector hf(f.elements()); + std::vector hg(g.elements()); + f.host(&hf[0]); + g.host(&hg[0]); + + for (int i = 0; i < (int)f.elements(); i++) { + ASSERT_EQ(hf[i], -hg[i]); + } +} + +TEST(JIT, CPP_JIT_Reset_Unary) +{ + using af::array; + + af::array a = af::constant(2, 5,5); + af::array b = af::constant(1, 5,5); + af::array c 
= af::sin(a); + af::array d = af::cos(b); + af::array e = c * d; + e.eval(); + af::array f = c - d; + f.eval(); + af::array g = d - c; + g.eval(); + + std::vector hf(f.elements()); + std::vector hg(g.elements()); + f.host(&hf[0]); + g.host(&hg[0]); + + for (int i = 0; i < (int)f.elements(); i++) { + ASSERT_EQ(hf[i], -hg[i]); + } +} diff --git a/test/lu_dense.cpp b/test/lu_dense.cpp index cdb23ef962..0783fb3425 100644 --- a/test/lu_dense.cpp +++ b/test/lu_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/main.cpp b/test/main.cpp new file mode 100644 index 0000000000..76f841f1b1 --- /dev/null +++ b/test/main.cpp @@ -0,0 +1,6 @@ +#include + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/math.cpp b/test/math.cpp index 035ca257d2..e286e2a202 100644 --- a/test/math.cpp +++ b/test/math.cpp @@ -14,6 +14,7 @@ using namespace std; using namespace af; +using std::abs; const int num = 10000; const float flt_err = 1e-3; diff --git a/test/meanshift.cpp b/test/meanshift.cpp index 0116a5e3da..a35ca288d9 100644 --- a/test/meanshift.cpp +++ b/test/meanshift.cpp @@ -18,6 +18,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; template @@ -64,11 +65,12 @@ void meanshiftTest(string pTestFile) for (size_t testId=0; testId(&inArray, inArray_f32)); - ASSERT_EQ(AF_SUCCESS, af_load_image(&goldArray, outFiles[testId].c_str(), isColor)); + ASSERT_EQ(AF_SUCCESS, af_load_image(&goldArray_f32, outFiles[testId].c_str(), isColor)); + ASSERT_EQ(AF_SUCCESS, conv_image(&goldArray, goldArray_f32)); // af_load_image always returns float array ASSERT_EQ(AF_SUCCESS, af_get_elements(&nElems, goldArray)); ASSERT_EQ(AF_SUCCESS, af_mean_shift(&outArray, inArray, 2.25f, 25.56f, 5, isColor)); @@ -93,6 +96,7 @@ void meanshiftTest(string pTestFile) ASSERT_EQ(AF_SUCCESS, 
af_release_array(inArray_f32)); ASSERT_EQ(AF_SUCCESS, af_release_array(outArray)); ASSERT_EQ(AF_SUCCESS, af_release_array(goldArray)); + ASSERT_EQ(AF_SUCCESS, af_release_array(goldArray_f32)); } } diff --git a/test/medfilt.cpp b/test/medfilt.cpp index 9b4590885b..2e3a1fcb6b 100644 --- a/test/medfilt.cpp +++ b/test/medfilt.cpp @@ -17,6 +17,7 @@ using std::string; using std::vector; +using std::abs; template class MedianFilter : public ::testing::Test diff --git a/test/morph.cpp b/test/morph.cpp index d9c5282146..c42ddf0cba 100644 --- a/test/morph.cpp +++ b/test/morph.cpp @@ -18,6 +18,7 @@ using std::string; using std::vector; +using std::abs; template class Morph : public ::testing::Test diff --git a/test/ocl_ext_context.cpp b/test/ocl_ext_context.cpp new file mode 100644 index 0000000000..e711c631e4 --- /dev/null +++ b/test/ocl_ext_context.cpp @@ -0,0 +1,131 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#if defined(AF_OPENCL) +#include +#include + +using namespace std; + +inline void checkErr(cl_int err, const char * name) { + if (err != CL_SUCCESS) { + std::cerr << "ERROR: " << name << " (" << err << ")" << std::endl; + exit(EXIT_FAILURE); + } +} + +void getExternals(cl_device_id &deviceId, cl_context &context, cl_command_queue &queue) +{ + static cl_device_id dId = NULL; + static cl_context cId = NULL; + static cl_command_queue qId = NULL; + static bool call_once = true; + + if (call_once) { + cl_platform_id platformId = NULL; + cl_uint numPlatforms; + cl_uint numDevices; + cl_int errorCode = 0; + + checkErr(clGetPlatformIDs(1, &platformId, &numPlatforms), + "Get Platforms failed"); + + checkErr(clGetDeviceIDs(platformId, CL_DEVICE_TYPE_DEFAULT, 1, &dId, &numDevices), + "Get cl_device_id failed"); + + cId = clCreateContext(NULL, 1, &dId, NULL, NULL, &errorCode); + checkErr(errorCode, "Context creation failed"); + + qId = clCreateCommandQueue(cId, dId, 0, &errorCode); + checkErr(errorCode, "Command queue creation failed"); + call_once = false; + } + deviceId = dId; + context = cId; + queue = qId; +} + +TEST(OCLExtContext, push) +{ + cl_device_id deviceId = NULL; + cl_context context = NULL; + cl_command_queue queue = NULL; + + getExternals(deviceId, context, queue); + int dCount = af::getDeviceCount(); + printf("%d devices before afcl::addDevice\n", dCount); + af::info(); + afcl::addDevice(deviceId, context, queue); + ASSERT_EQ(true, dCount+1==af::getDeviceCount()); + printf("%d devices after afcl::addDevice\n", af::getDeviceCount()); + af::info(); +} + +TEST(OCLExtContext, set) +{ + cl_device_id deviceId = NULL; + cl_context context = NULL; + cl_command_queue queue = NULL; + + getExternals(deviceId, context, queue); + afcl::setDevice(deviceId, context); + af::info(); + + const int 
x = 5; + const int y = 5; + const int s = x * y; + af::array a = af::constant(1, x, y); + vector host(s); + a.host((void*)host.data()); + for (int i=0; i& feat, float* x, float* y, float* score, float* ori, float* size, unsigned* desc, unsigned nfeat) +static void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, unsigned* desc, unsigned nfeat) { feat.resize(nfeat); for (size_t i = 0; i < feat.size(); i++) { @@ -61,7 +62,7 @@ void array_to_feat_desc(vector& feat, float* x, float* y, float* sc } } -void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, vector >& desc, unsigned nfeat) +static void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, vector >& desc, unsigned nfeat) { feat.resize(nfeat); for (size_t i = 0; i < feat.size(); i++) { @@ -75,7 +76,7 @@ void array_to_feat_desc(vector& feat, float* x, float* y, float* sc } } -void array_to_feat(vector& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat) +static void array_to_feat(vector& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat) { feat.resize(nfeat); for (unsigned i = 0; i < feat.size(); i++) { @@ -87,7 +88,7 @@ void array_to_feat(vector& feat, float *x, float *y, float *score, float } } -void split_feat_desc(vector& fd, vector& f, vector& d) +static void split_feat_desc(vector& fd, vector& f, vector& d) { f.resize(fd.size()); d.resize(fd.size()); @@ -102,7 +103,7 @@ void split_feat_desc(vector& fd, vector& f, vector& } } -unsigned popcount(unsigned x) +static unsigned popcount(unsigned x) { x = x - ((x >> 1) & 0x55555555); x = (x & 0x33333333) + ((x >> 2) & 0x33333333); diff --git a/test/qr_dense.cpp b/test/qr_dense.cpp index 708eb5d0cd..e3809546b1 100644 --- a/test/qr_dense.cpp +++ b/test/qr_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using 
af::cfloat; using af::cdouble; diff --git a/test/random.cpp b/test/random.cpp index 29f157a776..74f7e6541b 100644 --- a/test/random.cpp +++ b/test/random.cpp @@ -59,6 +59,7 @@ void randuTest(af::dim4 & dims) af_array outArray = 0; ASSERT_EQ(AF_SUCCESS, af_randu(&outArray, dims.ndims(), dims.get(), (af_dtype) af::dtype_traits::af_type)); + ASSERT_EQ(af_sync(-1), AF_SUCCESS); if(outArray != 0) af_release_array(outArray); } @@ -69,6 +70,7 @@ void randnTest(af::dim4 &dims) af_array outArray = 0; ASSERT_EQ(AF_SUCCESS, af_randn(&outArray, dims.ndims(), dims.get(), (af_dtype) af::dtype_traits::af_type)); + ASSERT_EQ(af_sync(-1), AF_SUCCESS); if(outArray != 0) af_release_array(outArray); } @@ -124,6 +126,7 @@ void randuArgsTest() dim_t dims[] = {1, 2, 3, 0}; af_array outArray = 0; ASSERT_EQ(AF_ERR_SIZE, af_randu(&outArray, ndims, dims, (af_dtype) af::dtype_traits::af_type)); + ASSERT_EQ(af_sync(-1), AF_SUCCESS); if(outArray != 0) af_release_array(outArray); } @@ -143,6 +146,7 @@ TEST(Random, CPP) af::dim4 dims(1, 2, 3, 1); af::array out1 = af::randu(dims); af::array out2 = af::randn(dims); + af::sync(); } template diff --git a/test/rank_dense.cpp b/test/rank_dense.cpp index d0a19af3b9..7f2e76db0d 100644 --- a/test/rank_dense.cpp +++ b/test/rank_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/reduce.cpp b/test/reduce.cpp index f71dc76b80..675ed8fc4a 100644 --- a/test/reduce.cpp +++ b/test/reduce.cpp @@ -109,16 +109,6 @@ void reduceTest(string pTestFile, int off = 0, bool isSubRef=false, const vector ASSERT_EQ(AF_SUCCESS, af_release_array(inArray)); } -vector init_subs() -{ - vector subs; - subs.push_back(af_make_seq(2, 6, 1)); - subs.push_back(af_make_seq(1, 5, 1)); - subs.push_back(af_make_seq(1, 3, 1)); - subs.push_back(af_make_seq(1, 2, 1)); - return subs; -} - template struct promote_type { typedef T type; diff --git a/test/replace.cpp 
b/test/replace.cpp index 9e99eaee8f..faa5636eb8 100644 --- a/test/replace.cpp +++ b/test/replace.cpp @@ -130,3 +130,46 @@ TEST(Replace, NaN) ASSERT_EQ(hc[i], std::isnan(ha[i]) ? b : ha[i]); } } + +TEST(Replace, ISSUE_1249) +{ + dim4 dims(2, 3, 4); + array cond = af::randu(dims) > 0.5; + array a = af::randu(dims); + array b = a.copy(); + replace(b, !cond, a - a * 0.9); + array c = a - a * cond * 0.9; + + int num = (int)dims.elements(); + std::vector hb(num); + std::vector hc(num); + + b.host(&hb[0]); + c.host(&hc[0]); + + for (int i = 0; i < num; i++) { + ASSERT_EQ(hc[i], hb[i]) << "at " << i; + } +} + + +TEST(Replace, 4D) +{ + dim4 dims(2, 3, 4, 2); + array cond = af::randu(dims) > 0.5; + array a = af::randu(dims); + array b = a.copy(); + replace(b, !cond, a - a * 0.9); + array c = a - a * cond * 0.9; + + int num = (int)dims.elements(); + std::vector hb(num); + std::vector hc(num); + + b.host(&hb[0]); + c.host(&hc[0]); + + for (int i = 0; i < num; i++) { + ASSERT_EQ(hc[i], hb[i]) << "at " << i; + } +} diff --git a/test/resize.cpp b/test/resize.cpp index 6ec4e553c6..6c29e61cc6 100644 --- a/test/resize.cpp +++ b/test/resize.cpp @@ -20,6 +20,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; @@ -64,7 +65,7 @@ TYPED_TEST(Resize, InvalidDims) { if (noDoubleTests()) return; - vector in(8,8); + vector in(8*8); af_array inArray = 0; af_array outArray = 0; diff --git a/test/rotate.cpp b/test/rotate.cpp index f97cd3ab96..0d4b460033 100644 --- a/test/rotate.cpp +++ b/test/rotate.cpp @@ -20,6 +20,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/rotate_linear.cpp b/test/rotate_linear.cpp index 29a9107e4c..15734a3cc2 100644 --- a/test/rotate_linear.cpp +++ b/test/rotate_linear.cpp @@ -20,11 +20,12 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using 
af::cfloat; using af::cdouble; template -class Rotate : public ::testing::Test +class RotateLinear : public ::testing::Test { public: virtual void SetUp() { @@ -39,7 +40,7 @@ class Rotate : public ::testing::Test typedef ::testing::Types TestTypes; // register the type list -TYPED_TEST_CASE(Rotate, TestTypes); +TYPED_TEST_CASE(RotateLinear, TestTypes); #define PI 3.1415926535897931f @@ -107,10 +108,10 @@ void rotateTest(string pTestFile, const unsigned resultIdx, const float angle, c if(tempArray != 0) af_release_array(tempArray); } -#define ROTATE_INIT(desc, file, resultIdx, angle, crop, recenter) \ - TYPED_TEST(Rotate, desc) \ - { \ - rotateTest(string(TEST_DIR"/rotate/"#file".test"), resultIdx, angle, crop, recenter);\ +#define ROTATE_INIT(desc, file, resultIdx, angle, crop, recenter) \ + TYPED_TEST(RotateLinear, desc) \ + { \ + rotateTest(string(TEST_DIR"/rotate/"#file".test"), resultIdx, angle, crop, recenter); \ } ROTATE_INIT(Square180NoCropRecenter , rotatelinear1, 0, 180, false, true); @@ -165,7 +166,7 @@ void rotateTest(string pTestFile, const unsigned resultIdx, const float angle, c ////////////////////////////////// CPP ////////////////////////////////////// -TEST(Rotate, CPP) +TEST(RotateLinear, CPP) { if (noDoubleTests()) return; diff --git a/test/scan.cpp b/test/scan.cpp index 386568d402..34a077f122 100644 --- a/test/scan.cpp +++ b/test/scan.cpp @@ -82,16 +82,6 @@ void scanTest(string pTestFile, int off = 0, bool isSubRef=false, const vector init_subs() -{ - vector subs; - subs.push_back(af_make_seq(2, 6, 1)); - subs.push_back(af_make_seq(1, 5, 1)); - subs.push_back(af_make_seq(1, 3, 1)); - subs.push_back(af_make_seq(1, 2, 1)); - return subs; -} - #define SCAN_TESTS(FN, TAG, Ti, To) \ TEST(Scan,Test_##FN##_##TAG) \ { \ diff --git a/test/select.cpp b/test/select.cpp index 1c39282b15..6e772ac7c4 100644 --- a/test/select.cpp +++ b/test/select.cpp @@ -136,3 +136,43 @@ TEST(Select, NaN) ASSERT_EQ(hc[i], std::isnan(ha[i]) ? 
b : ha[i]); } } + +TEST(Select, ISSUE_1249) +{ + dim4 dims(2, 3, 4); + array cond = af::randu(dims) > 0.5; + array a = af::randu(dims); + array b = select(cond, a - a * 0.9, a); + array c = a - a * cond * 0.9; + + int num = (int)dims.elements(); + std::vector hb(num); + std::vector hc(num); + + b.host(&hb[0]); + c.host(&hc[0]); + + for (int i = 0; i < num; i++) { + ASSERT_EQ(hc[i], hb[i]) << "at " << i; + } +} + +TEST(Select, 4D) +{ + dim4 dims(2, 3, 4, 2); + array cond = af::randu(dims) > 0.5; + array a = af::randu(dims); + array b = select(cond, a - a * 0.9, a); + array c = a - a * cond * 0.9; + + int num = (int)dims.elements(); + std::vector hb(num); + std::vector hc(num); + + b.host(&hb[0]); + c.host(&hc[0]); + + for (int i = 0; i < num; i++) { + ASSERT_EQ(hc[i], hb[i]) << "at " << i; + } +} diff --git a/test/sift_nonfree.cpp b/test/sift_nonfree.cpp index cf1683f775..6776c18a86 100644 --- a/test/sift_nonfree.cpp +++ b/test/sift_nonfree.cpp @@ -20,6 +20,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; typedef struct @@ -37,8 +38,8 @@ typedef struct { float d[128]; } desc_t; - -bool feat_cmp(feat_desc_t i, feat_desc_t j) +#ifdef AF_BUILD_NONFREE_SIFT +static bool feat_cmp(feat_desc_t i, feat_desc_t j) { for (int k = 0; k < 5; k++) if (round(i.f[k]*1e1f) != round(j.f[k]*1e1f)) @@ -47,7 +48,7 @@ bool feat_cmp(feat_desc_t i, feat_desc_t j) return true; } -void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat) +static void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat) { feat.resize(nfeat); for (size_t i = 0; i < feat.size(); i++) { @@ -61,7 +62,7 @@ void array_to_feat_desc(vector& feat, float* x, float* y, float* sc } } -void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, vector >& desc, unsigned nfeat) +static void array_to_feat_desc(vector& feat, 
float* x, float* y, float* score, float* ori, float* size, vector >& desc, unsigned nfeat) { feat.resize(nfeat); for (size_t i = 0; i < feat.size(); i++) { @@ -75,7 +76,7 @@ void array_to_feat_desc(vector& feat, float* x, float* y, float* sc } } -void array_to_feat(vector& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat) +static void array_to_feat(vector& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat) { feat.resize(nfeat); for (unsigned i = 0; i < feat.size(); i++) { @@ -87,7 +88,7 @@ void array_to_feat(vector& feat, float *x, float *y, float *score, float } } -void split_feat_desc(vector& fd, vector& f, vector& d) +static void split_feat_desc(vector& fd, vector& f, vector& d) { f.resize(fd.size()); d.resize(fd.size()); @@ -102,7 +103,7 @@ void split_feat_desc(vector& fd, vector& f, vector& } } -unsigned popcount(unsigned x) +static unsigned popcount(unsigned x) { x = x - ((x >> 1) & 0x55555555); x = (x & 0x33333333) + ((x >> 2) & 0x33333333); @@ -112,7 +113,7 @@ unsigned popcount(unsigned x) return x & 0x0000003F; } -bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f) +static bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f) { bool ret = true; float sum = 0.0f; @@ -142,6 +143,7 @@ bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float return ret; } +#endif template class SIFT : public ::testing::Test @@ -157,7 +159,7 @@ TYPED_TEST_CASE(SIFT, TestTypes); template void siftTest(string pTestFile, unsigned nLayers, float contrastThr, float edgeThr, float initSigma, bool doubleInput) { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT if (noDoubleTests()) return; if (noImageIOTests()) return; @@ -275,7 +277,7 @@ void siftTest(string pTestFile, unsigned nLayers, float contrastThr, float edgeT // TEST(SIFT, CPP) { -#ifdef AF_BUILD_SIFT +#ifdef 
AF_BUILD_NONFREE_SIFT if (noDoubleTests()) return; if (noImageIOTests()) return; diff --git a/test/solve_dense.cpp b/test/solve_dense.cpp index bbb67409dc..183afdbcc8 100644 --- a/test/solve_dense.cpp +++ b/test/solve_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; @@ -185,15 +186,12 @@ SOLVE_TESTS(cdouble, 1E-5) #define SOLVE_TESTS(T, eps) \ TEST(SOLVE, T##RectOver) \ { \ - solveTester(800, 600, 50, eps); \ + solveTester(800, 600, 64, eps); \ } SOLVE_TESTS(float, 0.01) SOLVE_TESTS(double, 1E-5) -// Fails on Windows on some devices -#if !(defined(OS_WIN) && defined(AF_OPENCL)) SOLVE_TESTS(cfloat, 0.01) SOLVE_TESTS(cdouble, 1E-5) -#endif #undef SOLVE_TESTS diff --git a/test/sort_by_key.cpp b/test/sort_by_key.cpp index 3d82b9fd90..ed827c9da5 100644 --- a/test/sort_by_key.cpp +++ b/test/sort_by_key.cpp @@ -26,7 +26,7 @@ using af::cfloat; using af::cdouble; template -class Sort : public ::testing::Test +class SortByKey : public ::testing::Test { public: virtual void SetUp() { @@ -41,7 +41,7 @@ class Sort : public ::testing::Test typedef ::testing::Types TestTypes; // register the type list -TYPED_TEST_CASE(Sort, TestTypes); +TYPED_TEST_CASE(SortByKey, TestTypes); template void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const unsigned resultIdx1, bool isSubRef = false, const vector * seqv = NULL) @@ -104,10 +104,10 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const if(tempArray != 0) af_release_array(tempArray); } -#define SORT_INIT(desc, file, dir, resultIdx0, resultIdx1) \ - TYPED_TEST(Sort, desc) \ - { \ - sortTest(string(TEST_DIR"/sort/"#file".test"), dir, resultIdx0, resultIdx1); \ +#define SORT_INIT(desc, file, dir, resultIdx0, resultIdx1) \ + TYPED_TEST(SortByKey, desc) \ + { \ + sortTest(string(TEST_DIR"/sort/"#file".test"), dir, resultIdx0, resultIdx1); \ } SORT_INIT(Sort0True, sort_by_key_tiny, 
true, 0, 1); @@ -116,9 +116,8 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const SORT_INIT(Sort10x10False, sort_by_key_2D, false, 2, 3); SORT_INIT(Sort1000True, sort_by_key_1000, true, 0, 1); SORT_INIT(SortMedTrue, sort_by_key_med, true, 0, 1); - // FIXME: below two tests are disabled temporarily until issue#995 is fixed - //SORT_INIT(Sort1000False, sort_by_key_1000, false, 2, 3); - //SORT_INIT(SortMedFalse, sort_by_key_med, false, 2, 3); + SORT_INIT(Sort1000False, sort_by_key_1000, false, 2, 3); + SORT_INIT(SortMedFalse, sort_by_key_med, false, 2, 3); // Takes too much time in current implementation. Enable when everything is parallel //SORT_INIT(SortLargeTrue, sort_by_key_large, true, 0, 1); //SORT_INIT(SortLargeFalse, sort_by_key_large, false, 2, 3); @@ -169,4 +168,3 @@ TEST(SortByKey, CPP) delete[] keyData; delete[] valData; } - diff --git a/test/sort_index.cpp b/test/sort_index.cpp index 0711e8b494..6aa240d5a5 100644 --- a/test/sort_index.cpp +++ b/test/sort_index.cpp @@ -26,7 +26,7 @@ using af::cfloat; using af::cdouble; template -class Sort : public ::testing::Test +class SortIndex : public ::testing::Test { public: virtual void SetUp() { @@ -41,7 +41,7 @@ class Sort : public ::testing::Test typedef ::testing::Types TestTypes; // register the type list -TYPED_TEST_CASE(Sort, TestTypes); +TYPED_TEST_CASE(SortIndex, TestTypes); template void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const unsigned resultIdx1, bool isSubRef = false, const vector * seqv = NULL) @@ -102,10 +102,10 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const if(tempArray != 0) af_release_array(tempArray); } -#define SORT_INIT(desc, file, dir, resultIdx0, resultIdx1) \ - TYPED_TEST(Sort, desc) \ - { \ - sortTest(string(TEST_DIR"/sort/"#file".test"), dir, resultIdx0, resultIdx1); \ +#define SORT_INIT(desc, file, dir, resultIdx0, resultIdx1) \ + TYPED_TEST(SortIndex, desc) \ + { \ + 
sortTest(string(TEST_DIR"/sort/"#file".test"), dir, resultIdx0, resultIdx1); \ } SORT_INIT(Sort0True, sort, true, 0, 1); @@ -117,9 +117,8 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const SORT_INIT(Sort10x10False, sort_10x10, false, 2, 3); SORT_INIT(Sort1000True, sort_1000, true, 0, 1); SORT_INIT(SortMedTrue, sort_med1, true, 0, 1); - // FIXME: below two tests are disabled temporarily until issue#995 is fixed - //SORT_INIT(Sort1000False, sort_1000, false, 2, 3); - //SORT_INIT(SortMedFalse, sort_med1, false, 2, 3); + SORT_INIT(Sort1000False, sort_1000, false, 2, 3); + SORT_INIT(SortMedFalse, sort_med1, false, 2, 3); // Takes too much time in current implementation. Enable when everything is parallel //SORT_INIT(SortMed5True, sort_med, true, 0, 1); //SORT_INIT(SortMed5False, sort_med, false, 2, 3); diff --git a/test/susan.cpp b/test/susan.cpp index df806c06be..259a319ce7 100644 --- a/test/susan.cpp +++ b/test/susan.cpp @@ -20,6 +20,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; typedef struct @@ -27,7 +28,7 @@ typedef struct float f[5]; } feat_t; -bool feat_cmp(feat_t i, feat_t j) +static bool feat_cmp(feat_t i, feat_t j) { for (int k = 0; k < 5; k++) if (i.f[k] != j.f[k]) @@ -36,7 +37,7 @@ bool feat_cmp(feat_t i, feat_t j) return false; } -void array_to_feat(vector& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat) +static void array_to_feat(vector& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat) { feat.resize(nfeat); for (unsigned i = 0; i < feat.size(); i++) { diff --git a/test/svd_dense.cpp b/test/svd_dense.cpp index f7ef2950e0..7ce31e2ee5 100644 --- a/test/svd_dense.cpp +++ b/test/svd_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; @@ -34,12 +35,12 @@ typedef ::testing::Types TestTypes; TYPED_TEST_CASE(svd, 
TestTypes); template -double get_val(T val) +inline double get_val(T val) { return val; } -template<> double get_val(cfloat val) +template<> inline double get_val(cfloat val) { return abs(val); } diff --git a/test/testHelpers.hpp b/test/testHelpers.hpp index 758bf98e14..83f2552e08 100644 --- a/test/testHelpers.hpp +++ b/test/testHelpers.hpp @@ -6,6 +6,8 @@ * The complete license agreement can be obtained at: * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-function" #include #include @@ -127,11 +129,11 @@ void readTestsFromFile(const std::string &FileName, std::vector &input } } -void readImageTests(const std::string &pFileName, - std::vector &pInputDims, - std::vector &pTestInputs, - std::vector &pTestOutSizes, - std::vector &pTestOutputs) +inline void readImageTests(const std::string &pFileName, + std::vector &pInputDims, + std::vector &pTestInputs, + std::vector &pTestOutSizes, + std::vector &pTestOutputs) { using std::vector; @@ -364,18 +366,18 @@ struct cond_type { }; template -double real(T val) { return (double)val; } +inline double real(T val) { return (double)val; } template<> -double real(af::cdouble val) { return real(val); } +inline double real(af::cdouble val) { return real(val); } template<> -double real (af::cfloat val) { return real(val); } +inline double real (af::cfloat val) { return real(val); } template -double imag(T val) { return (double)val; } +inline double imag(T val) { return (double)val; } template<> -double imag(af::cdouble val) { return imag(val); } +inline double imag(af::cdouble val) { return imag(val); } template<> -double imag (af::cfloat val) { return imag(val); } +inline double imag (af::cfloat val) { return imag(val); } template bool noDoubleTests() @@ -388,37 +390,18 @@ bool noDoubleTests() return ((isTypeDouble && !isDoubleSupported) ? 
true : false); } -bool noImageIOTests() +inline bool noImageIOTests() { - af_array arr = 0; - const af_err err = af_load_image(&arr, TEST_DIR"/imageio/color_small.png", true); - - if(arr != 0) af_release_array(arr); - - if(err == AF_ERR_NOT_CONFIGURED) - return true; // Yes, disable test - else - return false; // No, let test continue + bool ret = !af::isImageIOAvailable(); + if(ret) printf("Image IO Not Configured. Test will exit\n"); + return ret; } -bool noLAPACKTests() +inline bool noLAPACKTests() { - // Run LU - af::dim4 dims(5, 5); - af_array in = 0, l = 0, u = 0, p= 0; - af_randu(&in, dims.ndims(), dims.get(), (af_dtype) af::dtype_traits::af_type); - - af_err err = af_lu(&l, &u, &p, in); - - if(in != 0) af_release_array(in); - if(l != 0) af_release_array(l); - if(u != 0) af_release_array(u); - if(p != 0) af_release_array(p); - - if(err == AF_ERR_NOT_CONFIGURED) - return true; // Yes, disable test - else - return false; // No, let test continue + bool ret = !af::isLAPACKAvailable(); + if(ret) printf("LAPACK Not Configured. Test will exit\n"); + return ret; } // TODO: perform conversion on device for CUDA and OpenCL @@ -469,3 +452,5 @@ af::array cpu_randu(const af::dim4 dims) return af::array(dims, (T *)&out[0]); } + +#pragma GCC diagnostic pop diff --git a/test/transform.cpp b/test/transform.cpp new file mode 100644 index 0000000000..1950284c2d --- /dev/null +++ b/test/transform.cpp @@ -0,0 +1,268 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +using std::vector; +using std::string; +using std::abs; +using std::cout; +using std::endl; + +template +class Transform : public ::testing::Test +{ + public: + virtual void SetUp() {} +}; + +template +class TransformInt : public ::testing::Test +{ + public: + virtual void SetUp() { + } +}; + +typedef ::testing::Types TestTypes; +typedef ::testing::Types TestTypesInt; + +TYPED_TEST_CASE(Transform, TestTypes); +TYPED_TEST_CASE(TransformInt, TestTypesInt); + +template +void transformTest(string pTestFile, string pHomographyFile, const af_interp_type method, const bool invert) +{ + if (noDoubleTests()) return; + + vector inNumDims; + vector inFiles; + vector goldNumDims; + vector goldFiles; + + readImageTests(pTestFile, inNumDims, inFiles, goldNumDims, goldFiles); + + inFiles[0].insert(0,string(TEST_DIR"/transform/")); + inFiles[1].insert(0,string(TEST_DIR"/transform/")); + goldFiles[0].insert(0,string(TEST_DIR"/transform/")); + + af::dim4 objDims = inNumDims[0]; + + vector HNumDims; + vector > HIn; + vector > HTests; + readTests(pHomographyFile, HNumDims, HIn, HTests); + + af::dim4 HDims = HNumDims[0]; + + af_array sceneArray_f32 = 0; + af_array goldArray_f32 = 0; + af_array outArray_f32 = 0; + af_array sceneArray = 0; + af_array goldArray = 0; + af_array outArray = 0; + af_array HArray = 0; + + ASSERT_EQ(AF_SUCCESS, af_load_image(&sceneArray_f32, inFiles[1].c_str(), false)); + ASSERT_EQ(AF_SUCCESS, af_load_image(&goldArray_f32, goldFiles[0].c_str(), false)); + + ASSERT_EQ(AF_SUCCESS, conv_image(&sceneArray, sceneArray_f32)); + ASSERT_EQ(AF_SUCCESS, conv_image(&goldArray, goldArray_f32)); + + ASSERT_EQ(AF_SUCCESS, af_create_array(&HArray, &(HIn[0].front()), HDims.ndims(), HDims.get(), f32)); + + ASSERT_EQ(AF_SUCCESS, 
af_transform(&outArray, sceneArray, HArray, objDims[0], objDims[1], method, invert)); + + // Get gold data + dim_t goldEl = 0; + ASSERT_EQ(AF_SUCCESS, af_get_elements(&goldEl, goldArray)); + T* goldData = new T[goldEl]; + ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)goldData, goldArray)); + + // Get result + dim_t outEl = 0; + ASSERT_EQ(AF_SUCCESS, af_get_elements(&outEl, outArray)); + T* outData = new T[outEl]; + ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outData, outArray)); + + const float thr = 1.1f; + + // Maximum number of wrong pixels must be <= 0.01% of number of elements, + // this metric is necessary due to rounding errors between different + // backends for AF_INTERP_NEAREST and AF_INTERP_LOWER + const size_t maxErr = goldEl * 0.0001f; + size_t err = 0; + + for (dim_t elIter = 0; elIter < goldEl; elIter++) { + err += fabs((float)floor(outData[elIter]) - (float)floor(goldData[elIter])) > thr; + if (err > maxErr) + ASSERT_LE(err, maxErr) << "at: " << elIter << std::endl; + } + + delete[] goldData; + delete[] outData; + + if(sceneArray_f32 != 0) af_release_array(sceneArray_f32); + if(goldArray_f32 != 0) af_release_array(goldArray_f32); + if(outArray_f32 != 0) af_release_array(outArray_f32); + if(sceneArray != 0) af_release_array(sceneArray); + if(goldArray != 0) af_release_array(goldArray); + if(outArray != 0) af_release_array(outArray); + if(HArray != 0) af_release_array(HArray); +} + +TYPED_TEST(Transform, PerspectiveNearest) +{ + transformTest(string(TEST_DIR"/transform/tux_nearest.test"), + string(TEST_DIR"/transform/tux_tmat.test"), + AF_INTERP_NEAREST, false); +} + +TYPED_TEST(Transform, PerspectiveBilinear) +{ + transformTest(string(TEST_DIR"/transform/tux_bilinear.test"), + string(TEST_DIR"/transform/tux_tmat.test"), + AF_INTERP_BILINEAR, false); +} + +TYPED_TEST(Transform, PerspectiveLower) +{ + transformTest(string(TEST_DIR"/transform/tux_lower.test"), + string(TEST_DIR"/transform/tux_tmat.test"), + AF_INTERP_LOWER, false); +} + 
+TYPED_TEST(Transform, PerspectiveNearestInvert) +{ + transformTest(string(TEST_DIR"/transform/tux_nearest.test"), + string(TEST_DIR"/transform/tux_tmat_inverse.test"), + AF_INTERP_NEAREST, true); +} + +TYPED_TEST(Transform, PerspectiveBilinearInvert) +{ + transformTest(string(TEST_DIR"/transform/tux_bilinear.test"), + string(TEST_DIR"/transform/tux_tmat_inverse.test"), + AF_INTERP_BILINEAR, true); +} + +TYPED_TEST(Transform, PerspectiveLowerInvert) +{ + transformTest(string(TEST_DIR"/transform/tux_lower.test"), + string(TEST_DIR"/transform/tux_tmat_inverse.test"), + AF_INTERP_LOWER, true); +} + +TYPED_TEST(TransformInt, PerspectiveNearest) +{ + transformTest(string(TEST_DIR"/transform/tux_nearest.test"), + string(TEST_DIR"/transform/tux_tmat.test"), + AF_INTERP_NEAREST, false); +} + +TYPED_TEST(TransformInt, PerspectiveBilinear) +{ + transformTest(string(TEST_DIR"/transform/tux_bilinear.test"), + string(TEST_DIR"/transform/tux_tmat.test"), + AF_INTERP_BILINEAR, false); +} + +TYPED_TEST(TransformInt, PerspectiveLower) +{ + transformTest(string(TEST_DIR"/transform/tux_lower.test"), + string(TEST_DIR"/transform/tux_tmat.test"), + AF_INTERP_LOWER, false); +} + +TYPED_TEST(TransformInt, PerspectiveNearestInvert) +{ + transformTest(string(TEST_DIR"/transform/tux_nearest.test"), + string(TEST_DIR"/transform/tux_tmat_inverse.test"), + AF_INTERP_NEAREST, true); +} + +TYPED_TEST(TransformInt, PerspectiveBilinearInvert) +{ + transformTest(string(TEST_DIR"/transform/tux_bilinear.test"), + string(TEST_DIR"/transform/tux_tmat_inverse.test"), + AF_INTERP_BILINEAR, true); +} + +TYPED_TEST(TransformInt, PerspectiveLowerInvert) +{ + transformTest(string(TEST_DIR"/transform/tux_lower.test"), + string(TEST_DIR"/transform/tux_tmat_inverse.test"), + AF_INTERP_LOWER, true); +} + + +///////////////////////////////////// CPP //////////////////////////////// +// +TEST(Transform, CPP) +{ + vector inDims; + vector inFiles; + vector goldDim; + vector goldFiles; + + vector HDims; + vector > 
HIn; + vector > HTests; + readTests(TEST_DIR"/transform/tux_tmat.test",HDims,HIn,HTests); + + readImageTests(string(TEST_DIR"/transform/tux_nearest.test"), inDims, inFiles, goldDim, goldFiles); + + inFiles[0].insert(0,string(TEST_DIR"/transform/")); + inFiles[1].insert(0,string(TEST_DIR"/transform/")); + + goldFiles[0].insert(0,string(TEST_DIR"/transform/")); + + af::array H = af::array(HDims[0][0], HDims[0][1], &(HIn[0].front())); + af::array IH = af::array(HDims[0][0], HDims[0][1], &(HIn[0].front())); + + af::array scene_img = af::loadImage(inFiles[1].c_str(), false); + + af::array gold_img = af::loadImage(goldFiles[0].c_str(), false); + + af::array out_img = af::transform(scene_img, IH, inDims[0][0], inDims[0][1], AF_INTERP_NEAREST, false); + + af::dim4 outDims = out_img.dims(); + af::dim4 goldDims = gold_img.dims(); + + float* h_out_img = new float[outDims[0] * outDims[1]]; + out_img.host(h_out_img); + float* h_gold_img = new float[goldDims[0] * goldDims[1]]; + gold_img.host(h_gold_img); + + const dim_t n = gold_img.elements(); + + const float thr = 1.0f; + + // Maximum number of wrong pixels must be <= 0.01% of number of elements, + // this metric is necessary due to rounding errors between different + // backends for AF_INTERP_NEAREST and AF_INTERP_LOWER + const size_t maxErr = n * 0.0001f; + size_t err = 0; + + for (dim_t elIter = 0; elIter < n; elIter++) { + err += fabs((int)h_out_img[elIter] - h_gold_img[elIter]) > thr; + if (err > maxErr) + ASSERT_LE(err, maxErr) << "at: " << elIter << std::endl; + } + + delete[] h_gold_img; + delete[] h_out_img; +} diff --git a/test/transform_coordinates.cpp b/test/transform_coordinates.cpp new file mode 100644 index 0000000000..7f1ac4e893 --- /dev/null +++ b/test/transform_coordinates.cpp @@ -0,0 +1,118 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +using std::vector; +using std::string; +using std::cout; +using std::endl; + +template +class TransformCoordinates : public ::testing::Test +{ + public: + virtual void SetUp() {} +}; + +typedef ::testing::Types TestTypes; + +TYPED_TEST_CASE(TransformCoordinates, TestTypes); + +template +void transformCoordinatesTest(string pTestFile) +{ + if (noDoubleTests()) return; + + vector inDims; + vector > in; + vector > gold; + + readTests(pTestFile, inDims, in, gold); + + af_array tfArray = 0; + af_array outArray = 0; + ASSERT_EQ(AF_SUCCESS, af_create_array(&tfArray, &(in[0].front()), inDims[0].ndims(), inDims[0].get(), (af_dtype)af::dtype_traits::af_type)); + + size_t nTests = in.size(); + + for (int test = 1; test < nTests; test++) { + dim_t d0 = (dim_t)in[test][0]; + dim_t d1 = (dim_t)in[test][1]; + + ASSERT_EQ(AF_SUCCESS, af_transform_coordinates(&outArray, tfArray, d0, d1)); + + // Get result + dim_t outEl = 0; + ASSERT_EQ(AF_SUCCESS, af_get_elements(&outEl, outArray)); + T* outData = new T[outEl]; + ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outData, outArray)); + + const float thr = 1.f; + + for (size_t elIter = 0; elIter < outEl; elIter++) { + ASSERT_LE(fabs(outData[elIter] - gold[test-1][elIter]), thr) << "at: " << elIter << std::endl; + } + + delete[] outData; + } + + if(tfArray != 0) af_release_array(tfArray); + if(outArray != 0) af_release_array(outArray); +} + +TYPED_TEST(TransformCoordinates, RotateMatrix) +{ + transformCoordinatesTest(string(TEST_DIR"/transformCoordinates/rotate_matrix.test")); +} + +TYPED_TEST(TransformCoordinates, 3DMatrix) +{ + transformCoordinatesTest(string(TEST_DIR"/transformCoordinates/3d_matrix.test")); +} + +///////////////////////////////////// CPP //////////////////////////////// +// 
+TEST(TransformCoordinates, CPP) +{ + vector inDims; + vector > in; + vector > gold; + + readTests(TEST_DIR"/transformCoordinates/3d_matrix.test",inDims,in,gold); + + af::array tf = af::array(inDims[0][0], inDims[0][1], &(in[0].front())); + + float d0 = in[1][0]; + float d1 = in[1][1]; + + af::array out = af::transformCoordinates(tf, d0, d1); + + af::dim4 outDims = out.dims(); + + float* h_out = new float[outDims[0] * outDims[1]]; + out.host(h_out); + + const size_t n = gold[0].size(); + + const float thr = 1.f; + + for (size_t elIter = 0; elIter < n; elIter++) { + ASSERT_LE(fabs(h_out[elIter] - gold[0][elIter]), thr) << "at: " << elIter << std::endl; + } + + delete[] h_out; +} diff --git a/test/translate.cpp b/test/translate.cpp index 5b00c04ec8..355d30a553 100644 --- a/test/translate.cpp +++ b/test/translate.cpp @@ -20,6 +20,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/transpose.cpp b/test/transpose.cpp index 6be1ba49ab..8437a12615 100644 --- a/test/transpose.cpp +++ b/test/transpose.cpp @@ -17,6 +17,7 @@ using std::string; using std::vector; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/triangle.cpp b/test/triangle.cpp index e0b609b9ab..6322070226 100644 --- a/test/triangle.cpp +++ b/test/triangle.cpp @@ -23,6 +23,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; using af::dim4; diff --git a/test/where.cpp b/test/where.cpp index eb21e0d6dc..08ed878aea 100644 --- a/test/where.cpp +++ b/test/where.cpp @@ -78,17 +78,6 @@ void whereTest(string pTestFile, bool isSubRef=false, const vector seqv= if(tempArray != 0) af_release_array(tempArray); } -vector init_subs() -{ - vector subs; - subs.push_back(af_make_seq(2, 6, 1)); - subs.push_back(af_make_seq(1, 5, 1)); - subs.push_back(af_make_seq(1, 3, 1)); - subs.push_back(af_make_seq(1, 2, 1)); - return subs; -} - - 
#define WHERE_TESTS(T) \ TEST(Where,Test_##T) \ { \ @@ -132,3 +121,10 @@ TYPED_TEST(Where, CPP) << std::endl; } } + +TEST(Where, ISSUE_1259) +{ + af::array a = af::randu(10, 10, 10); + af::array indices = af::where(a > 2); + ASSERT_EQ(indices.elements(), 0); +} diff --git a/test/wrap.cpp b/test/wrap.cpp index 0cc6fab909..091c5341c1 100644 --- a/test/wrap.cpp +++ b/test/wrap.cpp @@ -23,6 +23,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; @@ -41,27 +42,27 @@ typedef ::testing::Types -double get_val(T val) +inline double get_val(T val) { return val; } -template<> double get_val(cfloat val) +template<> inline double get_val(cfloat val) { return abs(val); } -template<> double get_val(cdouble val) +template<> inline double get_val(cdouble val) { return abs(val); } -template<> double get_val(unsigned char val) +template<> inline double get_val(unsigned char val) { return ((int)(val)) % 256; } -template<> double get_val(char val) +template<> inline double get_val(char val) { return (val != 0); }