From be0ff221c0f3dac94e6c07fcd96e373c3edc7256 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Sun, 9 Aug 2015 01:18:10 -0400 Subject: [PATCH 001/288] Setup async project; Setup basic queues; --- src/backend/cpu/CMakeLists.txt | 14 ++++++++++++++ src/backend/cpu/copy.cpp | 3 +++ src/backend/cpu/platform.cpp | 11 ++++++++++- src/backend/cpu/platform.hpp | 4 ++++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt index ab1e0a685c..10a749b08d 100644 --- a/src/backend/cpu/CMakeLists.txt +++ b/src/backend/cpu/CMakeLists.txt @@ -47,12 +47,25 @@ IF(NOT UNIX) ADD_DEFINITIONS(-DAFDLL) ENDIF() +INCLUDE(ExternalProject) +ExternalProject_Add( + threads + PREFIX ${CMAKE_BINARY_DIR}/third_party/threads + GIT_REPOSITORY https://github.com/alltheflops/threads.git + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory + /threads ${CMAKE_BINARY_DIR}/third_party/threads + LOG_DOWNLOAD ON + LOG_INSTALL ON + ) INCLUDE_DIRECTORIES( ${CMAKE_INCLUDE_PATH} "${CMAKE_SOURCE_DIR}/src/backend/cpu" ${FFTW_INCLUDES} ${CBLAS_INCLUDE_DIR} ${LAPACK_INCLUDE_DIR} + ${CMAKE_BINARY_DIR}/third_party/threads/src/threads ) FILE(GLOB cpu_headers @@ -148,6 +161,7 @@ TARGET_LINK_LIBRARIES(afcpu PRIVATE ${CBLAS_LIBRARIES} PRIVATE ${FFTW_LIBRARIES}) +ADD_DEPENDENCIES(afcpu threads) IF(FORGE_FOUND AND NOT USE_SYSTEM_FORGE) ADD_DEPENDENCIES(afcpu forge) ENDIF() diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index a2bb4ff912..35c1ebe23f 100644 --- a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp @@ -18,6 +18,8 @@ #include #include #include +#include +#include namespace cpu { @@ -46,6 +48,7 @@ namespace cpu template void copyData(T *to, const Array &from) { + getQueue().sync(); if(from.isOwner()) { // FIXME: Check for errors / exceptions memcpy(to, from.get(), from.elements()*sizeof(T)); diff --git a/src/backend/cpu/platform.cpp b/src/backend/cpu/platform.cpp index 
d6b4724c25..73bd5875d0 100644 --- a/src/backend/cpu/platform.cpp +++ b/src/backend/cpu/platform.cpp @@ -10,6 +10,8 @@ #include #include #include +#include +#include namespace cpu { @@ -75,9 +77,16 @@ int getActiveDeviceId() return 0; } +static const int MAX_QUEUES = 1; + +async_queue& getQueue(int idx) { + static std::array queues; + return queues[idx]; +} + void sync(int device) { - // Nothing here + getQueue().sync(); } } diff --git a/src/backend/cpu/platform.hpp b/src/backend/cpu/platform.hpp index e899837b8c..2bf6bf2a93 100644 --- a/src/backend/cpu/platform.hpp +++ b/src/backend/cpu/platform.hpp @@ -9,6 +9,8 @@ #include +class async_queue; + namespace cpu { std::string getInfo(); @@ -23,4 +25,6 @@ namespace cpu { int getActiveDeviceId(); void sync(int device); + + async_queue& getQueue(int idx = 0); } From b94c3df4e1c5288eb1bb985f372e3f8e5910fe3d Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Sun, 9 Aug 2015 01:19:34 -0400 Subject: [PATCH 002/288] Convert CPU blas to use async queues --- src/backend/cpu/blas.cpp | 73 ++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/src/backend/cpu/blas.cpp b/src/backend/cpu/blas.cpp index 0bbd39970f..8887202064 100644 --- a/src/backend/cpu/blas.cpp +++ b/src/backend/cpu/blas.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include namespace cpu { @@ -131,36 +133,38 @@ Array matmul(const Array &lhs, const Array &rhs, int N = rDims[bColDim]; int K = lDims[aColDim]; - //FIXME: Leaks on errors. 
- Array out = createEmptyArray(af::dim4(M, N, 1, 1)); - auto alpha = getScale(); - auto beta = getScale(); - - dim4 lStrides = lhs.strides(); - dim4 rStrides = rhs.strides(); using BT = typename blas_base::type; using CBT = const typename blas_base::type; - if(rDims[bColDim] == 1) { - N = lDims[aColDim]; - gemv_func()( - CblasColMajor, lOpts, - lDims[0], lDims[1], - alpha, - reinterpret_cast(lhs.get()), lStrides[1], - reinterpret_cast(rhs.get()), rStrides[0], - beta, - reinterpret_cast(out.get()), 1); - } else { - gemm_func()( - CblasColMajor, lOpts, rOpts, - M, N, K, - alpha, - reinterpret_cast(lhs.get()), lStrides[1], - reinterpret_cast(rhs.get()), rStrides[1], - beta, - reinterpret_cast(out.get()), out.dims()[0]); - } + Array out = createEmptyArray(af::dim4(M, N, 1, 1)); + auto func = [=] (Array output, const Array left, const Array right) { + auto alpha = getScale(); + auto beta = getScale(); + + dim4 lStrides = left.strides(); + dim4 rStrides = right.strides(); + + if(rDims[bColDim] == 1) { + gemv_func()( + CblasColMajor, lOpts, + lDims[0], lDims[1], + alpha, + reinterpret_cast(left.get()), lStrides[1], + reinterpret_cast(right.get()), rStrides[0], + beta, + reinterpret_cast(output.get()), 1); + } else { + gemm_func()( + CblasColMajor, lOpts, rOpts, + M, N, K, + alpha, + reinterpret_cast(left.get()), lStrides[1], + reinterpret_cast(right.get()), rStrides[1], + beta, + reinterpret_cast(output.get()), output.dims()[0]); + } + }; + getQueue().enqueue(func, out, lhs, rhs); return out; } @@ -172,7 +176,7 @@ template<> cfloat conj (cfloat c) { return std::conj(c); } template<> cdouble conj(cdouble c) { return std::conj(c); } template -Array dot_(const Array &lhs, const Array &rhs, +void dot_(Array output, const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { int N = lhs.dims()[0]; @@ -186,22 +190,25 @@ Array dot_(const Array &lhs, const Array &rhs, if(both_conjugate) out = cpu::conj(out); - return createValueArray(af::dim4(1), out); + 
*output.get() = out; + } template Array dot(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { + Array out = createEmptyArray(af::dim4(1)); if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) { - return dot_(lhs, rhs, optLhs, optRhs); + getQueue().enqueue(dot_, out, lhs, rhs, optLhs, optRhs); } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) { - return dot_(lhs, rhs, optLhs, optRhs); + getQueue().enqueue(dot_,out, lhs, rhs, optLhs, optRhs); } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) { - return dot_(rhs, lhs, optRhs, optLhs); + getQueue().enqueue(dot_,out, rhs, lhs, optRhs, optLhs); } else { - return dot_(lhs, rhs, optLhs, optRhs); + getQueue().enqueue(dot_,out, lhs, rhs, optLhs, optRhs); } + return out; } #undef BT From 3188bdf56555ae36b24c41039d973a7d9835301f Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Sun, 9 Aug 2015 11:51:54 -0400 Subject: [PATCH 003/288] Async CPU approx1 and approx2 --- src/backend/cpu/approx.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/backend/cpu/approx.cpp b/src/backend/cpu/approx.cpp index 69b943a6e5..735edd4fd2 100644 --- a/src/backend/cpu/approx.cpp +++ b/src/backend/cpu/approx.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include namespace cpu { @@ -141,14 +143,14 @@ namespace cpu switch(method) { case AF_INTERP_NEAREST: - approx1_ - (out.get(), out.dims(), out.elements(), + getQueue().enqueue(approx1_, + out.get(), out.dims(), out.elements(), in.get(), in.dims(), in.elements(), pos.get(), pos.dims(), out.strides(), in.strides(), pos.strides(), offGrid); break; case AF_INTERP_LINEAR: - approx1_ - (out.get(), out.dims(), out.elements(), + getQueue().enqueue(approx1_, + out.get(), out.dims(), out.elements(), in.get(), in.dims(), in.elements(), pos.get(), pos.dims(), out.strides(), in.strides(), pos.strides(), offGrid); break; @@ -304,16 +306,16 @@ namespace cpu switch(method) { case AF_INTERP_NEAREST: - approx2_ - 
(out.get(), out.dims(), out.elements(), + getQueue().enqueue(approx2_, + out.get(), out.dims(), out.elements(), in.get(), in.dims(), in.elements(), pos0.get(), pos0.dims(), pos1.get(), pos1.dims(), out.strides(), in.strides(), pos0.strides(), pos1.strides(), offGrid); break; case AF_INTERP_LINEAR: - approx2_ - (out.get(), out.dims(), out.elements(), + getQueue().enqueue(approx2_, + out.get(), out.dims(), out.elements(), in.get(), in.dims(), in.elements(), pos0.get(), pos0.dims(), pos1.get(), pos1.dims(), out.strides(), in.strides(), pos0.strides(), pos1.strides(), From f797314daf6ac0ff8f630437ecf276b274b86a36 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Sun, 9 Aug 2015 12:20:39 -0400 Subject: [PATCH 004/288] Async CPU Assign --- src/backend/cpu/assign.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/backend/cpu/assign.cpp b/src/backend/cpu/assign.cpp index a8ac33ece0..c0a177f5e2 100644 --- a/src/backend/cpu/assign.cpp +++ b/src/backend/cpu/assign.cpp @@ -14,8 +14,11 @@ #include #include #include +#include +#include using af::dim4; +using std::ref; namespace cpu { @@ -34,7 +37,7 @@ dim_t trimIndex(int idx, const dim_t &len) } template -void assign(Array& out, const af_index_t idxrs[], const Array& rhs) +void assign_(Array& out, const af_index_t idxrs[], const Array& rhs) { bool isSeq[4]; std::vector seqs(4, af_span); @@ -111,6 +114,12 @@ void assign(Array& out, const af_index_t idxrs[], const Array& rhs) } } +template +void assign(Array& out, const af_index_t idxrs[], const Array& rhs) +{ + getQueue().enqueue(assign_, ref(out), idxrs, ref(rhs)); +} + #define INSTANTIATE(T) \ template void assign(Array& out, const af_index_t idxrs[], const Array& rhs); From c8ecdb92c5607b76cf108fb3dd7784ef8a696641 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Sun, 9 Aug 2015 16:21:30 -0400 Subject: [PATCH 005/288] Async CPU Bilateral --- src/backend/cpu/bilateral.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff 
--git a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/bilateral.cpp index d8ef7c61cb..446b8a0c17 100644 --- a/src/backend/cpu/bilateral.cpp +++ b/src/backend/cpu/bilateral.cpp @@ -14,8 +14,11 @@ #include #include #include +#include +#include using af::dim4; +using std::ref; namespace cpu { @@ -35,12 +38,11 @@ static inline unsigned getIdx(const dim4 &strides, } template -Array bilateral(const Array &in, const float &s_sigma, const float &c_sigma) +void bilateral_(Array out, const Array &in, float s_sigma, float c_sigma) { const dim4 dims = in.dims(); const dim4 istrides = in.strides(); - Array out = createEmptyArray(dims); const dim4 ostrides = out.strides(); outType *outData = out.get(); @@ -93,7 +95,14 @@ Array bilateral(const Array &in, const float &s_sigma, const fl inData += istrides[2]; } } +} +template +Array bilateral(const Array &in, const float &s_sigma, const float &c_sigma) +{ + const dim4 dims = in.dims(); + Array out = createEmptyArray(dims); + getQueue().enqueue(bilateral_, out, ref(in), s_sigma, c_sigma); return out; } From 759b506fed2406bac094ec26b4fad293cd09f0e9 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Mon, 10 Aug 2015 22:08:55 -0400 Subject: [PATCH 006/288] Async CPU Convolve --- src/backend/cpu/convolve.cpp | 47 +++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/src/backend/cpu/convolve.cpp b/src/backend/cpu/convolve.cpp index 33670d47cc..a5d7ded17c 100644 --- a/src/backend/cpu/convolve.cpp +++ b/src/backend/cpu/convolve.cpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include using af::dim4; @@ -204,8 +206,8 @@ Array convolve(Array const& signal, Array const& filter, ConvolveBat Array out = createEmptyArray(oDims); - convolve_nd(out.get(), signal.get(), filter.get(), - oDims, sDims, fDims, out.strides(), sStrides, filter.strides(), kind); + getQueue().enqueue(convolve_nd,out.get(), signal.get(), filter.get(), + oDims, sDims, fDims, out.strides(), sStrides, filter.strides(), 
kind); return out; } @@ -271,32 +273,37 @@ Array convolve2(Array const& signal, Array const& c_filter, Array temp = createEmptyArray(tDims); Array out = createEmptyArray(oDims); - auto tStrides = temp.strides(); - auto oStrides = out.strides(); - for (dim_t b3=0; b3 out) { + Array temp = createEmptyArray(tDims); + auto tStrides = temp.strides(); + auto oStrides = out.strides(); - dim_t i_b3Off = b3*sStrides[3]; - dim_t t_b3Off = b3*tStrides[3]; - dim_t o_b3Off = b3*oStrides[3]; + for (dim_t b3=0; b3(tptr, iptr, c_filter.get(), - tDims, sDims, sDims, cflen, - tStrides, sStrides, c_filter.strides()[0]); + T const *iptr = signal.get()+ b2*sStrides[2] + i_b3Off; + T *tptr = temp.get() + b2*tStrides[2] + t_b3Off; + T *optr = out.get() + b2*oStrides[2] + o_b3Off; - convolve2_separable(optr, tptr, r_filter.get(), - oDims, tDims, sDims, rflen, - oStrides, tStrides, r_filter.strides()[0]); + convolve2_separable(tptr, iptr, c_filter.get(), + tDims, sDims, sDims, cflen, + tStrides, sStrides, c_filter.strides()[0]); + + convolve2_separable(optr, tptr, r_filter.get(), + oDims, tDims, sDims, rflen, + oStrides, tStrides, r_filter.strides()[0]); + } } - } + }; + + getQueue().enqueue(func, out); return out; } From c399e751cf7c824e0df575cf266ea7bb13645905 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Mon, 10 Aug 2015 23:25:04 -0400 Subject: [PATCH 007/288] Async CPU diff1 and diff2 --- src/backend/cpu/diff.cpp | 91 ++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/src/backend/cpu/diff.cpp b/src/backend/cpu/diff.cpp index 907c111c0b..08c1a66ac2 100644 --- a/src/backend/cpu/diff.cpp +++ b/src/backend/cpu/diff.cpp @@ -11,6 +11,8 @@ #include #include #include +#include +#include namespace cpu { @@ -36,28 +38,31 @@ namespace cpu dims[dim]--; // Create output placeholder - Array outArray = createValueArray(dims, (T)0); - - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = outArray.get(); - - // TODO: Improve 
this - for(dim_t l = 0; l < dims[3]; l++) { - for(dim_t k = 0; k < dims[2]; k++) { - for(dim_t j = 0; j < dims[1]; j++) { - for(dim_t i = 0; i < dims[0]; i++) { - // Operation: out[index] = in[index + 1 * dim_size] - in[index] - int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); - int jdx = getIdx(in.strides(), in.offsets(), - i + is_dim0, j + is_dim1, - k + is_dim2, l + is_dim3); - int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l); - outPtr[odx] = inPtr[jdx] - inPtr[idx]; + Array outArray = createEmptyArray(dims); + + auto func = [=] (Array outArray, Array in) { + // Get pointers to raw data + const T *inPtr = in.get(); + T *outPtr = outArray.get(); + + // TODO: Improve this + for(dim_t l = 0; l < dims[3]; l++) { + for(dim_t k = 0; k < dims[2]; k++) { + for(dim_t j = 0; j < dims[1]; j++) { + for(dim_t i = 0; i < dims[0]; i++) { + // Operation: out[index] = in[index + 1 * dim_size] - in[index] + int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); + int jdx = getIdx(in.strides(), in.offsets(), + i + is_dim0, j + is_dim1, + k + is_dim2, l + is_dim3); + int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l); + outPtr[odx] = inPtr[jdx] - inPtr[idx]; + } } } } - } + }; + getQueue().enqueue(func, outArray, in); return outArray; } @@ -76,31 +81,35 @@ namespace cpu dims[dim] -= 2; // Create output placeholder - Array outArray = createValueArray(dims, (T)0); - - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = outArray.get(); - - // TODO: Improve this - for(dim_t l = 0; l < dims[3]; l++) { - for(dim_t k = 0; k < dims[2]; k++) { - for(dim_t j = 0; j < dims[1]; j++) { - for(dim_t i = 0; i < dims[0]; i++) { - // Operation: out[index] = in[index + 1 * dim_size] - in[index] - int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); - int jdx = getIdx(in.strides(), in.offsets(), - i + is_dim0, j + is_dim1, - k + is_dim2, l + is_dim3); - int kdx = getIdx(in.strides(), in.offsets(), - i + 2 * is_dim0, j + 2 * 
is_dim1, - k + 2 * is_dim2, l + 2 * is_dim3); - int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l); - outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx]; + Array outArray = createEmptyArray(dims); + + auto func = [=] (Array outArray, Array in) { + // Get pointers to raw data + const T *inPtr = in.get(); + T *outPtr = outArray.get(); + + // TODO: Improve this + for(dim_t l = 0; l < dims[3]; l++) { + for(dim_t k = 0; k < dims[2]; k++) { + for(dim_t j = 0; j < dims[1]; j++) { + for(dim_t i = 0; i < dims[0]; i++) { + // Operation: out[index] = in[index + 1 * dim_size] - in[index] + int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); + int jdx = getIdx(in.strides(), in.offsets(), + i + is_dim0, j + is_dim1, + k + is_dim2, l + is_dim3); + int kdx = getIdx(in.strides(), in.offsets(), + i + 2 * is_dim0, j + 2 * is_dim1, + k + 2 * is_dim2, l + 2 * is_dim3); + int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l); + outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx]; + } } } } - } + }; + + getQueue().enqueue(func, outArray, in); return outArray; } From 80903d062962fcbb215c3cc6ffb12b18457f73be Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Thu, 13 Aug 2015 17:24:13 -0400 Subject: [PATCH 008/288] Avoid sending references to queued lambdas --- src/backend/cpu/bilateral.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/bilateral.cpp index 446b8a0c17..c826ef67be 100644 --- a/src/backend/cpu/bilateral.cpp +++ b/src/backend/cpu/bilateral.cpp @@ -18,7 +18,6 @@ #include using af::dim4; -using std::ref; namespace cpu { @@ -38,15 +37,15 @@ static inline unsigned getIdx(const dim4 &strides, } template -void bilateral_(Array out, const Array &in, float s_sigma, float c_sigma) +void bilateral_(Array out, const Array in, float s_sigma, float c_sigma) { const dim4 dims = in.dims(); const dim4 istrides = in.strides(); const dim4 ostrides = 
out.strides(); - outType *outData = out.get(); - const inType * inData = in.get(); + outType *outData = out.get(); + const inType *inData = in.get(); // clamp spatical and chromatic sigma's float space_ = std::min(11.5f, std::max(s_sigma, 0.f)); @@ -102,7 +101,7 @@ Array bilateral(const Array &in, const float &s_sigma, const fl { const dim4 dims = in.dims(); Array out = createEmptyArray(dims); - getQueue().enqueue(bilateral_, out, ref(in), s_sigma, c_sigma); + getQueue().enqueue(bilateral_, out, in, s_sigma, c_sigma); return out; } From 96c5602965334c5f36f33dc69ad81314fd6e6bd7 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Thu, 13 Aug 2015 17:25:31 -0400 Subject: [PATCH 009/288] Async CPU Copy, Assign, and Index --- src/backend/cpu/assign.cpp | 9 ++++-- src/backend/cpu/copy.cpp | 6 ++-- src/backend/cpu/index.cpp | 56 ++++++++++++++++++++++---------------- 3 files changed, 42 insertions(+), 29 deletions(-) diff --git a/src/backend/cpu/assign.cpp b/src/backend/cpu/assign.cpp index c0a177f5e2..589fa537f5 100644 --- a/src/backend/cpu/assign.cpp +++ b/src/backend/cpu/assign.cpp @@ -16,9 +16,12 @@ #include #include #include +#include using af::dim4; using std::ref; +using std::copy; +using std::array; namespace cpu { @@ -37,7 +40,7 @@ dim_t trimIndex(int idx, const dim_t &len) } template -void assign_(Array& out, const af_index_t idxrs[], const Array& rhs) +void assign_(Array out, const array idxrs, const Array rhs) { bool isSeq[4]; std::vector seqs(4, af_span); @@ -117,7 +120,9 @@ void assign_(Array& out, const af_index_t idxrs[], const Array& rhs) template void assign(Array& out, const af_index_t idxrs[], const Array& rhs) { - getQueue().enqueue(assign_, ref(out), idxrs, ref(rhs)); + array idx; + copy(idxrs, idxrs+4, begin(idx)); + getQueue().enqueue(assign_, out, move(idx), rhs); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index 35c1ebe23f..433e7186bd 100644 --- a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp 
@@ -117,7 +117,7 @@ namespace cpu template void multiply_inplace(Array &in, double val) { - copy(in, in, 0, val); + getQueue().enqueue(copy,in, in, 0, val); } template @@ -126,14 +126,14 @@ namespace cpu outType default_value, double factor) { Array ret = createValueArray(dims, default_value); - copy(ret, in, outType(default_value), factor); + getQueue().enqueue(copy,ret, in, outType(default_value), factor); return ret; } template void copyArray(Array &out, Array const &in) { - copy(out, in, scalar(0), 1.0); + getQueue().enqueue(copy,out, in, scalar(0), 1.0); } diff --git a/src/backend/cpu/index.cpp b/src/backend/cpu/index.cpp index 162e67fb46..c6112fa6c8 100644 --- a/src/backend/cpu/index.cpp +++ b/src/backend/cpu/index.cpp @@ -15,6 +15,8 @@ #include #include #include +#include +#include using af::dim4; @@ -68,43 +70,49 @@ Array index(const Array& in, const af_index_t idxrs[]) Array out = createEmptyArray(oDims); dim4 oStrides= out.strides(); - const T *src = in.get(); - T *dst = out.get(); - const uint* ptr0 = idxArrs[0].get(); - const uint* ptr1 = idxArrs[1].get(); - const uint* ptr2 = idxArrs[2].get(); - const uint* ptr3 = idxArrs[3].get(); + auto func = [=] (Array out, const Array in) { - for (dim_t l=0; l Date: Thu, 13 Aug 2015 17:26:08 -0400 Subject: [PATCH 010/288] Async CPU diagonal --- src/backend/cpu/diagonal.cpp | 55 +++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/src/backend/cpu/diagonal.cpp b/src/backend/cpu/diagonal.cpp index 2ae69a6901..c2e7e92e17 100644 --- a/src/backend/cpu/diagonal.cpp +++ b/src/backend/cpu/diagonal.cpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include namespace cpu { @@ -24,22 +26,25 @@ namespace cpu int batch = in.dims()[1]; Array out = createEmptyArray(dim4(size, size, batch)); - const T *iptr = in.get(); - T *optr = out.get(); + auto func = [=] (Array out, const Array in) { + const T *iptr = in.get(); + T *optr = out.get(); - for (int k = 0; k < batch; k++) { 
- for (int j = 0; j < size; j++) { - for (int i = 0; i < size; i++) { - T val = scalar(0); - if (i == j - num) { - val = (num > 0) ? iptr[i] : iptr[j]; + for (int k = 0; k < batch; k++) { + for (int j = 0; j < size; j++) { + for (int i = 0; i < size; i++) { + T val = scalar(0); + if (i == j - num) { + val = (num > 0) ? iptr[i] : iptr[j]; + } + optr[i + j * out.strides()[1]] = val; } - optr[i + j * out.strides()[1]] = val; } + optr += out.strides()[2]; + iptr += in.strides()[1]; } - optr += out.strides()[2]; - iptr += in.strides()[1]; - } + }; + getQueue().enqueue(func, out, in); return out; } @@ -51,23 +56,27 @@ namespace cpu dim_t size = std::max(idims[0], idims[1]) - std::abs(num); Array out = createEmptyArray(dim4(size, 1, idims[2], idims[3])); - const dim_t *odims = out.dims().get(); + auto func = [=] (Array out, const Array in) { + const dim_t *odims = out.dims().get(); - const int i_off = (num > 0) ? (num * in.strides()[1]) : (-num); + const int i_off = (num > 0) ? (num * in.strides()[1]) : (-num); - for (int l = 0; l < (int)odims[3]; l++) { + for (int l = 0; l < (int)odims[3]; l++) { - for (int k = 0; k < (int)odims[2]; k++) { - const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off; - T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2]; + for (int k = 0; k < (int)odims[2]; k++) { + const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off; + T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2]; - for (int i = 0; i < (int)odims[0]; i++) { - T val = scalar(0); - if (i < idims[0] && i < idims[1]) val = iptr[i * in.strides()[1] + i]; - optr[i] = val; + for (int i = 0; i < (int)odims[0]; i++) { + T val = scalar(0); + if (i < idims[0] && i < idims[1]) val = iptr[i * in.strides()[1] + i]; + optr[i] = val; + } } } - } + }; + + getQueue().enqueue(func, out, in); return out; } From b7c83e800e7e5ef714b9ac63e82b79f818e3b965 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Mon, 21 Sep 2015 14:36:05 
-0400 Subject: [PATCH 011/288] Async FFT for the CPU backend --- src/backend/cpu/Array.cpp | 48 +++++++++++++++++++------------------ src/backend/cpu/copy.cpp | 10 ++++---- src/backend/cpu/fft.cpp | 39 ++++++++++++++++++++++-------- src/backend/cpu/reorder.cpp | 22 ++++++++++------- 4 files changed, 74 insertions(+), 45 deletions(-) diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 15515fa7b5..d714fd9682 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -46,7 +47,6 @@ namespace cpu } } - template Array::Array(af::dim4 dims, TNJ::Node_ptr n) : info(-1, dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), @@ -67,40 +67,42 @@ namespace cpu template void Array::eval() { - if (isReady()) return; + auto func = [this] { + if (isReady()) return; - this->setId(getActiveDeviceId()); - data = std::shared_ptr(memAlloc(elements()), memFree); - T *ptr = data.get(); + setId(getActiveDeviceId()); + data = std::shared_ptr(memAlloc(elements()), memFree); + T *ptr = data.get(); - dim4 ostrs = strides(); - dim4 odims = dims(); + dim4 ostrs = strides(); + dim4 odims = dims(); - for (int w = 0; w < (int)odims[3]; w++) { - dim_t offw = w * ostrs[3]; + for (int w = 0; w < (int)odims[3]; w++) { + dim_t offw = w * ostrs[3]; - for (int z = 0; z < (int)odims[2]; z++) { - dim_t offz = z * ostrs[2] + offw; + for (int z = 0; z < (int)odims[2]; z++) { + dim_t offz = z * ostrs[2] + offw; - for (int y = 0; y < (int)odims[1]; y++) { - dim_t offy = y * ostrs[1] + offz; + for (int y = 0; y < (int)odims[1]; y++) { + dim_t offy = y * ostrs[1] + offz; - for (int x = 0; x < (int)odims[0]; x++) { - dim_t id = x + offy; + for (int x = 0; x < (int)odims[0]; x++) { + dim_t id = x + offy; - ptr[id] = *(T *)node->calc(x, y, z, w); + ptr[id] = *(T *)node->calc(x, y, z, w); + } } } } - } - - ready = true; + ready = true; + Node_ptr prev = node; + prev->reset(); + // 
FIXME: Replace the current node in any JIT possible trees with the new BufferNode + node.reset(); + }; - Node_ptr prev = node; - prev->reset(); - // FIXME: Replace the current node in any JIT possible trees with the new BufferNode - node.reset(); + getQueue().enqueue(func); } template diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index 433e7186bd..58773afbaa 100644 --- a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp @@ -67,7 +67,7 @@ namespace cpu } template - static void copy(Array &dst, const Array &src, outType default_value, double factor) + static void copy(Array dst, const Array src, outType default_value, double factor) { dim4 src_dims = src.dims(); dim4 dst_dims = dst.dims(); @@ -117,7 +117,7 @@ namespace cpu template void multiply_inplace(Array &in, double val) { - getQueue().enqueue(copy,in, in, 0, val); + getQueue().enqueue(copy, in, in, 0, val); } template @@ -126,14 +126,16 @@ namespace cpu outType default_value, double factor) { Array ret = createValueArray(dims, default_value); - getQueue().enqueue(copy,ret, in, outType(default_value), factor); + ret.eval(); + getQueue().sync(); + getQueue().enqueue(copy, ret, in, outType(default_value), factor); return ret; } template void copyArray(Array &out, Array const &in) { - getQueue().enqueue(copy,out, in, scalar(0), 1.0); + getQueue().enqueue(copy, out, in, scalar(0), 1.0); } diff --git a/src/backend/cpu/fft.cpp b/src/backend/cpu/fft.cpp index e41c8a1658..7262e6dd78 100644 --- a/src/backend/cpu/fft.cpp +++ b/src/backend/cpu/fft.cpp @@ -16,6 +16,8 @@ #include #include #include +#include +#include using af::dim4; @@ -52,7 +54,7 @@ TRANSFORM(fftwf, cfloat) TRANSFORM(fftw, cdouble) template -void fft_inplace(Array &in) +void fft_inplace_(Array in) { int t_dims[rank]; int in_embed[rank]; @@ -90,6 +92,12 @@ void fft_inplace(Array &in) transform.destroy(plan); } +template +void fft_inplace(Array &in) +{ + getQueue().enqueue(fft_inplace_, in); +} + template struct fftw_real_transform; 
@@ -114,14 +122,9 @@ TRANSFORM_REAL(fftwf, float , cfloat , c2r) TRANSFORM_REAL(fftw , double, cdouble, c2r) template -Array fft_r2c(const Array &in) +void fft_r2c_(Array out, const Array in) { dim4 idims = in.dims(); - dim4 odims = in.dims(); - - odims[0] = odims[0] / 2 + 1; - - Array out = createEmptyArray(odims); int t_dims[rank]; int in_embed[rank]; @@ -157,15 +160,23 @@ Array fft_r2c(const Array &in) transform.execute(plan); transform.destroy(plan); +} + +template +Array fft_r2c(const Array &in) +{ + dim4 odims = in.dims(); + odims[0] = odims[0] / 2 + 1; + Array out = createEmptyArray(odims); + + getQueue().enqueue(fft_r2c_, out, in); return out; } template -Array fft_c2r(const Array &in, const dim4 &odims) +void fft_c2r_(Array out, const Array in, const dim4 odims) { - Array out = createEmptyArray(odims); - int t_dims[rank]; int in_embed[rank]; int out_embed[rank]; @@ -200,6 +211,14 @@ Array fft_c2r(const Array &in, const dim4 &odims) transform.execute(plan); transform.destroy(plan); +} + +template +Array fft_c2r(const Array &in, const dim4 &odims) +{ + Array out = createEmptyArray(odims); + getQueue().enqueue(fft_c2r_, out, in, odims); + return out; } diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp index 42da24e435..5e1cd8fbca 100644 --- a/src/backend/cpu/reorder.cpp +++ b/src/backend/cpu/reorder.cpp @@ -11,19 +11,14 @@ #include #include #include +#include +#include namespace cpu { template - Array reorder(const Array &in, const af::dim4 &rdims) + void reorder_(Array out, const Array in, const af::dim4 oDims, const af::dim4 rdims) { - const af::dim4 iDims = in.dims(); - af::dim4 oDims(0); - for(int i = 0; i < 4; i++) - oDims[i] = iDims[rdims[i]]; - - Array out = createEmptyArray(oDims); - T* outPtr = out.get(); const T* inPtr = in.get(); @@ -53,7 +48,18 @@ namespace cpu } } } + } + template + Array reorder(const Array &in, const af::dim4 &rdims) + { + const af::dim4 iDims = in.dims(); + af::dim4 oDims(0); + for(int i = 0; i < 4; i++) + 
oDims[i] = iDims[rdims[i]]; + + Array out = createEmptyArray(oDims); + getQueue().enqueue(reorder_, out, in, oDims, rdims); return out; } From 413eea8f8c4abe6850b561c59ebe1e1a0f361a6f Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Mon, 21 Sep 2015 16:53:06 -0400 Subject: [PATCH 012/288] Add eval to copyData --- src/backend/cpu/copy.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index 58773afbaa..3be201b893 100644 --- a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp @@ -48,6 +48,7 @@ namespace cpu template void copyData(T *to, const Array &from) { + evalArray(from); getQueue().sync(); if(from.isOwner()) { // FIXME: Check for errors / exceptions From 49f0cce2f145385b1908dd95faf6b048a3327d01 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Mon, 21 Sep 2015 16:56:21 -0400 Subject: [PATCH 013/288] Async random on CPU backend --- src/backend/cpu/random.cpp | 87 +++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/src/backend/cpu/random.cpp b/src/backend/cpu/random.cpp index 4c91b96fb1..7ecf272d41 100644 --- a/src/backend/cpu/random.cpp +++ b/src/backend/cpu/random.cpp @@ -18,6 +18,8 @@ #include #include #include +#include +#include namespace cpu { @@ -74,7 +76,7 @@ static bool is_first = true; #define GLOBAL 1 template -Array randn(const af::dim4 &dims) +void randn_(Array out) { static unsigned long long my_seed = 0; if (is_first) { @@ -89,16 +91,22 @@ Array randn(const af::dim4 &dims) my_seed = gen_seed; } - Array outArray = createEmptyArray(dims); - T *outPtr = outArray.get(); - for (int i = 0; i < (int)outArray.elements(); i++) { + T *outPtr = out.get(); + for (int i = 0; i < (int)out.elements(); i++) { outPtr[i] = gen(); } +} + +template +Array randn(const af::dim4 &dims) +{ + Array outArray = createEmptyArray(dims); + getQueue().enqueue(randn_, outArray); return outArray; } template -Array randu(const af::dim4 &dims) +void randu_(Array out) { static 
unsigned long long my_seed = 0; if (is_first) { @@ -113,11 +121,39 @@ Array randu(const af::dim4 &dims) my_seed = gen_seed; } - Array outArray = createEmptyArray(dims); - T *outPtr = outArray.get(); - for (int i = 0; i < (int)outArray.elements(); i++) { + T *outPtr = out.get(); + for (int i = 0; i < (int)out.elements(); i++) { outPtr[i] = gen(); } +} + +template<> +void randu_(Array out) +{ + static unsigned long long my_seed = 0; + if (is_first) { + setSeed(gen_seed); + my_seed = gen_seed; + } + + static auto gen = urand(generator); + + if (my_seed != gen_seed) { + gen = urand(generator); + my_seed = gen_seed; + } + + char *outPtr = out.get(); + for (int i = 0; i < (int)out.elements(); i++) { + outPtr[i] = gen() > 0.5; + } +} + +template +Array randu(const af::dim4 &dims) +{ + Array outArray = createEmptyArray(dims); + getQueue().enqueue(randu_, outArray); return outArray; } @@ -133,6 +169,7 @@ INSTANTIATE_UNIFORM(uint) INSTANTIATE_UNIFORM(intl) INSTANTIATE_UNIFORM(uintl) INSTANTIATE_UNIFORM(uchar) +INSTANTIATE_UNIFORM(char) #define INSTANTIATE_NORMAL(T) \ template Array randn(const af::dim4 &dims); @@ -143,39 +180,19 @@ INSTANTIATE_NORMAL(cfloat) INSTANTIATE_NORMAL(cdouble) -template<> -Array randu(const af::dim4 &dims) -{ - static unsigned long long my_seed = 0; - if (is_first) { - setSeed(gen_seed); - my_seed = gen_seed; - } - - static auto gen = urand(generator); - - if (my_seed != gen_seed) { - gen = urand(generator); - my_seed = gen_seed; - } - - Array outArray = createEmptyArray(dims); - char *outPtr = outArray.get(); - for (int i = 0; i < (int)outArray.elements(); i++) { - outPtr[i] = gen() > 0.5; - } - return outArray; -} - void setSeed(const uintl seed) { - generator.seed(seed); - is_first = false; - gen_seed = seed; + auto f = [=](const uintl seed){ + generator.seed(seed); + is_first = false; + gen_seed = seed; + }; + getQueue().enqueue(f, seed); } uintl getSeed() { + getQueue().sync(); return gen_seed; } From fada8833549ff6833143ab79dcc8f4f35e5eaa17 
Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Tue, 22 Sep 2015 09:03:58 -0400 Subject: [PATCH 014/288] Async where on the CPU backe --- src/backend/cpu/where.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/cpu/where.cpp b/src/backend/cpu/where.cpp index c1ffd0f973..c5102c8c61 100644 --- a/src/backend/cpu/where.cpp +++ b/src/backend/cpu/where.cpp @@ -16,6 +16,8 @@ #include #include #include +#include +#include using af::dim4; @@ -24,6 +26,9 @@ namespace cpu template Array where(const Array &in) { + evalArray(in); + getQueue().sync(); + const dim_t *dims = in.dims().get(); const dim_t *strides = in.strides().get(); static const T zero = scalar(0); From 1a0802fb2fe22930e81613faa340038b68e0e2e2 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Tue, 22 Sep 2015 13:05:17 -0400 Subject: [PATCH 015/288] Async CPU reduce and ireduce --- src/backend/cpu/ireduce.cpp | 42 +++++++++++++++---------------------- src/backend/cpu/reduce.cpp | 15 ++++++++----- 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/src/backend/cpu/ireduce.cpp b/src/backend/cpu/ireduce.cpp index 199a0befb3..d3a76d92f3 100644 --- a/src/backend/cpu/ireduce.cpp +++ b/src/backend/cpu/ireduce.cpp @@ -14,6 +14,9 @@ #include #include +#include +#include + using af::dim4; namespace cpu @@ -105,43 +108,32 @@ namespace cpu } }; + template + using ireduce_dim_func = std::function; + template void ireduce(Array &out, Array &loc, const Array &in, const int dim) { dim4 odims = in.dims(); odims[dim] = 1; + static const ireduce_dim_func ireduce_funcs[] = { ireduce_dim() + , ireduce_dim() + , ireduce_dim() + , ireduce_dim()}; - switch (in.ndims()) { - case 1: - ireduce_dim()(out.get(), out.strides(), out.dims(), - loc.get(), - in.get(), in.strides(), in.dims(), dim); - break; - - case 2: - ireduce_dim()(out.get(), out.strides(), out.dims(), - loc.get(), - in.get(), in.strides(), in.dims(), dim); - break; - - case 3: - ireduce_dim()(out.get(), out.strides(), out.dims(), - loc.get(), - 
in.get(), in.strides(), in.dims(), dim); - break; - - case 4: - ireduce_dim()(out.get(), out.strides(), out.dims(), - loc.get(), - in.get(), in.strides(), in.dims(), dim); - break; - } + getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out.get(), out.strides(), out.dims(), + loc.get(), in.get(), in.strides(), in.dims(), dim); } template T ireduce_all(unsigned *loc, const Array &in) { + evalArray(in); + getQueue().sync(); af::dim4 dims = in.dims(); af::dim4 strides = in.strides(); const T *inPtr = in.get(); diff --git a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp index 5724508be6..8ce7d0de28 100644 --- a/src/backend/cpu/reduce.cpp +++ b/src/backend/cpu/reduce.cpp @@ -16,6 +16,9 @@ #include #include +#include +#include + using af::dim4; namespace cpu @@ -74,12 +77,12 @@ namespace cpu odims[dim] = 1; Array out = createEmptyArray(odims); - static reduce_dim_func reduce_funcs[4] = { reduce_dim() - , reduce_dim() - , reduce_dim() - , reduce_dim()}; + static const reduce_dim_func reduce_funcs[4] = { reduce_dim() + , reduce_dim() + , reduce_dim() + , reduce_dim()}; - reduce_funcs[in.ndims() - 1](out.get(), out.strides(), out.dims(), + getQueue().enqueue(reduce_funcs[in.ndims() - 1],out.get(), out.strides(), out.dims(), in.get(), in.strides(), in.dims(), dim, change_nan, nanval); @@ -89,6 +92,8 @@ namespace cpu template To reduce_all(const Array &in, bool change_nan, double nanval) { + evalArray(in); + getQueue().sync(); Transform transform; Binary reduce; From 1842bcf50998ca83b9fc5a94f7823a1e6f5aade8 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Tue, 22 Sep 2015 15:14:55 -0400 Subject: [PATCH 016/288] Async CPU Transpose. 
Fix bug in eval --- src/backend/cpu/Array.cpp | 9 +++++---- src/backend/cpu/transpose.cpp | 30 +++++++++++++++++++++--------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index d714fd9682..64aacf5fd6 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -67,11 +67,12 @@ namespace cpu template void Array::eval() { - auto func = [this] { - if (isReady()) return; + if (isReady()) return; + data = std::shared_ptr(memAlloc(elements()), memFree); + + auto func = [this] { setId(getActiveDeviceId()); - data = std::shared_ptr(memAlloc(elements()), memFree); T *ptr = data.get(); dim4 ostrs = strides(); @@ -95,13 +96,13 @@ namespace cpu } } - ready = true; Node_ptr prev = node; prev->reset(); // FIXME: Replace the current node in any JIT possible trees with the new BufferNode node.reset(); }; + ready = true; getQueue().enqueue(func); } diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp index f820f9ea5d..4afbfaae8e 100644 --- a/src/backend/cpu/transpose.cpp +++ b/src/backend/cpu/transpose.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -78,15 +80,8 @@ void transpose_(T *out, const T *in, const af::dim4 &odims, const af::dim4 &idim } template -Array transpose(const Array &in, const bool conjugate) +void transpose_(Array out, const Array in, const bool conjugate) { - const dim4 inDims = in.dims(); - - dim4 outDims = dim4(inDims[1],inDims[0],inDims[2],inDims[3]); - - // create an array with first two dimensions swapped - Array out = createEmptyArray(outDims); - // get data pointers for input and output Arrays T* outData = out.get(); const T* inData = in.get(); @@ -98,7 +93,18 @@ Array transpose(const Array &in, const bool conjugate) transpose_(outData, inData, out.dims(), in.dims(), out.strides(), in.strides()); } +} + +template +Array transpose(const Array &in, const bool conjugate) +{ + const dim4 inDims = in.dims(); + + 
dim4 outDims = dim4(inDims[1],inDims[0],inDims[2],inDims[3]); + // create an array with first two dimensions swapped + Array out = createEmptyArray(outDims); + getQueue().enqueue(transpose_, out, in, conjugate); return out; } @@ -133,7 +139,7 @@ void transpose_inplace(T *in, const af::dim4 &idims, const af::dim4 &istrides) } template -void transpose_inplace(Array &in, const bool conjugate) +void transpose_inplace_(Array in, const bool conjugate) { // get data pointers for input and output Arrays T* inData = in.get(); @@ -145,6 +151,12 @@ void transpose_inplace(Array &in, const bool conjugate) } } +template +void transpose_inplace(Array &in, const bool conjugate) +{ + getQueue().enqueue(transpose_inplace_, in, conjugate); +} + #define INSTANTIATE(T) \ template Array transpose(const Array &in, const bool conjugate); \ template void transpose_inplace(Array &in, const bool conjugate); From 91f7a1ffe531a82e3f7a9af7fdff6efe3fb6c5f9 Mon Sep 17 00:00:00 2001 From: pradeep Date: Tue, 17 Nov 2015 17:50:00 -0500 Subject: [PATCH 017/288] async cpu::index function --- src/backend/cpu/Array.cpp | 2 -- src/backend/cpu/index.cpp | 38 +++++++++++++++++++++----------------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 8ea6104a55..b612c7b45c 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -72,8 +72,6 @@ namespace cpu this->setId(getActiveDeviceId()); - if (isReady()) return; - data = std::shared_ptr(memAlloc(elements()), memFree); auto func = [] (Array in) { diff --git a/src/backend/cpu/index.cpp b/src/backend/cpu/index.cpp index cad79f7d4e..39502e9683 100644 --- a/src/backend/cpu/index.cpp +++ b/src/backend/cpu/index.cpp @@ -17,6 +17,7 @@ #include #include #include +#include using af::dim4; @@ -50,12 +51,8 @@ Array index(const Array& in, const af_index_t idxrs[]) isSeq[x] = idxrs[x].isSeq; } - // rettrieve - dim4 iDims = in.dims(); - dim4 dDims = in.getDataDims(); - dim4 oDims = 
toDims (seqs, iDims); - dim4 iOffs = toOffset(seqs, dDims); - dim4 iStrds= toStride(seqs, dDims); + // retrieve + dim4 oDims = toDims(seqs, in.dims()); std::vector< Array > idxArrs(4, createEmptyArray(dim4())); // look through indexs to read af_array indexs @@ -68,18 +65,25 @@ Array index(const Array& in, const af_index_t idxrs[]) } Array out = createEmptyArray(oDims); - dim4 oStrides= out.strides(); - auto func = [=] (Array out, const Array in) { - - const T *src = in.get(); - T *dst = out.get(); - - const uint* ptr0 = idxArrs[0].get(); - const uint* ptr1 = idxArrs[1].get(); - const uint* ptr2 = idxArrs[2].get(); - const uint* ptr3 = idxArrs[3].get(); + auto func = [=] (Array out, const Array in, + const bool isSeq[], + const std::vector seqs, + const std::vector< Array > idxArrs) { + + const dim4 iDims = in.dims(); + const dim4 dDims = in.getDataDims(); + const dim4 iOffs = toOffset(seqs, dDims); + const dim4 iStrds = toStride(seqs, dDims); + const dim4 oDims = out.dims(); + const dim4 oStrides = out.strides(); + const T *src = in.get(); + T *dst = out.get(); + const uint* ptr0 = idxArrs[0].get(); + const uint* ptr1 = idxArrs[1].get(); + const uint* ptr2 = idxArrs[2].get(); + const uint* ptr3 = idxArrs[3].get(); for (dim_t l=0; l index(const Array& in, const af_index_t idxrs[]) } }; - getQueue().enqueue(func, out, in); + getQueue().enqueue(func, out, in, std::move(isSeq), std::move(seqs), std::move(idxArrs)); return out; } From 2796770f1689358892fde35c7303fa406f1ceb78 Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 19 Nov 2015 15:38:27 -0500 Subject: [PATCH 018/288] Fixed asynchronous indexing & assignment in cpu backend --- src/backend/cpu/assign.cpp | 100 ++++++++++++++++++------------------- src/backend/cpu/index.cpp | 18 +++---- 2 files changed, 57 insertions(+), 61 deletions(-) diff --git a/src/backend/cpu/assign.cpp b/src/backend/cpu/assign.cpp index b75b6d549c..b1578d49f6 100644 --- a/src/backend/cpu/assign.cpp +++ b/src/backend/cpu/assign.cpp @@ -16,12 
+16,11 @@ #include #include #include -#include using af::dim4; using std::ref; using std::copy; -using std::array; +using std::vector; namespace cpu { @@ -40,12 +39,11 @@ dim_t trimIndex(int idx, const dim_t &len) } template -void assign_(Array out, const array idxrs, const Array rhs) +void assign(Array& out, const af_index_t idxrs[], const Array& rhs) { - bool isSeq[4]; - std::vector seqs(4, af_span); - // create seq vector to retrieve output - // dimensions, offsets & offsets + vector isSeq(4); + vector seqs(4, af_span); + // create seq vector to retrieve output dimensions, offsets & offsets for (dim_t x=0; x<4; ++x) { if (idxrs[x].isSeq) { seqs[x] = idxrs[x].idx.seq; @@ -53,17 +51,7 @@ void assign_(Array out, const array idxrs, const Array rhs) isSeq[x] = idxrs[x].isSeq; } - dim4 dDims = out.getDataDims(); - dim4 pDims = out.dims(); - // retrieve dimensions & strides for array - // to which rhs is being copied to - dim4 dst_offsets = toOffset(seqs, dDims); - dim4 dst_strides = toStride(seqs, dDims); - // retrieve rhs array dimenesions & strides - dim4 src_dims = rhs.dims(); - dim4 src_strides = rhs.strides(); - - std::vector< Array > idxArrs(4, createEmptyArray(dim4())); + vector< Array > idxArrs(4, createEmptyArray(dim4())); // look through indexs to read af_array indexs for (dim_t x=0; x<4; ++x) { if (!isSeq[x]) { @@ -71,58 +59,66 @@ void assign_(Array out, const array idxrs, const Array rhs) } } - // declare pointers to af_array index data - const uint* ptr0 = idxArrs[0].get(); - const uint* ptr1 = idxArrs[1].get(); - const uint* ptr2 = idxArrs[2].get(); - const uint* ptr3 = idxArrs[3].get(); + auto func = [=] (Array out, const Array rhs, + const vector isSeq, + const vector seqs, + const vector< Array > idxArrs) { + + dim4 dDims = out.getDataDims(); + dim4 pDims = out.dims(); + // retrieve dimensions & strides for array to which rhs is being copied to + dim4 dst_offsets = toOffset(seqs, dDims); + dim4 dst_strides = toStride(seqs, dDims); + // retrieve rhs 
array dimenesions & strides + dim4 src_dims = rhs.dims(); + dim4 src_strides = rhs.strides(); + // declare pointers to af_array index data + const uint* ptr0 = idxArrs[0].get(); + const uint* ptr1 = idxArrs[1].get(); + const uint* ptr2 = idxArrs[2].get(); + const uint* ptr3 = idxArrs[3].get(); - const T * src= rhs.get(); - T * dst = out.get(); + const T * src= rhs.get(); + T * dst = out.get(); - for(dim_t l=0; l -void assign(Array& out, const af_index_t idxrs[], const Array& rhs) -{ - array idx; - copy(idxrs, idxrs+4, begin(idx)); - getQueue().enqueue(assign_, out, move(idx), rhs); + getQueue().enqueue(func, out, rhs, std::move(isSeq), std::move(seqs), std::move(idxArrs)); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/index.cpp b/src/backend/cpu/index.cpp index 39502e9683..c1beeea9c0 100644 --- a/src/backend/cpu/index.cpp +++ b/src/backend/cpu/index.cpp @@ -19,6 +19,7 @@ #include #include +using std::vector; using af::dim4; namespace cpu @@ -40,11 +41,11 @@ dim_t trimIndex(dim_t idx, const dim_t &len) template Array index(const Array& in, const af_index_t idxrs[]) { - bool isSeq[4]; - std::vector seqs(4, af_span); + vector isSeq(4); + vector seqs(4, af_span); // create seq vector to retrieve output // dimensions, offsets & offsets - for (dim_t x=0; x<4; ++x) { + for (dim_t x=0; x index(const Array& in, const af_index_t idxrs[]) // retrieve dim4 oDims = toDims(seqs, in.dims()); - std::vector< Array > idxArrs(4, createEmptyArray(dim4())); + vector< Array > idxArrs(4, createEmptyArray(dim4())); // look through indexs to read af_array indexs - for (dim_t x=0; x<4; ++x) { + for (dim_t x=0; x(idxrs[x].idx.arr); // set output array ith dimension value @@ -66,11 +67,10 @@ Array index(const Array& in, const af_index_t idxrs[]) Array out = createEmptyArray(oDims); - auto func = [=] (Array out, const Array in, - const bool isSeq[], - const std::vector seqs, - const std::vector< Array > idxArrs) { + const vector isSeq, + const vector seqs, + const vector< Array > 
idxArrs) { const dim4 iDims = in.dims(); const dim4 dDims = in.getDataDims(); From 0aeed429006dcf95a2aa502bf7aad5e409b90a3b Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 19 Nov 2015 16:54:34 -0500 Subject: [PATCH 019/288] converted cpu tile to asychronous call This fixed `Assign.LinearAssignSeq` unit test in assign unit tests. --- src/backend/cpu/tile.cpp | 58 +++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/src/backend/cpu/tile.cpp b/src/backend/cpu/tile.cpp index 77e72afd09..f7560121f4 100644 --- a/src/backend/cpu/tile.cpp +++ b/src/backend/cpu/tile.cpp @@ -11,25 +11,32 @@ #include #include #include +#include +#include namespace cpu { - template - Array tile(const Array &in, const af::dim4 &tileDims) - { - const af::dim4 iDims = in.dims(); - af::dim4 oDims = iDims; - oDims *= tileDims; - if(iDims.elements() == 0 || oDims.elements() == 0) { - throw std::runtime_error("Elements are 0"); - } +template +Array tile(const Array &in, const af::dim4 &tileDims) +{ + const af::dim4 iDims = in.dims(); + af::dim4 oDims = iDims; + oDims *= tileDims; + + if(iDims.elements() == 0 || oDims.elements() == 0) { + throw std::runtime_error("Elements are 0"); + } - Array out = createEmptyArray(oDims); + Array out = createEmptyArray(oDims); + + auto func = [=] (Array out, const Array in) { T* outPtr = out.get(); const T* inPtr = in.get(); + const af::dim4 iDims = in.dims(); + const af::dim4 oDims = out.dims(); const af::dim4 ist = in.strides(); const af::dim4 ost = out.strides(); @@ -54,24 +61,27 @@ namespace cpu } } } + }; - return out; - } + getQueue().enqueue(func, out, in); + + return out; +} #define INSTANTIATE(T) \ template Array tile(const Array &in, const af::dim4 &tileDims); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(short) - 
INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) +INSTANTIATE(ushort) } From 330ae1c3b7fc85c8794069a9a4fa09b43615493e Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 19 Nov 2015 17:06:40 -0500 Subject: [PATCH 020/288] converted sort_index cpu function to asynchronous call This also fixed assign unit test: `ArrayAssign.CPP_ASSIGN_VECTOR_2D` --- src/backend/cpu/sort_index.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp index eb6b4bee60..f07d585b41 100644 --- a/src/backend/cpu/sort_index.cpp +++ b/src/backend/cpu/sort_index.cpp @@ -14,16 +14,12 @@ #include #include #include -#include -#include +#include +#include using std::greater; using std::less; using std::sort; -using std::function; -using std::queue; -using std::future; -using std::async; namespace cpu { @@ -85,8 +81,7 @@ namespace cpu val = createEmptyArray(in.dims()); idx = createEmptyArray(in.dims()); switch(dim) { - case 0: sort0_index(val, idx, in); - break; + case 0: getQueue().enqueue(sort0_index, val, idx, in); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } } From 5eea071e457ca37c2a285365c42f9f1c1bd1d0c0 Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 19 Nov 2015 18:11:16 -0500 Subject: [PATCH 021/288] converted triangle fn in cpu backend to async call --- src/backend/cpu/triangle.cpp | 56 ++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp index 6b0f326aad..ed7f348bad 100644 --- a/src/backend/cpu/triangle.cpp +++ b/src/backend/cpu/triangle.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include namespace cpu { @@ -19,42 +21,46 @@ namespace cpu template void triangle(Array &out, 
const Array &in) { - T *o = out.get(); - const T *i = in.get(); + auto func = [=] (Array out, const Array in) { + T *o = out.get(); + const T *i = in.get(); - dim4 odm = out.dims(); + dim4 odm = out.dims(); - dim4 ost = out.strides(); - dim4 ist = in.strides(); + dim4 ost = out.strides(); + dim4 ist = in.strides(); - for(dim_t ow = 0; ow < odm[3]; ow++) { - const dim_t oW = ow * ost[3]; - const dim_t iW = ow * ist[3]; + for(dim_t ow = 0; ow < odm[3]; ow++) { + const dim_t oW = ow * ost[3]; + const dim_t iW = ow * ist[3]; - for(dim_t oz = 0; oz < odm[2]; oz++) { - const dim_t oZW = oW + oz * ost[2]; - const dim_t iZW = iW + oz * ist[2]; + for(dim_t oz = 0; oz < odm[2]; oz++) { + const dim_t oZW = oW + oz * ost[2]; + const dim_t iZW = iW + oz * ist[2]; - for(dim_t oy = 0; oy < odm[1]; oy++) { - const dim_t oYZW = oZW + oy * ost[1]; - const dim_t iYZW = iZW + oy * ist[1]; + for(dim_t oy = 0; oy < odm[1]; oy++) { + const dim_t oYZW = oZW + oy * ost[1]; + const dim_t iYZW = iZW + oy * ist[1]; - for(dim_t ox = 0; ox < odm[0]; ox++) { - const dim_t oMem = oYZW + ox; - const dim_t iMem = iYZW + ox; + for(dim_t ox = 0; ox < odm[0]; ox++) { + const dim_t oMem = oYZW + ox; + const dim_t iMem = iYZW + ox; - bool cond = is_upper ? (oy >= ox) : (oy <= ox); - bool do_unit_diag = (is_unit_diag && ox == oy); - if(cond) { - o[oMem] = do_unit_diag ? scalar(1) : i[iMem]; - } else { - o[oMem] = scalar(0); - } + bool cond = is_upper ? (oy >= ox) : (oy <= ox); + bool do_unit_diag = (is_unit_diag && ox == oy); + if(cond) { + o[oMem] = do_unit_diag ? 
scalar(1) : i[iMem]; + } else { + o[oMem] = scalar(0); + } + } } } } - } + }; + + getQueue().enqueue(func, out, in); } template From 551433e7aaf045d9c22ed4a2c107d7f9f1149b59 Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 19 Nov 2015 19:02:51 -0500 Subject: [PATCH 022/288] converted lu & cholesky decomposition functions to async calls --- src/backend/cpu/cholesky.cpp | 12 +++++++++--- src/backend/cpu/lu.cpp | 20 ++++++++++---------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/backend/cpu/cholesky.cpp b/src/backend/cpu/cholesky.cpp index 57beaa4146..d0bd3c8787 100644 --- a/src/backend/cpu/cholesky.cpp +++ b/src/backend/cpu/cholesky.cpp @@ -18,8 +18,9 @@ #include #include #include - #include +#include +#include namespace cpu { @@ -65,8 +66,13 @@ int cholesky_inplace(Array &in, const bool is_upper) if(is_upper) uplo = 'U'; - int info = potrf_func()(AF_LAPACK_COL_MAJOR, uplo, - N, in.get(), in.strides()[1]); + int info = 0; + auto func = [&] (int& info, Array& in) { + info = potrf_func()(AF_LAPACK_COL_MAJOR, uplo, N, in.get(), in.strides()[1]); + }; + + getQueue().enqueue(func, info, in); + getQueue().sync(); return info; } diff --git a/src/backend/cpu/lu.cpp b/src/backend/cpu/lu.cpp index 0eefb16816..ed165cba8e 100644 --- a/src/backend/cpu/lu.cpp +++ b/src/backend/cpu/lu.cpp @@ -17,9 +17,10 @@ #include #include #include - #include #include +#include +#include namespace cpu { @@ -128,23 +129,22 @@ void lu(Array &lower, Array &upper, Array &pivot, const Array &in) lower = createEmptyArray(ldims); upper = createEmptyArray(udims); - lu_split(lower, upper, in_copy); + getQueue().enqueue(lu_split, lower, upper, in_copy); } template Array lu_inplace(Array &in, const bool convert_pivot) { dim4 iDims = in.dims(); - int M = iDims[0]; - int N = iDims[1]; - - Array pivot = createEmptyArray(af::dim4(min(M, N), 1, 1, 1)); + Array pivot = createEmptyArray(af::dim4(min(iDims[0], iDims[1]), 1, 1, 1)); - getrf_func()(AF_LAPACK_COL_MAJOR, M, N, - in.get(), 
in.strides()[1], - pivot.get()); + auto func = [=] (Array in, Array pivot, const bool convert_pivot) { + dim4 iDims = in.dims(); + getrf_func()(AF_LAPACK_COL_MAJOR, iDims[0], iDims[1], in.get(), in.strides()[1], pivot.get()); + if(convert_pivot) convertPivot(pivot, iDims[0]); + }; - if(convert_pivot) convertPivot(pivot, M); + getQueue().enqueue(func, in, pivot, convert_pivot); return pivot; } From ed6d26da36df33e1469194d8b3d05e628a9fc26c Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 19 Nov 2015 19:13:54 -0500 Subject: [PATCH 023/288] svd cpu backend is async now --- src/backend/cpu/svd.cpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/backend/cpu/svd.cpp b/src/backend/cpu/svd.cpp index 461b9014aa..33bfab75aa 100644 --- a/src/backend/cpu/svd.cpp +++ b/src/backend/cpu/svd.cpp @@ -10,12 +10,13 @@ #include #include #include - #include #if defined(WITH_CPU_LINEAR_ALGEBRA) #include #include +#include +#include namespace cpu { @@ -67,18 +68,21 @@ namespace cpu template void svdInPlace(Array &s, Array &u, Array &vt, Array &in) { - dim4 iDims = in.dims(); - int M = iDims[0]; - int N = iDims[1]; + auto func = [=] (Array s, Array u, Array vt, Array in) { + dim4 iDims = in.dims(); + int M = iDims[0]; + int N = iDims[1]; #if defined(USE_MKL) || defined(__APPLE__) - svd_func()(AF_LAPACK_COL_MAJOR, 'A', M, N, in.get(), in.strides()[1], - s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1]); + svd_func()(AF_LAPACK_COL_MAJOR, 'A', M, N, in.get(), in.strides()[1], + s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1]); #else - std::vector superb(std::min(M, N)); - svd_func()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, in.get(), in.strides()[1], - s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1], &superb[0]); + std::vector superb(std::min(M, N)); + svd_func()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, in.get(), in.strides()[1], + s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1], &superb[0]); #endif + }; + 
getQueue().enqueue(func, s, u, vt, in); } template From ed730cfcd174110a7483a9d8eca881672bab8e83 Mon Sep 17 00:00:00 2001 From: syurkevi Date: Fri, 20 Nov 2015 13:14:14 -0500 Subject: [PATCH 024/288] adds scatter function --- include/af/defines.h | 16 ++++++++++++++ include/af/graphics.h | 35 ++++++++++++++++++++++++++++++ src/api/c/graphics_common.cpp | 7 +++--- src/api/c/graphics_common.hpp | 2 +- src/api/c/plot.cpp | 40 +++++++++++++++++++++++++++-------- src/api/cpp/graphics.cpp | 6 ++++++ src/api/unified/graphics.cpp | 6 ++++++ 7 files changed, 99 insertions(+), 13 deletions(-) diff --git a/include/af/defines.h b/include/af/defines.h index a25d23996d..2b53baabed 100644 --- a/include/af/defines.h +++ b/include/af/defines.h @@ -378,6 +378,19 @@ typedef enum { AF_ID = 0 } af_someenum_t; +#if AF_API_VERSION >=32 +typedef enum { + AF_MARKER_NONE = 0, + AF_MARKER_POINT = 1, + AF_MARKER_CIRCLE = 2, + AF_MARKER_SQUARE = 3, + AF_MARKER_TRIANGLE = 4, + AF_MARKER_CROSS = 5, + AF_MARKER_PLUS = 6, + AF_MARKER_STAR = 7 +} af_marker_type; +#endif + #ifdef __cplusplus namespace af { @@ -404,6 +417,9 @@ namespace af #if AF_API_VERSION >= 32 typedef af_backend Backend; #endif +#if AF_API_VERSION >= 32 + typedef af_marker_type markerType; +#endif } #endif diff --git a/include/af/graphics.h b/include/af/graphics.h index 5c143c721e..e4286e1ea7 100644 --- a/include/af/graphics.h +++ b/include/af/graphics.h @@ -180,6 +180,20 @@ class AFAPI Window { void plot(const array& X, const array& Y, const char* const title=NULL); + /** + Renders the input arrays as a 2D scatter-plot to the window + + \param[in] X is an \ref array with the x-axis data points + \param[in] Y is an \ref array with the y-axis data points + \param[in] marker is an \ref markerType enum specifying which marker to use in the scatter plot + \param[in] title parameter is used when this function is called in grid mode + + \note \p X and \p Y should be vectors. 
+ + \ingroup gfx_func_draw + */ + + void scatter(const array& X, const array& Y, const af::markerType marker=AF_MARKER_POINT, const char* const title=NULL); /** Renders the input array as a histogram to the window @@ -371,6 +385,27 @@ AFAPI af_err af_draw_image(const af_window wind, const af_array in, const af_cel */ AFAPI af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, const af_cell* const props); +#if AF_API_VERSION >= 32 +/** + C Interface wrapper for drawing an array as a plot + + \param[in] wind is the window handle + \param[in] X is an \ref af_array with the x-axis data points + \param[in] Y is an \ref af_array with the y-axis data points + \param[in] props is structure \ref af_cell that has the properties that are used + \param[in] marker is an \ref markerType enum specifying which marker to use in the scatter plot + for the current rendering. + + \return \ref AF_SUCCESS if rendering is successful, otherwise an appropriate error code + is returned. + + \note \p X and \p Y should be vectors. 
+ + \ingroup gfx_func_draw +*/ +AFAPI af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_cell* const props, const af_marker_type marker); +#endif + #if AF_API_VERSION >= 32 /** C Interface wrapper for drawing an array as a plot diff --git a/src/api/c/graphics_common.cpp b/src/api/c/graphics_common.cpp index 4b50bc046e..92346f59d1 100644 --- a/src/api/c/graphics_common.cpp +++ b/src/api/c/graphics_common.cpp @@ -161,7 +161,7 @@ fg::Image* ForgeManager::getImage(int w, int h, fg::ChannelFormat mode, fg::dtyp return mImgMap[key]; } -fg::Plot* ForgeManager::getPlot(int nPoints, fg::dtype type) +fg::Plot* ForgeManager::getPlot(int nPoints, fg::dtype dtype, fg::PlotType ptype, fg::MarkerType mtype) { /* nPoints needs to fall in the range of [0, 2^48] * for the ForgeManager to correctly retrieve @@ -169,11 +169,12 @@ fg::Plot* ForgeManager::getPlot(int nPoints, fg::dtype type) * is a limitation on how big of an plot graph can be rendered * using arrayfire graphics funtionality */ assert(nPoints <= 2ll<<48); - long long key = ((nPoints & _48BIT) << 48) | (type & _16BIT); + long long key = ((nPoints & _48BIT) << 48); + key |= (((((dtype & 0x000F) << 12) | (ptype & 0x000F)) << 8) | (mtype & 0x000F)); PltMapIter iter = mPltMap.find(key); if (iter==mPltMap.end()) { - fg::Plot* temp = new fg::Plot(nPoints, type); + fg::Plot* temp = new fg::Plot(nPoints, dtype, ptype, mtype); mPltMap[key] = temp; } diff --git a/src/api/c/graphics_common.hpp b/src/api/c/graphics_common.hpp index 39225e6a0c..caadb88cd9 100644 --- a/src/api/c/graphics_common.hpp +++ b/src/api/c/graphics_common.hpp @@ -82,7 +82,7 @@ class ForgeManager fg::Font* getFont(const bool dontCreate=false); fg::Window* getMainWindow(const bool dontCreate=false); fg::Image* getImage(int w, int h, fg::ChannelFormat mode, fg::dtype type); - fg::Plot* getPlot(int nPoints, fg::dtype type); + fg::Plot* getPlot(int nPoints, fg::dtype dtype, fg::PlotType ptype, fg::MarkerType mtype); fg::Plot3* 
getPlot3(int nPoints, fg::dtype type); fg::Histogram* getHistogram(int nBins, fg::dtype type); fg::Surface* getSurface(int nX, int nY, fg::dtype type); diff --git a/src/api/c/plot.cpp b/src/api/c/plot.cpp index b22e92850b..f2740305ea 100644 --- a/src/api/c/plot.cpp +++ b/src/api/c/plot.cpp @@ -27,7 +27,7 @@ using namespace detail; using namespace graphics; template -fg::Plot* setup_plot(const af_array X, const af_array Y) +fg::Plot* setup_plot(const af_array X, const af_array Y, fg::PlotType type, fg::MarkerType marker) { Array xIn = getArray(X); Array yIn = getArray(Y); @@ -46,7 +46,7 @@ fg::Plot* setup_plot(const af_array X, const af_array Y) af::dim4 X_dims = Xinfo.dims(); ForgeManager& fgMngr = ForgeManager::getInstance(); - fg::Plot* plot = fgMngr.getPlot(X_dims.elements(), getGLType()); + fg::Plot* plot = fgMngr.getPlot(X_dims.elements(), getGLType(), type, marker); plot->setColor(1.0, 0.0, 0.0); plot->setAxesLimits(xmax, xmin, ymax, ymin); plot->setAxesTitles("X Axis", "Y Axis"); @@ -57,7 +57,7 @@ fg::Plot* setup_plot(const af_array X, const af_array Y) } #endif -af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, const af_cell* const props) +af_err plotWrapper(const af_window wind, const af_array X, const af_array Y, const af_cell* const props, fg::PlotType type=fg::FG_LINE, fg::MarkerType marker=fg::FG_NONE) { #if defined(WITH_GRAPHICS) if(wind==0) { @@ -85,12 +85,12 @@ af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, co fg::Plot* plot = NULL; switch(Xtype) { - case f32: plot = setup_plot(X, Y); break; - case s32: plot = setup_plot(X, Y); break; - case u32: plot = setup_plot(X, Y); break; - case s16: plot = setup_plot(X, Y); break; - case u16: plot = setup_plot(X, Y); break; - case u8 : plot = setup_plot(X, Y); break; + case f32: plot = setup_plot(X, Y, type, marker); break; + case s32: plot = setup_plot(X, Y, type, marker); break; + case u32: plot = setup_plot(X, Y, type, marker); break; + case s16: 
plot = setup_plot(X, Y, type, marker); break; + case u16: plot = setup_plot(X, Y, type, marker); break; + case u8 : plot = setup_plot(X, Y, type, marker); break; default: TYPE_ERROR(1, Xtype); } @@ -105,3 +105,25 @@ af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, co return AF_ERR_NO_GFX; #endif } + +af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, const af_cell* const props) +{ + return plotWrapper(wind, X, Y, props); +} + +af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_cell* const props, const af::markerType af_marker) +{ + fg::MarkerType fg_marker; + switch(af_marker){ + case AF_MARKER_NONE: fg_marker = fg::FG_NONE; break; + case AF_MARKER_POINT: fg_marker = fg::FG_POINT; break; + case AF_MARKER_CIRCLE: fg_marker = fg::FG_CIRCLE; break; + case AF_MARKER_SQUARE: fg_marker = fg::FG_SQUARE; break; + case AF_MARKER_TRIANGLE: fg_marker = fg::FG_TRIANGLE; break; + case AF_MARKER_CROSS: fg_marker = fg::FG_CROSS; break; + case AF_MARKER_PLUS: fg_marker = fg::FG_PLUS; break; + case AF_MARKER_STAR: fg_marker = fg::FG_STAR; break; + default: fg_marker = fg::FG_NONE; break; + } + return plotWrapper(wind, X, Y, props, fg::FG_SCATTER, fg_marker); +} diff --git a/src/api/cpp/graphics.cpp b/src/api/cpp/graphics.cpp index b7480195dc..8d2d8cd4b6 100644 --- a/src/api/cpp/graphics.cpp +++ b/src/api/cpp/graphics.cpp @@ -79,6 +79,12 @@ void Window::plot(const array& X, const array& Y, const char* const title) AF_THROW(af_draw_plot(get(), X.get(), Y.get(), &temp)); } +void Window::scatter(const array& X, const array& Y, af::markerType marker, const char* const title) +{ + af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT}; + AF_THROW(af_draw_scatter(get(), X.get(), Y.get(), &temp, marker)); +} + void Window::plot3(const array& P, const char* const title) { af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT}; diff --git a/src/api/unified/graphics.cpp b/src/api/unified/graphics.cpp index 
81076f233c..596429318f 100644 --- a/src/api/unified/graphics.cpp +++ b/src/api/unified/graphics.cpp @@ -44,6 +44,12 @@ af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, co return CALL(wind, X, Y, props); } +af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_cell* const props, const af_marker_type marker) +{ + CHECK_ARRAYS(X, Y); + return CALL(wind, X, Y, props, marker); +} + af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* const props) { CHECK_ARRAYS(P); From 0a78b60dd57056353b3265a56428577db52f98a5 Mon Sep 17 00:00:00 2001 From: syurkevi Date: Fri, 20 Nov 2015 13:15:06 -0500 Subject: [PATCH 025/288] update plot2d example to include scatter plot --- examples/graphics/plot2d.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/graphics/plot2d.cpp b/examples/graphics/plot2d.cpp index 7e28d34ebd..c6d3452c3b 100644 --- a/examples/graphics/plot2d.cpp +++ b/examples/graphics/plot2d.cpp @@ -21,17 +21,21 @@ int main(int argc, char *argv[]) try { // Initialize the kernel array just once af::info(); - af::Window myWindow(512, 512, "2D Plot example: ArrayFire"); + af::Window myWindow(1024, 512, "2D Plot example: ArrayFire"); array Y; int sign = 1; array X = seq(-af::Pi, af::Pi, PRECISION); + myWindow.grid(1, 2); for (double val=-af::Pi; !myWindow.close(); ) { Y = sin(X); - myWindow.plot(X, Y); + myWindow(0,0).plot(X, Y); + myWindow(0,1).scatter(X, Y, AF_MARKER_POINT); + + myWindow.show(); X = X + PRECISION * float(sign); val += PRECISION * float(sign); From e0d7c12d97a69d950623691b4c17c3755b1387f1 Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 20 Nov 2015 13:58:35 -0500 Subject: [PATCH 026/288] converted qr & solve cpu functions to async calls Fixed lu async function --- src/backend/cpu/lu.cpp | 66 ++++++++------------ src/backend/cpu/qr.cpp | 56 +++++++---------- src/backend/cpu/solve.cpp | 127 +++++++++++++++++--------------------- 3 files 
changed, 105 insertions(+), 144 deletions(-) diff --git a/src/backend/cpu/lu.cpp b/src/backend/cpu/lu.cpp index ed165cba8e..ff0be438ee 100644 --- a/src/backend/cpu/lu.cpp +++ b/src/backend/cpu/lu.cpp @@ -11,7 +11,6 @@ #include #if defined(WITH_CPU_LINEAR_ALGEBRA) - #include #include #include @@ -26,9 +25,7 @@ namespace cpu { template -using getrf_func_def = int (*)(ORDER_TYPE, int, int, - T*, int, - int*); +using getrf_func_def = int (*)(ORDER_TYPE, int, int, T*, int, int*); #define LU_FUNC_DEF( FUNC ) \ template FUNC##_func_def FUNC##_func(); @@ -45,7 +42,7 @@ LU_FUNC(getrf , cfloat , c) LU_FUNC(getrf , cdouble, z) template -void lu_split(Array &lower, Array &upper, const Array &in) +void lu_split(Array lower, Array upper, const Array in) { T *l = lower.get(); T *u = upper.get(); @@ -54,7 +51,6 @@ void lu_split(Array &lower, Array &upper, const Array &in) dim4 ldm = lower.dims(); dim4 udm = upper.dims(); dim4 idm = in.dims(); - dim4 lst = lower.strides(); dim4 ust = upper.strides(); dim4 ist = in.strides(); @@ -79,20 +75,14 @@ void lu_split(Array &lower, Array &upper, const Array &in) const dim_t uMem = uYZW + ox; const dim_t iMem = iYZW + ox; if(ox > oy) { - if(oy < ldm[1]) - l[lMem] = i[iMem]; - if(ox < udm[0]) - u[uMem] = scalar(0); + if(oy < ldm[1]) l[lMem] = i[iMem]; + if(ox < udm[0]) u[uMem] = scalar(0); } else if (oy > ox) { - if(oy < ldm[1]) - l[lMem] = scalar(0); - if(ox < udm[0]) - u[uMem] = i[iMem]; + if(oy < ldm[1]) l[lMem] = scalar(0); + if(ox < udm[0]) u[uMem] = i[iMem]; } else if(ox == oy) { - if(oy < ldm[1]) - l[lMem] = scalar(1.0); - if(ox < udm[0]) - u[uMem] = i[iMem]; + if(oy < ldm[1]) l[lMem] = scalar(1.0); + if(ox < udm[0]) u[uMem] = i[iMem]; } } } @@ -100,17 +90,15 @@ void lu_split(Array &lower, Array &upper, const Array &in) } } -void convertPivot(Array &pivot, int out_sz) +void convertPivot(Array p, Array pivot) { - Array p = range(dim4(out_sz), 0); int *d_pi = pivot.get(); int *d_po = p.get(); - dim_t d0 = pivot.dims()[0]; + dim_t d0 = 
pivot.dims()[0]; for(int j = 0; j < (int)d0; j++) { // 1 indexed in pivot std::swap(d_po[j], d_po[d_pi[j] - 1]); } - pivot = p; } template @@ -138,26 +126,21 @@ Array lu_inplace(Array &in, const bool convert_pivot) dim4 iDims = in.dims(); Array pivot = createEmptyArray(af::dim4(min(iDims[0], iDims[1]), 1, 1, 1)); - auto func = [=] (Array in, Array pivot, const bool convert_pivot) { + auto func = [=] (Array in, Array pivot) { dim4 iDims = in.dims(); getrf_func()(AF_LAPACK_COL_MAJOR, iDims[0], iDims[1], in.get(), in.strides()[1], pivot.get()); - if(convert_pivot) convertPivot(pivot, iDims[0]); }; - - getQueue().enqueue(func, in, pivot, convert_pivot); - - return pivot; + getQueue().enqueue(func, in, pivot); + + if(convert_pivot) { + Array p = range(dim4(iDims[0]), 0); + getQueue().enqueue(convertPivot, p, pivot); + return p; + } else { + return pivot; + } } -#define INSTANTIATE_LU(T) \ - template Array lu_inplace(Array &in, const bool convert_pivot); \ - template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); - -INSTANTIATE_LU(float) -INSTANTIATE_LU(cfloat) -INSTANTIATE_LU(double) -INSTANTIATE_LU(cdouble) - } #else @@ -177,6 +160,12 @@ Array lu_inplace(Array &in, const bool convert_pivot) AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED); } +} + +#endif + +namespace cpu +{ #define INSTANTIATE_LU(T) \ template Array lu_inplace(Array &in, const bool convert_pivot); \ template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); @@ -185,7 +174,4 @@ INSTANTIATE_LU(float) INSTANTIATE_LU(cfloat) INSTANTIATE_LU(double) INSTANTIATE_LU(cdouble) - } - -#endif diff --git a/src/backend/cpu/qr.cpp b/src/backend/cpu/qr.cpp index d1c3e233af..b5f18064f5 100644 --- a/src/backend/cpu/qr.cpp +++ b/src/backend/cpu/qr.cpp @@ -11,28 +11,23 @@ #include #if defined(WITH_CPU_LINEAR_ALGEBRA) - #include #include -#include #include #include #include - #include +#include +#include namespace cpu { template -using geqrf_func_def = int 
(*)(ORDER_TYPE, int, int, - T*, int, - T*); +using geqrf_func_def = int (*)(ORDER_TYPE, int, int, T*, int, T*); template -using gqr_func_def = int (*)(ORDER_TYPE, int, int, int, - T*, int, - const T*); +using gqr_func_def = int (*)(ORDER_TYPE, int, int, int, T*, int, const T*); #define QR_FUNC_DEF( FUNC ) \ template FUNC##_func_def FUNC##_func(); @@ -65,8 +60,8 @@ template void qr(Array &q, Array &r, Array &t, const Array &in) { dim4 iDims = in.dims(); - int M = iDims[0]; - int N = iDims[1]; + int M = iDims[0]; + int N = iDims[1]; q = padArray(in, dim4(M, max(M, N))); q.resetDims(iDims); @@ -78,39 +73,29 @@ void qr(Array &q, Array &r, Array &t, const Array &in) triangle(r, q); - gqr_func()(AF_LAPACK_COL_MAJOR, - M, M, min(M, N), - q.get(), q.strides()[1], - t.get()); - + auto func = [=] (Array q, Array t, int M, int N) { + gqr_func()(AF_LAPACK_COL_MAJOR, M, M, min(M, N), q.get(), q.strides()[1], t.get()); + }; q.resetDims(dim4(M, M)); + getQueue().enqueue(func, q, t, M, N); } template Array qr_inplace(Array &in) { dim4 iDims = in.dims(); - int M = iDims[0]; - int N = iDims[1]; - + int M = iDims[0]; + int N = iDims[1]; Array t = createEmptyArray(af::dim4(min(M, N), 1, 1, 1)); - geqrf_func()(AF_LAPACK_COL_MAJOR, M, N, - in.get(), in.strides()[1], - t.get()); + auto func = [=] (Array in, Array t, int M, int N) { + geqrf_func()(AF_LAPACK_COL_MAJOR, M, N, in.get(), in.strides()[1], t.get()); + }; + getQueue().enqueue(func, in, t, M, N); return t; } -#define INSTANTIATE_QR(T) \ - template Array qr_inplace(Array &in); \ - template void qr(Array &q, Array &r, Array &t, const Array &in); - -INSTANTIATE_QR(float) -INSTANTIATE_QR(cfloat) -INSTANTIATE_QR(double) -INSTANTIATE_QR(cdouble) - } #else @@ -130,6 +115,12 @@ Array qr_inplace(Array &in) AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED); } +} + +#endif + +namespace cpu +{ #define INSTANTIATE_QR(T) \ template Array qr_inplace(Array &in); \ template void qr(Array &q, Array &r, Array &t, const Array 
&in); @@ -138,7 +129,4 @@ INSTANTIATE_QR(float) INSTANTIATE_QR(cfloat) INSTANTIATE_QR(double) INSTANTIATE_QR(cdouble) - } - -#endif diff --git a/src/backend/cpu/solve.cpp b/src/backend/cpu/solve.cpp index 1e88e8d915..b279971c7b 100644 --- a/src/backend/cpu/solve.cpp +++ b/src/backend/cpu/solve.cpp @@ -11,52 +11,40 @@ #include #if defined(WITH_CPU_LINEAR_ALGEBRA) - #include #include -#include -#include #include #include - #include +#include +#include namespace cpu { template using gesv_func_def = int (*)(ORDER_TYPE, int, int, - T *, int, - int *, - T *, int); + T *, int, int *, T *, int); template -using gels_func_def = int (*)(ORDER_TYPE, char, - int, int, int, - T *, int, - T *, int); +using gels_func_def = int (*)(ORDER_TYPE, char, int, int, int, + T *, int, T *, int); template -using getrs_func_def = int (*)(ORDER_TYPE, char, - int, int, - const T *, int, - const int *, - T *, int); +using getrs_func_def = int (*)(ORDER_TYPE, char, int, int, + const T *, int, const int *, T *, int); template -using trtrs_func_def = int (*)(ORDER_TYPE, - char, char, char, - int, int, - const T *, int, - T *, int); +using trtrs_func_def = int (*)(ORDER_TYPE, char, char, char, int, int, + const T *, int, T *, int); -#define SOLVE_FUNC_DEF( FUNC ) \ +#define SOLVE_FUNC_DEF( FUNC ) \ template FUNC##_func_def FUNC##_func(); -#define SOLVE_FUNC( FUNC, TYPE, PREFIX ) \ -template<> FUNC##_func_def FUNC##_func() \ +#define SOLVE_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ { return & LAPACK_NAME(PREFIX##FUNC); } SOLVE_FUNC_DEF( gesv ) @@ -87,16 +75,16 @@ template Array solveLU(const Array &A, const Array &pivot, const Array &b, const af_mat_prop options) { - int N = A.dims()[0]; - int NRHS = b.dims()[1]; - + int N = A.dims()[0]; + int NRHS = b.dims()[1]; Array< T > B = copyArray(b); - getrs_func()(AF_LAPACK_COL_MAJOR, 'N', - N, NRHS, - A.get(), A.strides()[1], - pivot.get(), - B.get(), B.strides()[1]); + auto func = [=] (Array A, Array B, Array pivot, int N, 
int NRHS) { + getrs_func()(AF_LAPACK_COL_MAJOR, 'N', + N, NRHS, A.get(), A.strides()[1], + pivot.get(), B.get(), B.strides()[1]); + }; + getQueue().enqueue(func, A, B, pivot, N, NRHS); return B; } @@ -105,16 +93,20 @@ template Array triangleSolve(const Array &A, const Array &b, const af_mat_prop options) { Array B = copyArray(b); - int N = B.dims()[0]; - int NRHS = B.dims()[1]; - - trtrs_func()(AF_LAPACK_COL_MAJOR, - options & AF_MAT_UPPER ? 'U' : 'L', - 'N', // transpose flag - options & AF_MAT_DIAG_UNIT ? 'U' : 'N', - N, NRHS, - A.get(), A.strides()[1], - B.get(), B.strides()[1]); + int N = B.dims()[0]; + int NRHS = B.dims()[1]; + + auto func = [=] (Array A, Array B, int N, int NRHS, const af_mat_prop options) { + trtrs_func()(AF_LAPACK_COL_MAJOR, + options & AF_MAT_UPPER ? 'U' : 'L', + 'N', // transpose flag + options & AF_MAT_DIAG_UNIT ? 'U' : 'N', + N, NRHS, + A.get(), A.strides()[1], + B.get(), B.strides()[1]); + }; + getQueue().enqueue(func, A, B, N, NRHS, options); + return B; } @@ -132,41 +124,34 @@ Array solve(const Array &a, const Array &b, const af_mat_prop options) int N = a.dims()[1]; int K = b.dims()[1]; - Array A = copyArray(a); Array B = padArray(b, dim4(max(M, N), K)); if(M == N) { Array pivot = createEmptyArray(dim4(N, 1, 1)); - gesv_func()(AF_LAPACK_COL_MAJOR, N, K, - A.get(), A.strides()[1], - pivot.get(), - B.get(), B.strides()[1]); + + auto func = [=] (Array A, Array B, Array pivot, int N, int K) { + gesv_func()(AF_LAPACK_COL_MAJOR, N, K, A.get(), A.strides()[1], + pivot.get(), B.get(), B.strides()[1]); + }; + getQueue().enqueue(func, A, B, pivot, N, K); } else { - int sM = a.strides()[1]; - int sN = a.strides()[2] / sM; + auto func = [=] (Array A, Array B, int M, int N, int K) { + int sM = A.strides()[1]; + int sN = A.strides()[2] / sM; - gels_func()(AF_LAPACK_COL_MAJOR, 'N', - M, N, K, - A.get(), A.strides()[1], - B.get(), max(sM, sN)); + gels_func()(AF_LAPACK_COL_MAJOR, 'N', + M, N, K, + A.get(), A.strides()[1], + B.get(), max(sM, sN)); + 
}; B.resetDims(dim4(N, K)); + getQueue().enqueue(func, A, B, M, N, K); } return B; } -#define INSTANTIATE_SOLVE(T) \ - template Array solve(const Array &a, const Array &b, \ - const af_mat_prop options); \ - template Array solveLU(const Array &A, const Array &pivot, \ - const Array &b, const af_mat_prop options); \ - -INSTANTIATE_SOLVE(float) -INSTANTIATE_SOLVE(cfloat) -INSTANTIATE_SOLVE(double) -INSTANTIATE_SOLVE(cdouble) - } #else @@ -178,17 +163,21 @@ template Array solveLU(const Array &A, const Array &pivot, const Array &b, const af_mat_prop options) { - AF_ERROR("Linear Algebra is diabled on CPU", - AF_ERR_NOT_CONFIGURED); + AF_ERROR("Linear Algebra is diabled on CPU", AF_ERR_NOT_CONFIGURED); } template Array solve(const Array &a, const Array &b, const af_mat_prop options) { - AF_ERROR("Linear Algebra is diabled on CPU", - AF_ERR_NOT_CONFIGURED); + AF_ERROR("Linear Algebra is diabled on CPU", AF_ERR_NOT_CONFIGURED); +} + } +#endif + +namespace cpu +{ #define INSTANTIATE_SOLVE(T) \ template Array solve(const Array &a, const Array &b, \ const af_mat_prop options); \ @@ -200,5 +189,3 @@ INSTANTIATE_SOLVE(cfloat) INSTANTIATE_SOLVE(double) INSTANTIATE_SOLVE(cdouble) } - -#endif From d0223f980047dfee315569eaf359105377e978b7 Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 20 Nov 2015 15:48:10 -0500 Subject: [PATCH 027/288] Converted wrap & unwrap cpu fns to async calls --- src/backend/cpu/unwrap.cpp | 173 +++++++++++++++++++------------------ src/backend/cpu/wrap.cpp | 171 ++++++++++++++++++------------------ 2 files changed, 175 insertions(+), 169 deletions(-) diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp index f9c25f9a9e..efb46be7f4 100644 --- a/src/backend/cpu/unwrap.cpp +++ b/src/backend/cpu/unwrap.cpp @@ -13,112 +13,115 @@ #include #include #include +#include +#include namespace cpu { - template - void unwrap_dim(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, 
- const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy, - const dim_t px, const dim_t py) - { - dim_t nx = (idims[0] + 2 * px - wx) / sx + 1; - - for(dim_t w = 0; w < odims[3]; w++) { - for(dim_t z = 0; z < odims[2]; z++) { - - dim_t cOut = w * ostrides[3] + z * ostrides[2]; - dim_t cIn = w * istrides[3] + z * istrides[2]; - const T* iptr = inPtr + cIn; - T* optr_= outPtr + cOut; - - for(dim_t col = 0; col < odims[d]; col++) { - // Offset output ptr - T* optr = optr_ + col * ostrides[d]; - - // Calculate input window index - dim_t winy = (col / nx); - dim_t winx = (col % nx); - - dim_t startx = winx * sx; - dim_t starty = winy * sy; - - dim_t spx = startx - px; - dim_t spy = starty - py; - - // Short cut condition ensuring all values within input dimensions - bool cond = (spx >= 0 && spx + wx < idims[0] && spy >= 0 && spy + wy < idims[1]); - - for(dim_t y = 0; y < wy; y++) { - for(dim_t x = 0; x < wx; x++) { - dim_t xpad = spx + x; - dim_t ypad = spy + y; - - dim_t oloc = (y * wx + x); - if (d == 0) oloc *= ostrides[1]; - - if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) { - dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]); - optr[oloc] = iptr[iloc]; - } else { - optr[oloc] = scalar(0.0); - } + +template +void unwrap_dim(Array out, const Array in, const dim_t wx, const dim_t wy, + const dim_t sx, const dim_t sy, const dim_t px, const dim_t py) +{ + const T *inPtr = in.get(); + T *outPtr = out.get(); + + af::dim4 idims = in.dims(); + af::dim4 odims = out.dims(); + af::dim4 istrides = in.strides(); + af::dim4 ostrides = out.strides(); + + dim_t nx = (idims[0] + 2 * px - wx) / sx + 1; + + for(dim_t w = 0; w < odims[3]; w++) { + for(dim_t z = 0; z < odims[2]; z++) { + + dim_t cOut = w * ostrides[3] + z * ostrides[2]; + dim_t cIn = w * istrides[3] + z * istrides[2]; + const T* iptr = inPtr + cIn; + T* optr_= outPtr + cOut; + + for(dim_t col = 0; col < odims[d]; col++) { + // Offset output ptr + T* optr = optr_ + col * 
ostrides[d]; + + // Calculate input window index + dim_t winy = (col / nx); + dim_t winx = (col % nx); + + dim_t startx = winx * sx; + dim_t starty = winy * sy; + + dim_t spx = startx - px; + dim_t spy = starty - py; + + // Short cut condition ensuring all values within input dimensions + bool cond = (spx >= 0 && spx + wx < idims[0] && spy >= 0 && spy + wy < idims[1]); + + for(dim_t y = 0; y < wy; y++) { + for(dim_t x = 0; x < wx; x++) { + dim_t xpad = spx + x; + dim_t ypad = spy + y; + + dim_t oloc = (y * wx + x); + if (d == 0) oloc *= ostrides[1]; + + if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) { + dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]); + optr[oloc] = iptr[iloc]; + } else { + optr[oloc] = scalar(0.0); } } } } } } +} - template - Array unwrap(const Array &in, const dim_t wx, const dim_t wy, - const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column) - { - af::dim4 idims = in.dims(); - - dim_t nx = (idims[0] + 2 * px - wx) / sx + 1; - dim_t ny = (idims[1] + 2 * py - wy) / sy + 1; - - af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]); +template +Array unwrap(const Array &in, const dim_t wx, const dim_t wy, + const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column) +{ + af::dim4 idims = in.dims(); - if (!is_column) { - std::swap(odims[0], odims[1]); - } + dim_t nx = (idims[0] + 2 * px - wx) / sx + 1; + dim_t ny = (idims[1] + 2 * py - wy) / sy + 1; - // Create output placeholder - Array outArray = createEmptyArray(odims); + af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]); - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = outArray.get(); + if (!is_column) { + std::swap(odims[0], odims[1]); + } - af::dim4 ostrides = outArray.strides(); - af::dim4 istrides = in.strides(); + Array outArray = createEmptyArray(odims); - if (is_column) { - unwrap_dim(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py); - } else { - 
unwrap_dim(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py); - } - return outArray; + if (is_column) { + getQueue().enqueue(unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); + } else { + getQueue().enqueue(unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); } + return outArray; +} + #define INSTANTIATE(T) \ template Array unwrap (const Array &in, const dim_t wx, const dim_t wy, \ const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column); - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(short) - INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) +INSTANTIATE(ushort) + } diff --git a/src/backend/cpu/wrap.cpp b/src/backend/cpu/wrap.cpp index a04a6f5250..3ff54de640 100644 --- a/src/backend/cpu/wrap.cpp +++ b/src/backend/cpu/wrap.cpp @@ -13,92 +13,95 @@ #include #include #include +#include +#include namespace cpu { - template - void wrap_dim(T *outPtr, const T *inPtr, - const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t wx, const dim_t wy, - const dim_t sx, const dim_t sy, - const dim_t px, const dim_t py) - { - dim_t nx = (odims[0] + 2 * px - wx) / sx + 1; - - for(dim_t w = 0; w < idims[3]; w++) { - for(dim_t z = 0; z < idims[2]; z++) { - - dim_t cIn = w * istrides[3] + z * istrides[2]; - dim_t cOut = w * ostrides[3] + z * ostrides[2]; - const T* iptr_ = inPtr + cIn; - T* optr= outPtr + cOut; - - for(dim_t col = 0; col < idims[d]; col++) { - // Offset output ptr - const T* iptr = iptr_ + col * istrides[d]; - - // Calculate input window index - dim_t winy = (col / nx); - dim_t winx = (col % nx); - - 
dim_t startx = winx * sx; - dim_t starty = winy * sy; - - dim_t spx = startx - px; - dim_t spy = starty - py; - - // Short cut condition ensuring all values within input dimensions - bool cond = (spx >= 0 && spx + wx < odims[0] && spy >= 0 && spy + wy < odims[1]); - - for(dim_t y = 0; y < wy; y++) { - for(dim_t x = 0; x < wx; x++) { - dim_t xpad = spx + x; - dim_t ypad = spy + y; - - dim_t iloc = (y * wx + x); - if (d == 0) iloc *= istrides[1]; - - if(cond || (xpad >= 0 && xpad < odims[0] && ypad >= 0 && ypad < odims[1])) { - dim_t oloc = (ypad * ostrides[1] + xpad * ostrides[0]); - // FIXME: When using threads, atomize this - optr[oloc] += iptr[iloc]; - } +template +void wrap_dim(Array out, const Array in, const dim_t wx, const dim_t wy, + const dim_t sx, const dim_t sy, const dim_t px, const dim_t py) +{ + const T *inPtr = in.get(); + T *outPtr = out.get(); + + af::dim4 idims = in.dims(); + af::dim4 odims = out.dims(); + af::dim4 istrides = in.strides(); + af::dim4 ostrides = out.strides(); + + dim_t nx = (odims[0] + 2 * px - wx) / sx + 1; + + for(dim_t w = 0; w < idims[3]; w++) { + for(dim_t z = 0; z < idims[2]; z++) { + + dim_t cIn = w * istrides[3] + z * istrides[2]; + dim_t cOut = w * ostrides[3] + z * ostrides[2]; + const T* iptr_ = inPtr + cIn; + T* optr= outPtr + cOut; + + for(dim_t col = 0; col < idims[d]; col++) { + // Offset output ptr + const T* iptr = iptr_ + col * istrides[d]; + + // Calculate input window index + dim_t winy = (col / nx); + dim_t winx = (col % nx); + + dim_t startx = winx * sx; + dim_t starty = winy * sy; + + dim_t spx = startx - px; + dim_t spy = starty - py; + + // Short cut condition ensuring all values within input dimensions + bool cond = (spx >= 0 && spx + wx < odims[0] && spy >= 0 && spy + wy < odims[1]); + + for(dim_t y = 0; y < wy; y++) { + for(dim_t x = 0; x < wx; x++) { + dim_t xpad = spx + x; + dim_t ypad = spy + y; + + dim_t iloc = (y * wx + x); + if (d == 0) iloc *= istrides[1]; + + if(cond || (xpad >= 0 && xpad < 
odims[0] && ypad >= 0 && ypad < odims[1])) { + dim_t oloc = (ypad * ostrides[1] + xpad * ostrides[0]); + // FIXME: When using threads, atomize this + optr[oloc] += iptr[iloc]; } } } } } } +} - template - Array wrap(const Array &in, - const dim_t ox, const dim_t oy, - const dim_t wx, const dim_t wy, - const dim_t sx, const dim_t sy, - const dim_t px, const dim_t py, - const bool is_column) - { - af::dim4 idims = in.dims(); - af::dim4 odims(ox, oy, idims[2], idims[3]); - Array out = createValueArray(odims, scalar(0)); - - const T *inPtr = in.get(); - T *outPtr = out.get(); - - af::dim4 istrides = in.strides(); - af::dim4 ostrides = out.strides(); - - if (is_column) { - wrap_dim(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py); - } else { - wrap_dim(outPtr, inPtr, odims, idims, ostrides, istrides, wx, wy, sx, sy, px, py); - } +template +Array wrap(const Array &in, + const dim_t ox, const dim_t oy, + const dim_t wx, const dim_t wy, + const dim_t sx, const dim_t sy, + const dim_t px, const dim_t py, + const bool is_column) +{ + af::dim4 idims = in.dims(); + af::dim4 odims(ox, oy, idims[2], idims[3]); + + Array out = createValueArray(odims, scalar(0)); + out.eval(); + in.eval(); - return out; + if (is_column) { + getQueue().enqueue(wrap_dim, out, in, wx, wy, sx, sy, px, py); + } else { + getQueue().enqueue(wrap_dim, out, in, wx, wy, sx, sy, px, py); } + return out; +} + #define INSTANTIATE(T) \ template Array wrap (const Array &in, \ @@ -108,17 +111,17 @@ namespace cpu const dim_t px, const dim_t py, \ const bool is_column); +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) +INSTANTIATE(ushort) - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - 
INSTANTIATE(char) - INSTANTIATE(short) - INSTANTIATE(ushort) } From 32a65d8f390e2893ba91ef6742365f9fd8e7c3c4 Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 20 Nov 2015 16:58:13 -0500 Subject: [PATCH 028/288] converted transform to async call --- src/backend/cpu/transform.cpp | 230 +++++++++++++++++----------------- 1 file changed, 116 insertions(+), 114 deletions(-) diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp index 68e8d96eba..f4a05148c5 100644 --- a/src/backend/cpu/transform.cpp +++ b/src/backend/cpu/transform.cpp @@ -12,136 +12,138 @@ #include #include #include +#include +#include #include "transform_interp.hpp" namespace cpu { - template - void calc_affine_inverse(T *txo, const T *txi) - { - T det = txi[0]*txi[4] - txi[1]*txi[3]; - - txo[0] = txi[4] / det; - txo[1] = txi[3] / det; - txo[3] = txi[1] / det; - txo[4] = txi[0] / det; - - txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; - txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; - } - template - void calc_affine_inverse(T *tmat, const T *tmat_ptr, const bool inverse) - { - // The way kernel is structured, it expects an inverse - // transform matrix by default. - // If it is an forward transform, then we need its inverse - if(inverse) { - for(int i = 0; i < 6; i++) - tmat[i] = tmat_ptr[i]; - } else { - calc_affine_inverse(tmat, tmat_ptr); - } +template +void calc_affine_inverse(T *txo, const T *txi) +{ + T det = txi[0]*txi[4] - txi[1]*txi[3]; + + txo[0] = txi[4] / det; + txo[1] = txi[3] / det; + txo[3] = txi[1] / det; + txo[4] = txi[0] / det; + + txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; + txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; +} + +template +void calc_affine_inverse(T *tmat, const T *tmat_ptr, const bool inverse) +{ + // The way kernel is structured, it expects an inverse + // transform matrix by default. 
+ // If it is an forward transform, then we need its inverse + if(inverse) { + for(int i = 0; i < 6; i++) + tmat[i] = tmat_ptr[i]; + } else { + calc_affine_inverse(tmat, tmat_ptr); } +} - template - void transform_(T *out, const T *in, const float *tf, - const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &tstrides, const bool inverse) - { - dim_t nimages = idims[2]; - // Multiplied in src/backend/transform.cpp - dim_t ntransforms = odims[2] / idims[2]; - - void (*t_fn)(T *, const T *, const float *, const af::dim4 &, - const af::dim4 &, const af::dim4 &, - const dim_t, const dim_t, const dim_t, const dim_t); - - switch(method) { - case AF_INTERP_NEAREST: - t_fn = &transform_n; - break; - case AF_INTERP_BILINEAR: - t_fn = &transform_b; - break; - case AF_INTERP_LOWER: - t_fn = &transform_l; - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; - } +template +void transform_(Array output, const Array input, + const Array transform, const bool inverse) +{ + const af::dim4 idims = input.dims(); + const af::dim4 odims = output.dims(); + const af::dim4 istrides = input.strides(); + const af::dim4 ostrides = output.strides(); + + T * out = output.get(); + const T * in = input.get(); + const float* tf = transform.get(); + + dim_t nimages = idims[2]; + // Multiplied in src/backend/transform.cpp + dim_t ntransforms = odims[2] / idims[2]; + + void (*t_fn)(T *, const T *, const float *, const af::dim4 &, + const af::dim4 &, const af::dim4 &, + const dim_t, const dim_t, const dim_t, const dim_t); + + switch(method) { + case AF_INTERP_NEAREST: + t_fn = &transform_n; + break; + case AF_INTERP_BILINEAR: + t_fn = &transform_b; + break; + case AF_INTERP_LOWER: + t_fn = &transform_l; + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; + } - // For each transform channel - for(int t_idx = 0; t_idx < (int)ntransforms; t_idx++) { - // Compute inverse if 
required - const float *tmat_ptr = tf + t_idx * 6; - float tmat[6]; - calc_affine_inverse(tmat, tmat_ptr, inverse); + // For each transform channel + for(int t_idx = 0; t_idx < (int)ntransforms; t_idx++) { + // Compute inverse if required + const float *tmat_ptr = tf + t_idx * 6; + float tmat[6]; + calc_affine_inverse(tmat, tmat_ptr, inverse); - // Offset for output pointer - dim_t o_offset = t_idx * nimages * ostrides[2]; + // Offset for output pointer + dim_t o_offset = t_idx * nimages * ostrides[2]; - // Do transform for image - for(int yy = 0; yy < (int)odims[1]; yy++) { - for(int xx = 0; xx < (int)odims[0]; xx++) { - t_fn(out, in, tmat, idims, ostrides, istrides, nimages, o_offset, xx, yy); - } + // Do transform for image + for(int yy = 0; yy < (int)odims[1]; yy++) { + for(int xx = 0; xx < (int)odims[0]; xx++) { + t_fn(out, in, tmat, idims, ostrides, istrides, nimages, o_offset, xx, yy); } } } +} - template - Array transform(const Array &in, const Array &transform, const af::dim4 &odims, - const af_interp_type method, const bool inverse) - { - const af::dim4 idims = in.dims(); - - Array out = createEmptyArray(odims); - - switch(method) { - case AF_INTERP_NEAREST: - transform_ - (out.get(), in.get(), transform.get(), odims, idims, - out.strides(), in.strides(), transform.strides(), inverse); - break; - case AF_INTERP_BILINEAR: - transform_ - (out.get(), in.get(), transform.get(), odims, idims, - out.strides(), in.strides(), transform.strides(), inverse); - break; - case AF_INTERP_LOWER: - transform_ - (out.get(), in.get(), transform.get(), odims, idims, - out.strides(), in.strides(), transform.strides(), inverse); - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; - } - - return out; +template +Array transform(const Array &in, const Array &transform, const af::dim4 &odims, + const af_interp_type method, const bool inverse) +{ + Array out = createEmptyArray(odims); + in.eval(); + + switch(method) { + case AF_INTERP_NEAREST : + 
getQueue().enqueue(transform_, out, in, transform, inverse); + break; + case AF_INTERP_BILINEAR: + getQueue().enqueue(transform_, out, in, transform, inverse); + break; + case AF_INTERP_LOWER : + getQueue().enqueue(transform_, out, in, transform, inverse); + break; + default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); break; } + return out; +} + + +#define INSTANTIATE(T) \ +template Array transform(const Array &in, const Array &transform, \ + const af::dim4 &odims, const af_interp_type method, \ + const bool inverse); + + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) +INSTANTIATE(ushort) -#define INSTANTIATE(T) \ - template Array transform(const Array &in, const Array &transform, \ - const af::dim4 &odims, const af_interp_type method, \ - const bool inverse); - - - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(short) - INSTANTIATE(ushort) } From 743fb4a1e57dea918dc4465d4a25dc0279a286a4 Mon Sep 17 00:00:00 2001 From: pradeep Date: Mon, 23 Nov 2015 13:28:55 -0500 Subject: [PATCH 029/288] converted susan fn in cpu backend to asynchronous call --- src/backend/cpu/susan.cpp | 59 +++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp index 77493915c0..e2c908c378 100644 --- a/src/backend/cpu/susan.cpp +++ b/src/backend/cpu/susan.cpp @@ -11,18 +11,25 @@ #include #include #include +#include +#include +#include using af::features; +using std::shared_ptr; namespace cpu { template -void susan_responses(T* resp_out, const T* in, +void susan_responses(Array output, const Array input, const unsigned idim0, const unsigned 
idim1, const int radius, const float t, const float g, const unsigned border_len) { + T* resp_out = output.get(); + const T* in = input.get(); + const unsigned r = border_len; const int rSqrd = radius*radius; @@ -51,10 +58,16 @@ void susan_responses(T* resp_out, const T* in, } template -void non_maximal(float* x_out, float* y_out, float* resp_out, - unsigned* count, const unsigned idim0, const unsigned idim1, - const T* resp_in, const unsigned border_len, const unsigned max_corners) +void non_maximal(Array xcoords, Array ycoords, Array response, + shared_ptr counter, const unsigned idim0, const unsigned idim1, + const Array input, const unsigned border_len, const unsigned max_corners) { + float* x_out = xcoords.get(); + float* y_out = ycoords.get(); + float* resp_out = response.get(); + unsigned* count = counter.get(); + const T* resp_in= input.get(); + // Responses on the border don't have 8-neighbors to compare, discard them const unsigned r = border_len + 1; @@ -94,36 +107,34 @@ unsigned susan(Array &x_out, Array &y_out, Array &resp_out, const float feature_ratio, const unsigned edge) { dim4 idims = in.dims(); - const unsigned corner_lim = in.elements() * feature_ratio; - float* x_corners = memAlloc(corner_lim); - float* y_corners = memAlloc(corner_lim); - float* resp_corners = memAlloc(corner_lim); - T* resp = memAlloc(in.elements()); - unsigned corners_found = 0; + auto x_corners = createEmptyArray(dim4(corner_lim)); + auto y_corners = createEmptyArray(dim4(corner_lim)); + auto resp_corners = createEmptyArray(dim4(corner_lim)); + auto response = createEmptyArray(dim4(in.elements())); + auto corners_found= std::shared_ptr(memAlloc(1), memFree); + corners_found.get()[0] = 0; - susan_responses(resp, in.get(), idims[0], idims[1], radius, diff_thr, geom_thr, edge); + getQueue().enqueue(susan_responses, response, in, idims[0], idims[1], + radius, diff_thr, geom_thr, edge); + getQueue().enqueue(non_maximal, x_corners, y_corners, resp_corners, corners_found, + 
idims[0], idims[1], response, edge, corner_lim); + getQueue().sync(); - non_maximal(x_corners, y_corners, resp_corners, &corners_found, - idims[0], idims[1], resp, edge, corner_lim); - - memFree(resp); - - const unsigned corners_out = min(corners_found, corner_lim); + const unsigned corners_out = min((corners_found.get())[0], corner_lim); if (corners_out == 0) { - memFree(x_corners); - memFree(y_corners); - memFree(resp_corners); x_out = createEmptyArray(dim4()); y_out = createEmptyArray(dim4()); resp_out = createEmptyArray(dim4()); return 0; } else { - - x_out = createDeviceDataArray(dim4(corners_out), (void*)x_corners); - y_out = createDeviceDataArray(dim4(corners_out), (void*)y_corners); - resp_out = createDeviceDataArray(dim4(corners_out), (void*)resp_corners); + x_out = x_corners; + y_out = y_corners; + resp_out = resp_corners; + x_out.resetDims(dim4(corners_out)); + y_out.resetDims(dim4(corners_out)); + resp_out.resetDims(dim4(corners_out)); return corners_out; } } From 840af46e2ccce3ee3b0e23b65ac18f28231b9a1e Mon Sep 17 00:00:00 2001 From: pradeep Date: Mon, 23 Nov 2015 13:35:11 -0500 Subject: [PATCH 030/288] convert sort & sort_by_key cpu fns to async calls --- src/backend/cpu/sort.cpp | 7 ++++--- src/backend/cpu/sort_by_key.cpp | 9 ++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp index 0b3fb9aabe..94d70a8e49 100644 --- a/src/backend/cpu/sort.cpp +++ b/src/backend/cpu/sort.cpp @@ -15,6 +15,8 @@ #include #include #include +#include +#include using std::greater; using std::less; @@ -29,7 +31,7 @@ namespace cpu // Based off of http://stackoverflow.com/a/12399290 template - void sort0(Array &val) + void sort0(Array val) { // initialize original index locations T *val_ptr = val.get(); @@ -62,8 +64,7 @@ namespace cpu { Array out = copyArray(in); switch(dim) { - case 0: sort0(out); - break; + case 0: getQueue().enqueue(sort0, out); break; default: AF_ERROR("Not Supported", 
AF_ERR_NOT_SUPPORTED); } return out; diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp index 4b0a092834..684b9bac58 100644 --- a/src/backend/cpu/sort_by_key.cpp +++ b/src/backend/cpu/sort_by_key.cpp @@ -15,14 +15,14 @@ #include #include #include -#include +#include +#include using std::greater; using std::less; using std::sort; using std::function; using std::queue; -using std::future; using std::async; namespace cpu @@ -32,7 +32,7 @@ namespace cpu /////////////////////////////////////////////////////////////////////////// template - void sort0_by_key(Array &okey, Array &oval, const Array &ikey, const Array &ival) + void sort0_by_key(Array okey, Array oval, const Array ikey, const Array ival) { function op = greater(); if(isAscending) { op = less(); } @@ -101,8 +101,7 @@ namespace cpu okey = createEmptyArray(ikey.dims()); oval = createEmptyArray(ival.dims()); switch(dim) { - case 0: sort0_by_key(okey, oval, ikey, ival); - break; + case 0: getQueue().enqueue(sort0_by_key, okey, oval, ikey, ival); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } } From e0055579305bb4969f0f3bf5855968387ad2c7da Mon Sep 17 00:00:00 2001 From: pradeep Date: Mon, 23 Nov 2015 13:43:08 -0500 Subject: [PATCH 031/288] sobel cpu fn is async fn after this change --- src/backend/cpu/sobel.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/backend/cpu/sobel.cpp b/src/backend/cpu/sobel.cpp index 3c6b1740d5..9f683fc450 100644 --- a/src/backend/cpu/sobel.cpp +++ b/src/backend/cpu/sobel.cpp @@ -14,7 +14,8 @@ #include #include #include -#include +#include +#include using af::dim4; @@ -22,8 +23,13 @@ namespace cpu { template -void derivative(To *optr, Ti const *iptr, dim4 const &dims, dim4 const &strides) +void derivative(Array output, const Array input) { + const dim4 dims = input.dims(); + const dim4 strides = input.strides(); + To* optr = output.get(); + const Ti* iptr = input.get(); + for(dim_t b3=0; b3 
std::pair< Array, Array > sobelDerivatives(const Array &img, const unsigned &ker_size) { + // ket_size is for future proofing, this argument is not used + // currently Array dx = createEmptyArray(img.dims()); Array dy = createEmptyArray(img.dims()); - derivative(dx.get(), img.get(), img.dims(), img.strides()); - derivative(dy.get(), img.get(), img.dims(), img.strides()); + getQueue().enqueue(derivative, dx, img); + getQueue().enqueue(derivative, dy, img); return std::make_pair(dx, dy); } -#define INSTANTIATE(Ti, To) \ +#define INSTANTIATE(Ti, To) \ template std::pair< Array, Array > \ sobelDerivatives(const Array &img, const unsigned &ker_size); From 4ec314a225140e0d0078fc4542dab2ce65ce0c04 Mon Sep 17 00:00:00 2001 From: syurkevi Date: Mon, 23 Nov 2015 15:08:49 -0500 Subject: [PATCH 032/288] cleanup and scatter example update --- examples/graphics/plot2d.cpp | 5 +++-- src/api/cpp/graphics.cpp | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/graphics/plot2d.cpp b/examples/graphics/plot2d.cpp index c6d3452c3b..7593f8a602 100644 --- a/examples/graphics/plot2d.cpp +++ b/examples/graphics/plot2d.cpp @@ -13,7 +13,7 @@ using namespace af; -static const int ITERATIONS = 100; +static const int ITERATIONS = 50; static const float PRECISION = 1.0f/ITERATIONS; int main(int argc, char *argv[]) @@ -26,6 +26,7 @@ int main(int argc, char *argv[]) array Y; int sign = 1; array X = seq(-af::Pi, af::Pi, PRECISION); + array noise = randn(X.dims(0))/5.f; myWindow.grid(1, 2); for (double val=-af::Pi; !myWindow.close(); ) { @@ -33,7 +34,7 @@ int main(int argc, char *argv[]) Y = sin(X); myWindow(0,0).plot(X, Y); - myWindow(0,1).scatter(X, Y, AF_MARKER_POINT); + myWindow(0,1).scatter(X, Y + noise, AF_MARKER_POINT); myWindow.show(); diff --git a/src/api/cpp/graphics.cpp b/src/api/cpp/graphics.cpp index 8d2d8cd4b6..cb9b0803e7 100644 --- a/src/api/cpp/graphics.cpp +++ b/src/api/cpp/graphics.cpp @@ -99,7 +99,6 @@ void Window::hist(const array& X, const double minval, 
const double maxval, cons } void Window::surface(const array& S, const char* const title){ - //TODO: fix offset on forge? af::array xVals = seq(0, S.dims(0)-1); af::array yVals = seq(0, S.dims(1)-1); af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT}; From 6f52c36b5bd09657c1b40b16669f03b813916a17 Mon Sep 17 00:00:00 2001 From: syurkevi Date: Mon, 23 Nov 2015 18:38:04 -0500 Subject: [PATCH 033/288] adds scatter3 --- include/af/graphics.h | 36 +++++++++++++++++++++++++++++++---- src/api/c/graphics_common.cpp | 23 +++++++++++++++++++--- src/api/c/graphics_common.hpp | 3 ++- src/api/c/plot.cpp | 15 ++------------- src/api/c/plot3.cpp | 29 +++++++++++++++++++--------- src/api/cpp/graphics.cpp | 8 +++++++- src/api/unified/graphics.cpp | 10 ++++++++-- 7 files changed, 91 insertions(+), 33 deletions(-) diff --git a/include/af/graphics.h b/include/af/graphics.h index e4286e1ea7..129b43949f 100644 --- a/include/af/graphics.h +++ b/include/af/graphics.h @@ -177,7 +177,6 @@ class AFAPI Window { \ingroup gfx_func_draw */ - void plot(const array& X, const array& Y, const char* const title=NULL); /** @@ -192,8 +191,19 @@ class AFAPI Window { \ingroup gfx_func_draw */ - void scatter(const array& X, const array& Y, const af::markerType marker=AF_MARKER_POINT, const char* const title=NULL); + + /** + Renders the input arrays as a 2D scatter-plot to the window + + \param[in] P is an \ref af_array or matrix with the xyz-values of the points + \param[in] marker is an \ref markerType enum specifying which marker to use in the scatter plot + \param[in] title parameter is used when this function is called in grid mode + + \ingroup gfx_func_draw + */ + void scatter3(const array& P, const af::markerType marker=AF_MARKER_POINT, const char* const title=NULL); + /** Renders the input array as a histogram to the window @@ -392,8 +402,8 @@ AFAPI af_err af_draw_plot(const af_window wind, const af_array X, const af_array \param[in] wind is the window handle \param[in] X is an \ref af_array with the 
x-axis data points \param[in] Y is an \ref af_array with the y-axis data points + \param[in] marker is an \ref markerType enum specifying which marker to use in the scatter plot \param[in] props is structure \ref af_cell that has the properties that are used - \param[in] marker is an \ref markerType enum specifying which marker to use in the scatter plot for the current rendering. \return \ref AF_SUCCESS if rendering is successful, otherwise an appropriate error code @@ -403,9 +413,27 @@ AFAPI af_err af_draw_plot(const af_window wind, const af_array X, const af_array \ingroup gfx_func_draw */ -AFAPI af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_cell* const props, const af_marker_type marker); +AFAPI af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_marker_type marker, const af_cell* const props); #endif +#if AF_API_VERSION >= 32 +/** + C Interface wrapper for drawing an array as a plot + + \param[in] wind is the window handle + \param[in] P is an \ref af_array or matrix with the xyz-values of the points + \param[in] marker is an \ref markerType enum specifying which marker to use in the scatter plot + \param[in] props is structure \ref af_cell that has the properties that are used + for the current rendering. + + \return \ref AF_SUCCESS if rendering is successful, otherwise an appropriate error code + is returned. 
+ + \ingroup gfx_func_draw +*/ +AFAPI af_err af_draw_scatter3(const af_window wind, const af_array P, const af_marker_type marker, const af_cell* const props); + +#endif #if AF_API_VERSION >= 32 /** C Interface wrapper for drawing an array as a plot diff --git a/src/api/c/graphics_common.cpp b/src/api/c/graphics_common.cpp index 92346f59d1..a4132b55dd 100644 --- a/src/api/c/graphics_common.cpp +++ b/src/api/c/graphics_common.cpp @@ -19,6 +19,22 @@ using namespace std; template GLenum getGLType() { return GL_FLOAT; } +fg::MarkerType getFGMarker(const af_marker_type af_marker) { + fg::MarkerType fg_marker; + switch (af_marker) { + case AF_MARKER_NONE: fg_marker = fg::FG_NONE; break; + case AF_MARKER_POINT: fg_marker = fg::FG_POINT; break; + case AF_MARKER_CIRCLE: fg_marker = fg::FG_CIRCLE; break; + case AF_MARKER_SQUARE: fg_marker = fg::FG_SQUARE; break; + case AF_MARKER_TRIANGLE: fg_marker = fg::FG_TRIANGLE; break; + case AF_MARKER_CROSS: fg_marker = fg::FG_CROSS; break; + case AF_MARKER_PLUS: fg_marker = fg::FG_PLUS; break; + case AF_MARKER_STAR: fg_marker = fg::FG_STAR; break; + default: fg_marker = fg::FG_NONE; break; + } + return fg_marker; +} + #define INSTANTIATE_GET_FG_TYPE(T, ForgeEnum)\ template<> fg::dtype getGLType() { return ForgeEnum; } @@ -181,7 +197,7 @@ fg::Plot* ForgeManager::getPlot(int nPoints, fg::dtype dtype, fg::PlotType ptype return mPltMap[key]; } -fg::Plot3* ForgeManager::getPlot3(int nPoints, fg::dtype type) +fg::Plot3* ForgeManager::getPlot3(int nPoints, fg::dtype dtype, fg::PlotType ptype, fg::MarkerType mtype) { /* nPoints needs to fall in the range of [0, 2^48] * for the ForgeManager to correctly retrieve @@ -189,11 +205,12 @@ fg::Plot3* ForgeManager::getPlot3(int nPoints, fg::dtype type) * is a limitation on how big of an plot graph can be rendered * using arrayfire graphics funtionality */ assert(nPoints <= 2ll<<48); - long long key = ((nPoints & _48BIT) << 48) | (type & _16BIT); + long long key = ((nPoints & _48BIT) << 48); + key |= 
(((((dtype & 0x000F) << 12) | (ptype & 0x000F)) << 8) | (mtype & 0x000F)); Plt3MapIter iter = mPlt3Map.find(key); if (iter==mPlt3Map.end()) { - fg::Plot3* temp = new fg::Plot3(nPoints, type); + fg::Plot3* temp = new fg::Plot3(nPoints, dtype, ptype, mtype); mPlt3Map[key] = temp; } diff --git a/src/api/c/graphics_common.hpp b/src/api/c/graphics_common.hpp index caadb88cd9..8c7607f313 100644 --- a/src/api/c/graphics_common.hpp +++ b/src/api/c/graphics_common.hpp @@ -30,6 +30,7 @@ GLenum glForceErrorCheck(const char *msg, const char* file, int line); #define ForceCheckGL(msg) glForceErrorCheck(msg, __FILE__, __LINE__) #define CheckGLSkip(msg) glErrorSkip (msg, __FILE__, __LINE__) +fg::MarkerType getFGMarker(const af_marker_type af_marker); namespace graphics { @@ -83,7 +84,7 @@ class ForgeManager fg::Window* getMainWindow(const bool dontCreate=false); fg::Image* getImage(int w, int h, fg::ChannelFormat mode, fg::dtype type); fg::Plot* getPlot(int nPoints, fg::dtype dtype, fg::PlotType ptype, fg::MarkerType mtype); - fg::Plot3* getPlot3(int nPoints, fg::dtype type); + fg::Plot3* getPlot3(int nPoints, fg::dtype dtype,fg::PlotType ptype, fg::MarkerType mtype); fg::Histogram* getHistogram(int nBins, fg::dtype type); fg::Surface* getSurface(int nX, int nY, fg::dtype type); diff --git a/src/api/c/plot.cpp b/src/api/c/plot.cpp index f2740305ea..c58a894d31 100644 --- a/src/api/c/plot.cpp +++ b/src/api/c/plot.cpp @@ -111,19 +111,8 @@ af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, co return plotWrapper(wind, X, Y, props); } -af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_cell* const props, const af::markerType af_marker) +af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_marker_type af_marker, const af_cell* const props) { - fg::MarkerType fg_marker; - switch(af_marker){ - case AF_MARKER_NONE: fg_marker = fg::FG_NONE; break; - case AF_MARKER_POINT: fg_marker = 
fg::FG_POINT; break; - case AF_MARKER_CIRCLE: fg_marker = fg::FG_CIRCLE; break; - case AF_MARKER_SQUARE: fg_marker = fg::FG_SQUARE; break; - case AF_MARKER_TRIANGLE: fg_marker = fg::FG_TRIANGLE; break; - case AF_MARKER_CROSS: fg_marker = fg::FG_CROSS; break; - case AF_MARKER_PLUS: fg_marker = fg::FG_PLUS; break; - case AF_MARKER_STAR: fg_marker = fg::FG_STAR; break; - default: fg_marker = fg::FG_NONE; break; - } + fg::MarkerType fg_marker = getFGMarker(af_marker); return plotWrapper(wind, X, Y, props, fg::FG_SCATTER, fg_marker); } diff --git a/src/api/c/plot3.cpp b/src/api/c/plot3.cpp index 473bce0b96..4d311058ba 100644 --- a/src/api/c/plot3.cpp +++ b/src/api/c/plot3.cpp @@ -30,7 +30,7 @@ using namespace detail; using namespace graphics; template -fg::Plot3* setup_plot3(const af_array P) +fg::Plot3* setup_plot3(const af_array P, fg::PlotType ptype, fg::MarkerType mtype) { Array pIn = getArray(P); ArrayInfo Pinfo = getInfo(P); @@ -58,7 +58,7 @@ fg::Plot3* setup_plot3(const af_array P) } ForgeManager& fgMngr = ForgeManager::getInstance(); - fg::Plot3* plot3 = fgMngr.getPlot3(P_dims.elements()/3, getGLType()); + fg::Plot3* plot3 = fgMngr.getPlot3(P_dims.elements()/3, getGLType(), ptype, mtype); plot3->setColor(1.0, 0.0, 0.0); plot3->setAxesLimits(max[0], min[0], max[1], min[1], @@ -74,7 +74,7 @@ fg::Plot3* setup_plot3(const af_array P) } #endif -af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* const props) +af_err plot3Wrapper(const af_window wind, const af_array P, const af_cell* const props, const fg::PlotType type=fg::FG_LINE, const fg::MarkerType marker=fg::FG_NONE) { #if defined(WITH_GRAPHICS) if(wind==0) { @@ -91,12 +91,12 @@ af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* cons fg::Plot3* plot3 = NULL; switch(Ptype) { - case f32: plot3 = setup_plot3(P); break; - case s32: plot3 = setup_plot3(P); break; - case u32: plot3 = setup_plot3(P); break; - case s16: plot3 = setup_plot3(P); break; - case u16: plot3 = 
setup_plot3(P); break; - case u8 : plot3 = setup_plot3(P); break; + case f32: plot3 = setup_plot3(P, type, marker); break; + case s32: plot3 = setup_plot3(P, type, marker); break; + case u32: plot3 = setup_plot3(P, type, marker); break; + case s16: plot3 = setup_plot3(P, type, marker); break; + case u16: plot3 = setup_plot3(P, type, marker); break; + case u8 : plot3 = setup_plot3(P, type, marker); break; default: TYPE_ERROR(1, Ptype); } @@ -111,3 +111,14 @@ af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* cons return AF_ERR_NO_GFX; #endif } + +af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* const props) +{ + return plot3Wrapper(wind, P, props); +} + +af_err af_draw_scatter3(const af_window wind, const af_array P, const af_marker_type af_marker, const af_cell* const props) +{ + fg::MarkerType fg_marker = getFGMarker(af_marker); + return plot3Wrapper(wind, P, props, fg::FG_SCATTER, fg_marker); +} diff --git a/src/api/cpp/graphics.cpp b/src/api/cpp/graphics.cpp index cb9b0803e7..162bacb4ab 100644 --- a/src/api/cpp/graphics.cpp +++ b/src/api/cpp/graphics.cpp @@ -82,7 +82,13 @@ void Window::plot(const array& X, const array& Y, const char* const title) void Window::scatter(const array& X, const array& Y, af::markerType marker, const char* const title) { af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT}; - AF_THROW(af_draw_scatter(get(), X.get(), Y.get(), &temp, marker)); + AF_THROW(af_draw_scatter(get(), X.get(), Y.get(), marker, &temp)); +} + +void Window::scatter3(const array& P, af::markerType marker, const char* const title) +{ + af_cell temp{_r, _c, title, AF_COLORMAP_DEFAULT}; + AF_THROW(af_draw_scatter3(get(), P.get(), marker, &temp)); } void Window::plot3(const array& P, const char* const title) diff --git a/src/api/unified/graphics.cpp b/src/api/unified/graphics.cpp index 596429318f..2895cc7afc 100644 --- a/src/api/unified/graphics.cpp +++ b/src/api/unified/graphics.cpp @@ -44,10 +44,16 @@ af_err 
af_draw_plot(const af_window wind, const af_array X, const af_array Y, co return CALL(wind, X, Y, props); } -af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_cell* const props, const af_marker_type marker) +af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_marker_type marker, const af_cell* const props) { CHECK_ARRAYS(X, Y); - return CALL(wind, X, Y, props, marker); + return CALL(wind, X, Y, marker, props); +} + +af_err af_draw_scatter3(const af_window wind, const af_array P, const af_marker_type marker, const af_cell* const props) +{ + CHECK_ARRAYS(P); + return CALL(wind, P, marker, props); } af_err af_draw_plot3(const af_window wind, const af_array P, const af_cell* const props) From 14e9d3180ecc4dde44bf66dba743524408030ba7 Mon Sep 17 00:00:00 2001 From: pradeep Date: Tue, 24 Nov 2015 13:29:57 -0500 Subject: [PATCH 034/288] Converted resize & shift cpu fns to async calls --- src/backend/cpu/resize.cpp | 348 ++++++++++++++++++------------------- src/backend/cpu/shift.cpp | 60 ++++--- 2 files changed, 205 insertions(+), 203 deletions(-) diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/resize.cpp index 8c4da58934..160ed46c0d 100644 --- a/src/backend/cpu/resize.cpp +++ b/src/backend/cpu/resize.cpp @@ -14,209 +14,205 @@ #include #include #include +#include +#include namespace cpu { - /** - * noop function for round to avoid compilation - * issues due to lack of this function in C90 based - * compilers, it is only present in C99 and C++11 - * - * This is not a full fledged implementation, this function - * is to be used only for positive numbers, i m using it here - * for calculating dimensions of arrays - */ - dim_t round2int(float value) - { - return (dim_t)(value+0.5f); - } - - using std::conditional; - using std::is_same; +/** + * noop function for round to avoid compilation + * issues due to lack of this function in C90 based + * compilers, it is only present in C99 and C++11 
+ * + * This is not a full fledged implementation, this function + * is to be used only for positive numbers, i m using it here + * for calculating dimensions of arrays + */ +dim_t round2int(float value) +{ + return (dim_t)(value+0.5f); +} - template - using wtype_t = typename conditional::value, double, float>::type; +using std::conditional; +using std::is_same; - template - using vtype_t = typename conditional::value, - T, wtype_t - >::type; +template +using wtype_t = typename conditional::value, double, float>::type; - template - struct resize_op - { - void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t x, const dim_t y) - { - return; - } - }; +template +using vtype_t = typename conditional::value, + T, wtype_t + >::type; - template - struct resize_op +template +struct resize_op +{ + void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const dim_t x, const dim_t y) { - void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t x, const dim_t y) - { - // Compute Indices - dim_t i_x = round2int((float)x / (odims[0] / (float)idims[0])); - dim_t i_y = round2int((float)y / (odims[1] / (float)idims[1])); - - if (i_x >= idims[0]) i_x = idims[0] - 1; - if (i_y >= idims[1]) i_y = idims[1] - 1; - - dim_t i_off = i_y * istrides[1] + i_x; - dim_t o_off = y * ostrides[1] + x; - // Copy values from all channels - for(dim_t w = 0; w < odims[3]; w++) { - dim_t wost = w * ostrides[3]; - dim_t wist = w * istrides[3]; - for(dim_t z = 0; z < odims[2]; z++) { - outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist]; - } - } - } - }; + return; + } +}; - template - struct resize_op +template +struct resize_op +{ + void operator()(T *outPtr, const T *inPtr, const af::dim4 
&odims, const af::dim4 &idims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const dim_t x, const dim_t y) { - void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t x, const dim_t y) - { - // Compute Indices - float f_x = (float)x / (odims[0] / (float)idims[0]); - float f_y = (float)y / (odims[1] / (float)idims[1]); - - dim_t i1_x = floor(f_x); - dim_t i1_y = floor(f_y); - - if (i1_x >= idims[0]) i1_x = idims[0] - 1; - if (i1_y >= idims[1]) i1_y = idims[1] - 1; - - float b = f_x - i1_x; - float a = f_y - i1_y; - - dim_t i2_x = (i1_x + 1 >= idims[0] ? idims[0] - 1 : i1_x + 1); - dim_t i2_y = (i1_y + 1 >= idims[1] ? idims[1] - 1 : i1_y + 1); - - typedef typename dtype_traits::base_type BT; - typedef wtype_t WT; - typedef vtype_t VT; - - dim_t o_off = y * ostrides[1] + x; - // Copy values from all channels - for(dim_t w = 0; w < odims[3]; w++) { - dim_t wst = w * istrides[3]; - for(dim_t z = 0; z < odims[2]; z++) { - dim_t zst = z * istrides[2]; - dim_t channel_off = zst + wst; - VT p1 = inPtr[i1_y * istrides[1] + i1_x + channel_off]; - VT p2 = inPtr[i2_y * istrides[1] + i1_x + channel_off]; - VT p3 = inPtr[i1_y * istrides[1] + i2_x + channel_off]; - VT p4 = inPtr[i2_y * istrides[1] + i2_x + channel_off]; - - outPtr[o_off + z * ostrides[2] + w * ostrides[3]] = - scalar((1.0f - a) * (1.0f - b)) * p1 + - scalar(( a ) * (1.0f - b)) * p2 + - scalar((1.0f - a) * ( b )) * p3 + - scalar(( a ) * ( b )) * p4; - } + // Compute Indices + dim_t i_x = round2int((float)x / (odims[0] / (float)idims[0])); + dim_t i_y = round2int((float)y / (odims[1] / (float)idims[1])); + + if (i_x >= idims[0]) i_x = idims[0] - 1; + if (i_y >= idims[1]) i_y = idims[1] - 1; + + dim_t i_off = i_y * istrides[1] + i_x; + dim_t o_off = y * ostrides[1] + x; + // Copy values from all channels + for(dim_t w = 0; w < odims[3]; w++) { + dim_t wost = w * ostrides[3]; + dim_t wist = w * istrides[3]; + 
for(dim_t z = 0; z < odims[2]; z++) { + outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist]; } } - }; + } +}; - template - struct resize_op +template +struct resize_op +{ + void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const dim_t x, const dim_t y) { - void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t x, const dim_t y) - { - // Compute Indices - dim_t i_x = floor((float)x / (odims[0] / (float)idims[0])); - dim_t i_y = floor((float)y / (odims[1] / (float)idims[1])); - - if (i_x >= idims[0]) i_x = idims[0] - 1; - if (i_y >= idims[1]) i_y = idims[1] - 1; - - dim_t i_off = i_y * istrides[1] + i_x; - dim_t o_off = y * ostrides[1] + x; - // Copy values from all channels - for(dim_t w = 0; w < odims[3]; w++) { - dim_t wost = w * ostrides[3]; - dim_t wist = w * istrides[3]; - for(dim_t z = 0; z < odims[2]; z++) { - outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist]; - } + // Compute Indices + float f_x = (float)x / (odims[0] / (float)idims[0]); + float f_y = (float)y / (odims[1] / (float)idims[1]); + + dim_t i1_x = floor(f_x); + dim_t i1_y = floor(f_y); + + if (i1_x >= idims[0]) i1_x = idims[0] - 1; + if (i1_y >= idims[1]) i1_y = idims[1] - 1; + + float b = f_x - i1_x; + float a = f_y - i1_y; + + dim_t i2_x = (i1_x + 1 >= idims[0] ? idims[0] - 1 : i1_x + 1); + dim_t i2_y = (i1_y + 1 >= idims[1] ? 
idims[1] - 1 : i1_y + 1); + + typedef typename dtype_traits::base_type BT; + typedef wtype_t WT; + typedef vtype_t VT; + + dim_t o_off = y * ostrides[1] + x; + // Copy values from all channels + for(dim_t w = 0; w < odims[3]; w++) { + dim_t wst = w * istrides[3]; + for(dim_t z = 0; z < odims[2]; z++) { + dim_t zst = z * istrides[2]; + dim_t channel_off = zst + wst; + VT p1 = inPtr[i1_y * istrides[1] + i1_x + channel_off]; + VT p2 = inPtr[i2_y * istrides[1] + i1_x + channel_off]; + VT p3 = inPtr[i1_y * istrides[1] + i2_x + channel_off]; + VT p4 = inPtr[i2_y * istrides[1] + i2_x + channel_off]; + + outPtr[o_off + z * ostrides[2] + w * ostrides[3]] = + scalar((1.0f - a) * (1.0f - b)) * p1 + + scalar(( a ) * (1.0f - b)) * p2 + + scalar((1.0f - a) * ( b )) * p3 + + scalar(( a ) * ( b )) * p4; } } - }; + } +}; - template - void resize_(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides) +template +struct resize_op +{ + void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const dim_t x, const dim_t y) { - resize_op op; - for(dim_t y = 0; y < odims[1]; y++) { - for(dim_t x = 0; x < odims[0]; x++) { - op(outPtr, inPtr, odims, idims, ostrides, istrides, x, y); + // Compute Indices + dim_t i_x = floor((float)x / (odims[0] / (float)idims[0])); + dim_t i_y = floor((float)y / (odims[1] / (float)idims[1])); + + if (i_x >= idims[0]) i_x = idims[0] - 1; + if (i_y >= idims[1]) i_y = idims[1] - 1; + + dim_t i_off = i_y * istrides[1] + i_x; + dim_t o_off = y * ostrides[1] + x; + // Copy values from all channels + for(dim_t w = 0; w < odims[3]; w++) { + dim_t wost = w * ostrides[3]; + dim_t wist = w * istrides[3]; + for(dim_t z = 0; z < odims[2]; z++) { + outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist]; } } } +}; - template - Array resize(const Array &in, const dim_t odim0, const dim_t 
odim1, - const af_interp_type method) - { - af::dim4 idims = in.dims(); - af::dim4 odims(odim0, odim1, idims[2], idims[3]); - - // Create output placeholder - Array outArray = createValueArray(odims, (T)0); - - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = outArray.get(); - - af::dim4 ostrides = outArray.strides(); - af::dim4 istrides = in.strides(); - - switch(method) { - case AF_INTERP_NEAREST: - resize_(outPtr, inPtr, odims, idims, ostrides, istrides); - break; - case AF_INTERP_BILINEAR: - resize_(outPtr, inPtr, odims, idims, ostrides, istrides); - break; - case AF_INTERP_LOWER: - resize_(outPtr, inPtr, odims, idims, ostrides, istrides); - break; - default: - break; +template +void resize_(Array out, const Array in) +{ + af::dim4 idims = in.dims(); + af::dim4 odims = out.dims(); + const T *inPtr = in.get(); + T *outPtr = out.get(); + af::dim4 ostrides = out.strides(); + af::dim4 istrides = in.strides(); + + resize_op op; + for(dim_t y = 0; y < odims[1]; y++) { + for(dim_t x = 0; x < odims[0]; x++) { + op(outPtr, inPtr, odims, idims, ostrides, istrides, x, y); } - return outArray; } +} +template +Array resize(const Array &in, const dim_t odim0, const dim_t odim1, + const af_interp_type method) +{ + af::dim4 idims = in.dims(); + af::dim4 odims(odim0, odim1, idims[2], idims[3]); + // Create output placeholder + Array out = createValueArray(odims, (T)0); + out.eval(); + in.eval(); + + switch(method) { + case AF_INTERP_NEAREST: + getQueue().enqueue(resize_, out, in); break; + case AF_INTERP_BILINEAR: + getQueue().enqueue(resize_, out, in); break; + case AF_INTERP_LOWER: + getQueue().enqueue(resize_, out, in); break; + default: break; + } + return out; +} -#define INSTANTIATE(T) \ +#define INSTANTIATE(T) \ template Array resize (const Array &in, const dim_t odim0, const dim_t odim1, \ const af_interp_type method); - - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - 
INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(short) - INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) +INSTANTIATE(ushort) } diff --git a/src/backend/cpu/shift.cpp b/src/backend/cpu/shift.cpp index 05cac4c678..6a2b939cca 100644 --- a/src/backend/cpu/shift.cpp +++ b/src/backend/cpu/shift.cpp @@ -12,27 +12,32 @@ #include #include #include +#include +#include namespace cpu { - static inline dim_t simple_mod(const dim_t i, const dim_t dim) - { - return (i < dim) ? i : (i - dim); - } +static inline dim_t simple_mod(const dim_t i, const dim_t dim) +{ + return (i < dim) ? i : (i - dim); +} - template - Array shift(const Array &in, const int sdims[4]) - { - const af::dim4 iDims = in.dims(); - af::dim4 oDims = iDims; +template +Array shift(const Array &in, const int sdims[4]) +{ + Array out = createEmptyArray(in.dims()); + out.eval(); + in.eval(); + const af::dim4 temp(sdims[0], sdims[1], sdims[2], sdims[3]); - Array out = createEmptyArray(oDims); + auto func = [=] (Array out, const Array in, const af::dim4 sdims) { T* outPtr = out.get(); const T* inPtr = in.get(); - const af::dim4 ist = in.strides(); - const af::dim4 ost = out.strides(); + const af::dim4 oDims = out.dims(); + const af::dim4 ist = in.strides(); + const af::dim4 ost = out.strides(); int sdims_[4]; // Need to do this because we are mapping output to input in the kernel @@ -65,24 +70,25 @@ namespace cpu } } } + }; + getQueue().enqueue(func, out, in, temp); - return out; - } + return out; +} #define INSTANTIATE(T) \ template Array shift(const Array &in, const int sdims[4]); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - 
INSTANTIATE(char) - INSTANTIATE(short) - INSTANTIATE(ushort) - +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) +INSTANTIATE(ushort) } From 3c2bc65b12fe04c455837790cbba27b2417bc9a1 Mon Sep 17 00:00:00 2001 From: pradeep Date: Tue, 24 Nov 2015 14:37:14 -0500 Subject: [PATCH 035/288] convert select & rotate cpu fns to async calls --- src/backend/cpu/rotate.cpp | 23 ++++++++------ src/backend/cpu/select.cpp | 64 ++++++++++++++++++++++++-------------- 2 files changed, 53 insertions(+), 34 deletions(-) diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/rotate.cpp index a4af64b669..01ec96228c 100644 --- a/src/backend/cpu/rotate.cpp +++ b/src/backend/cpu/rotate.cpp @@ -12,15 +12,22 @@ #include #include #include +#include +#include #include "transform_interp.hpp" namespace cpu { template - void rotate_(T *out, const T *in, const float theta, - const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides) + void rotate_(Array output, const Array input, const float theta) { + const af::dim4 odims = output.dims(); + const af::dim4 idims = input.dims(); + const af::dim4 ostrides = output.strides(); + const af::dim4 istrides = input.strides(); + + const T* in = input.get(); + T* out = output.get(); dim_t nimages = idims[2]; void (*t_fn)(T *, const T *, const float *, const af::dim4 &, @@ -77,20 +84,16 @@ namespace cpu const af_interp_type method) { Array out = createEmptyArray(odims); - const af::dim4 idims = in.dims(); switch(method) { case AF_INTERP_NEAREST: - rotate_ - (out.get(), in.get(), theta, odims, idims, out.strides(), in.strides()); + getQueue().enqueue(rotate_, out, in, theta); break; case AF_INTERP_BILINEAR: - rotate_ - (out.get(), in.get(), theta, odims, idims, out.strides(), in.strides()); + getQueue().enqueue(rotate_, out, in, theta); break; case 
AF_INTERP_LOWER: - rotate_ - (out.get(), in.get(), theta, odims, idims, out.strides(), in.strides()); + getQueue().enqueue(rotate_, out, in, theta); break; default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); diff --git a/src/backend/cpu/select.cpp b/src/backend/cpu/select.cpp index 7b2cc81735..4a219eda04 100644 --- a/src/backend/cpu/select.cpp +++ b/src/backend/cpu/select.cpp @@ -10,14 +10,22 @@ #include #include #include +#include +#include using af::dim4; namespace cpu { - template - void select(Array &out, const Array &cond, const Array &a, const Array &b) - { + +template +void select(Array &out, const Array &cond, const Array &a, const Array &b) +{ + out.eval(); + cond.eval(); + a.eval(); + b.eval(); + auto func = [=] (Array out, const Array cond, const Array a, const Array b) { dim4 adims = a.dims(); dim4 astrides = a.strides(); dim4 bdims = b.dims(); @@ -30,13 +38,13 @@ namespace cpu dim4 ostrides = out.strides(); bool is_a_same[] = {adims[0] == odims[0], adims[1] == odims[1], - adims[2] == odims[2], adims[3] == odims[3]}; + adims[2] == odims[2], adims[3] == odims[3]}; bool is_b_same[] = {bdims[0] == odims[0], bdims[1] == odims[1], - bdims[2] == odims[2], bdims[3] == odims[3]}; + bdims[2] == odims[2], bdims[3] == odims[3]}; bool is_c_same[] = {cdims[0] == odims[0], cdims[1] == odims[1], - cdims[2] == odims[2], cdims[3] == odims[3]}; + cdims[2] == odims[2], cdims[3] == odims[3]}; const T *aptr = a.get(); const T *bptr = b.get(); @@ -75,11 +83,17 @@ namespace cpu } } } - } + }; + getQueue().enqueue(func, out, cond, a, b); +} - template - void select_scalar(Array &out, const Array &cond, const Array &a, const double &b) - { +template +void select_scalar(Array &out, const Array &cond, const Array &a, const double &b) +{ + out.eval(); + cond.eval(); + a.eval(); + auto func = [=] (Array out, const Array cond, const Array a, const double b) { dim4 astrides = a.strides(); dim4 cstrides = cond.strides(); @@ -115,8 +129,9 @@ namespace cpu } } } - } - + }; 
+ getQueue().enqueue(func, out, cond, a, b); +} #define INSTANTIATE(T) \ template void select(Array &out, const Array &cond, \ @@ -130,16 +145,17 @@ namespace cpu const Array &a, \ const double &b); \ - INSTANTIATE(float ) - INSTANTIATE(double ) - INSTANTIATE(cfloat ) - INSTANTIATE(cdouble) - INSTANTIATE(int ) - INSTANTIATE(uint ) - INSTANTIATE(intl ) - INSTANTIATE(uintl ) - INSTANTIATE(char ) - INSTANTIATE(uchar ) - INSTANTIATE(short ) - INSTANTIATE(ushort ) +INSTANTIATE(float ) +INSTANTIATE(double ) +INSTANTIATE(cfloat ) +INSTANTIATE(cdouble) +INSTANTIATE(int ) +INSTANTIATE(uint ) +INSTANTIATE(intl ) +INSTANTIATE(uintl ) +INSTANTIATE(char ) +INSTANTIATE(uchar ) +INSTANTIATE(short ) +INSTANTIATE(ushort ) + } From 258d57364178a49ce8b60add412b2efd99a1633a Mon Sep 17 00:00:00 2001 From: pradeep Date: Tue, 24 Nov 2015 14:52:12 -0500 Subject: [PATCH 036/288] Convert morph & range cpu fns to async calls --- src/backend/cpu/morph.cpp | 216 ++++++++++++++++++++------------------ src/backend/cpu/range.cpp | 116 ++++++++++---------- 2 files changed, 172 insertions(+), 160 deletions(-) diff --git a/src/backend/cpu/morph.cpp b/src/backend/cpu/morph.cpp index eb2e1de339..c64d09be30 100644 --- a/src/backend/cpu/morph.cpp +++ b/src/backend/cpu/morph.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include using af::dim4; @@ -31,108 +33,41 @@ static inline unsigned getIdx(const dim4 &strides, template Array morph(const Array &in, const Array &mask) { - const dim4 dims = in.dims(); - const dim4 window = mask.dims(); - const dim_t R0 = window[0]/2; - const dim_t R1 = window[1]/2; - const dim4 istrides = in.strides(); - const dim4 fstrides = mask.strides(); - - Array out = createEmptyArray(dims); - const dim4 ostrides = out.strides(); - - T* outData = out.get(); - const T* inData = in.get(); - const T* filter = mask.get(); - - for(dim_t b3=0; b3 (T)0) && offi>=0 && offj>=0 && offi -Array morph3d(const Array &in, const Array &mask) -{ - const dim4 dims = in.dims(); - 
const dim4 window = mask.dims(); - const dim_t R0 = window[0]/2; - const dim_t R1 = window[1]/2; - const dim_t R2 = window[2]/2; - const dim4 istrides = in.strides(); - const dim4 fstrides = mask.strides(); - const dim_t bCount = dims[3]; - - Array out = createEmptyArray(dims); - const dim4 ostrides = out.strides(); - - T* outData = out.get(); - const T* inData = in.get(); - const T* filter = mask.get(); - - for(dim_t batchId=0; batchId out = createEmptyArray(in.dims()); + + auto func = [=] (Array out, const Array in, const Array mask) { + const dim4 ostrides = out.strides(); + const dim4 istrides = in.strides(); + const dim4 fstrides = mask.strides(); + const dim4 dims = in.dims(); + const dim4 window = mask.dims(); + T* outData = out.get(); + const T* inData = in.get(); + const T* filter = mask.get(); + const dim_t R0 = window[0]/2; + const dim_t R1 = window[1]/2; + + for(dim_t b3=0; b3 (T)0) && offi>=0 && offj>=0 && offk>=0 && - offi (T)0) && offi>=0 && offj>=0 && offi morph3d(const Array &in, const Array &mask) } } // window 1st dimension loop ends here - } // window 1st dimension loop ends here - }// filter window loop ends here - - outData[ getIdx(ostrides, i, j, k) ] = filterResult; - } //1st dimension loop ends here - } // 2nd dimension loop ends here - } // 3rd dimension loop ends here - // next iteration will be next batch if any - outData += ostrides[3]; - inData += istrides[3]; - } + } // filter window loop ends here + + outData[ getIdx(ostrides, i, j) ] = filterResult; + } //1st dimension loop ends here + } // 2nd dimension loop ends here + + // next iteration will be next batch if any + outData += ostrides[2]; + inData += istrides[2]; + } + } + }; + getQueue().enqueue(func, out, in, mask); + + return out; +} + +template +Array morph3d(const Array &in, const Array &mask) +{ + Array out = createEmptyArray(in.dims()); + + auto func = [=] (Array out, const Array in, const Array mask) { + const dim4 dims = in.dims(); + const dim4 window = mask.dims(); + 
const dim_t R0 = window[0]/2; + const dim_t R1 = window[1]/2; + const dim_t R2 = window[2]/2; + const dim4 istrides = in.strides(); + const dim4 fstrides = mask.strides(); + const dim_t bCount = dims[3]; + const dim4 ostrides = out.strides(); + T* outData = out.get(); + const T* inData = in.get(); + const T* filter = mask.get(); + + for(dim_t batchId=0; batchId (T)0) && offi>=0 && offj>=0 && offk>=0 && + offi #include #include +#include +#include namespace cpu { - /////////////////////////////////////////////////////////////////////////// - // Kernel Functions - /////////////////////////////////////////////////////////////////////////// - template - void range(T *out, const dim4 &dims, const dim4 &strides) - { - for(dim_t w = 0; w < dims[3]; w++) { - dim_t offW = w * strides[3]; - for(dim_t z = 0; z < dims[2]; z++) { - dim_t offWZ = offW + z * strides[2]; - for(dim_t y = 0; y < dims[1]; y++) { - dim_t offWZY = offWZ + y * strides[1]; - for(dim_t x = 0; x < dims[0]; x++) { - dim_t id = offWZY + x; - if(dim == 0) { - out[id] = x; - } else if(dim == 1) { - out[id] = y; - } else if(dim == 2) { - out[id] = z; - } else if(dim == 3) { - out[id] = w; - } +/////////////////////////////////////////////////////////////////////////// +// Kernel Functions +/////////////////////////////////////////////////////////////////////////// +template +void range(Array output) +{ + T* out = output.get(); + + const dim4 dims = output.dims(); + const dim4 strides = output.strides(); + + for(dim_t w = 0; w < dims[3]; w++) { + dim_t offW = w * strides[3]; + for(dim_t z = 0; z < dims[2]; z++) { + dim_t offWZ = offW + z * strides[2]; + for(dim_t y = 0; y < dims[1]; y++) { + dim_t offWZY = offWZ + y * strides[1]; + for(dim_t x = 0; x < dims[0]; x++) { + dim_t id = offWZY + x; + if(dim == 0) { + out[id] = x; + } else if(dim == 1) { + out[id] = y; + } else if(dim == 2) { + out[id] = z; + } else if(dim == 3) { + out[id] = w; } } } } } +} - 
/////////////////////////////////////////////////////////////////////////// - // Wrapper Functions - /////////////////////////////////////////////////////////////////////////// - template - Array range(const dim4& dims, const int seq_dim) - { - // Set dimension along which the sequence should be - // Other dimensions are simply tiled - int _seq_dim = seq_dim; - if(seq_dim < 0) { - _seq_dim = 0; // column wise sequence - } - - Array out = createEmptyArray(dims); - switch(_seq_dim) { - case 0: range(out.get(), out.dims(), out.strides()); break; - case 1: range(out.get(), out.dims(), out.strides()); break; - case 2: range(out.get(), out.dims(), out.strides()); break; - case 3: range(out.get(), out.dims(), out.strides()); break; - default : AF_ERROR("Invalid rep selection", AF_ERR_ARG); - } - +/////////////////////////////////////////////////////////////////////////// +// Wrapper Functions +/////////////////////////////////////////////////////////////////////////// +template +Array range(const dim4& dims, const int seq_dim) +{ + // Set dimension along which the sequence should be + // Other dimensions are simply tiled + int _seq_dim = seq_dim; + if(seq_dim < 0) { + _seq_dim = 0; // column wise sequence + } - return out; + Array out = createEmptyArray(dims); + switch(_seq_dim) { + case 0: getQueue().enqueue(range, out); break; + case 1: getQueue().enqueue(range, out); break; + case 2: getQueue().enqueue(range, out); break; + case 3: getQueue().enqueue(range, out); break; + default : AF_ERROR("Invalid rep selection", AF_ERR_ARG); } + return out; +} + #define INSTANTIATE(T) \ template Array range(const af::dim4 &dims, const int seq_dims); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(ushort) - INSTANTIATE(short) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) 
+INSTANTIATE(ushort) +INSTANTIATE(short) } From b267ffd0fb0b7b401ad4a7507af8fb3df88d002c Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 30 Nov 2015 14:23:52 -0500 Subject: [PATCH 037/288] Increment version for devel to 3.3 --- CMakeModules/Version.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeModules/Version.cmake b/CMakeModules/Version.cmake index 3a474d1755..236058d154 100644 --- a/CMakeModules/Version.cmake +++ b/CMakeModules/Version.cmake @@ -2,7 +2,7 @@ # Make a version file that includes the ArrayFire version and git revision # SET(AF_VERSION_MAJOR "3") -SET(AF_VERSION_MINOR "2") +SET(AF_VERSION_MINOR "3") SET(AF_VERSION_PATCH "0") SET(AF_VERSION "${AF_VERSION_MAJOR}.${AF_VERSION_MINOR}.${AF_VERSION_PATCH}") From 65c7a23c76173b7e6b98593ec4a76ad1e1b94181 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 30 Nov 2015 14:24:06 -0500 Subject: [PATCH 038/288] Fixes for scatter --- include/af/graphics.h | 26 +++++++++++++++++--------- src/api/c/plot.cpp | 15 ++++++++++----- src/api/c/plot3.cpp | 15 ++++++++++----- 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/include/af/graphics.h b/include/af/graphics.h index 129b43949f..a8b4816d95 100644 --- a/include/af/graphics.h +++ b/include/af/graphics.h @@ -179,6 +179,7 @@ class AFAPI Window { */ void plot(const array& X, const array& Y, const char* const title=NULL); +#if AF_API_VERSION >= 33 /** Renders the input arrays as a 2D scatter-plot to the window @@ -191,18 +192,23 @@ class AFAPI Window { \ingroup gfx_func_draw */ - void scatter(const array& X, const array& Y, const af::markerType marker=AF_MARKER_POINT, const char* const title=NULL); + void scatter(const array& X, const array& Y, + const af::markerType marker = AF_MARKER_POINT, const char* const title = NULL); +#endif +#if AF_API_VERSION >= 33 /** - Renders the input arrays as a 2D scatter-plot to the window + Renders the input arrays as a 3D scatter-plot to the window - \param[in] P is an \ref 
af_array or matrix with the xyz-values of the points + \param[in] P is an \ref af_array or matrix with the xyz-values of the points \param[in] marker is an \ref markerType enum specifying which marker to use in the scatter plot \param[in] title parameter is used when this function is called in grid mode \ingroup gfx_func_draw */ - void scatter3(const array& P, const af::markerType marker=AF_MARKER_POINT, const char* const title=NULL); + void scatter3(const array& P, const af::markerType marker = AF_MARKER_POINT, + const char* const title = NULL); +#endif /** Renders the input array as a histogram to the window @@ -395,7 +401,7 @@ AFAPI af_err af_draw_image(const af_window wind, const af_array in, const af_cel */ AFAPI af_err af_draw_plot(const af_window wind, const af_array X, const af_array Y, const af_cell* const props); -#if AF_API_VERSION >= 32 +#if AF_API_VERSION >= 33 /** C Interface wrapper for drawing an array as a plot @@ -413,10 +419,11 @@ AFAPI af_err af_draw_plot(const af_window wind, const af_array X, const af_array \ingroup gfx_func_draw */ -AFAPI af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, const af_marker_type marker, const af_cell* const props); +AFAPI af_err af_draw_scatter(const af_window wind, const af_array X, const af_array Y, + const af_marker_type marker, const af_cell* const props); #endif -#if AF_API_VERSION >= 32 +#if AF_API_VERSION >= 33 /** C Interface wrapper for drawing an array as a plot @@ -431,9 +438,10 @@ AFAPI af_err af_draw_scatter(const af_window wind, const af_array X, const af_ar \ingroup gfx_func_draw */ -AFAPI af_err af_draw_scatter3(const af_window wind, const af_array P, const af_marker_type marker, const af_cell* const props); - +AFAPI af_err af_draw_scatter3(const af_window wind, const af_array P, + const af_marker_type marker, const af_cell* const props); #endif + #if AF_API_VERSION >= 32 /** C Interface wrapper for drawing an array as a plot diff --git a/src/api/c/plot.cpp 
b/src/api/c/plot.cpp index c58a894d31..a2c026b39e 100644 --- a/src/api/c/plot.cpp +++ b/src/api/c/plot.cpp @@ -55,11 +55,9 @@ fg::Plot* setup_plot(const af_array X, const af_array Y, fg::PlotType type, fg:: return plot; } -#endif af_err plotWrapper(const af_window wind, const af_array X, const af_array Y, const af_cell* const props, fg::PlotType type=fg::FG_LINE, fg::MarkerType marker=fg::FG_NONE) { -#if defined(WITH_GRAPHICS) if(wind==0) { std::cerr<<"Not a valid window"< Date: Wed, 2 Dec 2015 10:44:06 -0500 Subject: [PATCH 039/288] Converted cpu scan function to async call Added `.eval()` calls on input Array objects inside the following functions to ensure that the inputs are computed by the time `.get()` is called on these objects to get the data values. * reduce * setUnique * setIntersection * setUnion --- src/backend/cpu/reduce.cpp | 1 + src/backend/cpu/scan.cpp | 154 ++++++++++++++++++++----------------- src/backend/cpu/set.cpp | 13 ++++ 3 files changed, 96 insertions(+), 72 deletions(-) diff --git a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp index ffe91851b1..e01f0c51f1 100644 --- a/src/backend/cpu/reduce.cpp +++ b/src/backend/cpu/reduce.cpp @@ -89,6 +89,7 @@ namespace cpu { dim4 odims = in.dims(); odims[dim] = 1; + in.eval(); Array out = createEmptyArray(odims); static const reduce_dim_func reduce_funcs[4] = { reduce_dim() diff --git a/src/backend/cpu/scan.cpp b/src/backend/cpu/scan.cpp index 2bdda210a2..39157ca9a1 100644 --- a/src/backend/cpu/scan.cpp +++ b/src/backend/cpu/scan.cpp @@ -14,102 +14,112 @@ #include #include #include +#include +#include using af::dim4; namespace cpu { - template - struct scan_dim - { - void operator()(To *out, const dim4 ostrides, const dim4 odims, - const Ti *in , const dim4 istrides, const dim4 idims, - const int dim) - { - const int D1 = D - 1; - for (dim_t i = 0; i < odims[D1]; i++) { - scan_dim()(out + i * ostrides[D1], - ostrides, odims, - in + i * istrides[D1], - istrides, idims, - dim); - if (D1 == dim) 
break; - } - } - }; - template - struct scan_dim +template +struct scan_dim +{ + void operator()(Array out, dim_t outOffset, + const Array in, dim_t inOffset, + const int dim) const { - void operator()(To *out, const dim4 ostrides, const dim4 odims, - const Ti *in , const dim4 istrides, const dim4 idims, - const int dim) - { - - dim_t istride = istrides[dim]; - dim_t ostride = ostrides[dim]; - - Transform transform; - // FIXME: Change the name to something better - Binary scan; - - To out_val = scan.init(); - for (dim_t i = 0; i < idims[dim]; i++) { - To in_val = transform(in[i * istride]); - out_val = scan(in_val, out_val); - out[i * ostride] = out_val; - } + const dim4 odims = out.dims(); + const dim4 ostrides = out.strides(); + const dim4 istrides = in.strides(); + + const int D1 = D - 1; + for (dim_t i = 0; i < odims[D1]; i++) { + scan_dim func; + getQueue().enqueue(func, + out, outOffset + i * ostrides[D1], + in, inOffset + i * istrides[D1], dim); + if (D1 == dim) break; } - }; + } +}; - template - Array scan(const Array& in, const int dim) +template +struct scan_dim +{ + void operator()(Array output, dim_t outOffset, + const Array input, dim_t inOffset, + const int dim) const { - dim4 dims = in.dims(); + const Ti* in = input.get() + inOffset; + To* out= output.get()+ outOffset; - Array out = createValueArray(dims, 0); + const dim4 ostrides = output.strides(); + const dim4 istrides = input.strides(); + const dim4 idims = input.dims(); + + dim_t istride = istrides[dim]; + dim_t ostride = ostrides[dim]; + + Transform transform; + // FIXME: Change the name to something better + Binary scan; + + To out_val = scan.init(); + for (dim_t i = 0; i < idims[dim]; i++) { + To in_val = transform(in[i * istride]); + out_val = scan(in_val, out_val); + out[i * ostride] = out_val; + } + } +}; - switch (in.ndims()) { +template +Array scan(const Array& in, const int dim) +{ + dim4 dims = in.dims(); + Array out = createValueArray(dims, 0); + out.eval(); + in.eval(); + + switch 
(in.ndims()) { case 1: - scan_dim()(out.get(), out.strides(), out.dims(), - in.get(), in.strides(), in.dims(), dim); + scan_dim func1; + getQueue().enqueue(func1, out, 0, in, 0, dim); break; - case 2: - scan_dim()(out.get(), out.strides(), out.dims(), - in.get(), in.strides(), in.dims(), dim); + scan_dim func2; + getQueue().enqueue(func2, out, 0, in, 0, dim); break; - case 3: - scan_dim()(out.get(), out.strides(), out.dims(), - in.get(), in.strides(), in.dims(), dim); + scan_dim func3; + getQueue().enqueue(func3, out, 0, in, 0, dim); break; - case 4: - scan_dim()(out.get(), out.strides(), out.dims(), - in.get(), in.strides(), in.dims(), dim); + scan_dim func4; + getQueue().enqueue(func4, out, 0, in, 0, dim); break; - } - - return out; } + return out; +} + #define INSTANTIATE(ROp, Ti, To) \ template Array scan(const Array &in, const int dim); \ - //accum - INSTANTIATE(af_add_t, float , float ) - INSTANTIATE(af_add_t, double , double ) - INSTANTIATE(af_add_t, cfloat , cfloat ) - INSTANTIATE(af_add_t, cdouble, cdouble) - INSTANTIATE(af_add_t, int , int ) - INSTANTIATE(af_add_t, uint , uint ) - INSTANTIATE(af_add_t, intl , intl ) - INSTANTIATE(af_add_t, uintl , uintl ) - INSTANTIATE(af_add_t, char , int ) - INSTANTIATE(af_add_t, uchar , uint ) - INSTANTIATE(af_add_t, short , int ) - INSTANTIATE(af_add_t, ushort , uint ) - INSTANTIATE(af_notzero_t, char , uint ) +//accum +INSTANTIATE(af_add_t, float , float ) +INSTANTIATE(af_add_t, double , double ) +INSTANTIATE(af_add_t, cfloat , cfloat ) +INSTANTIATE(af_add_t, cdouble, cdouble) +INSTANTIATE(af_add_t, int , int ) +INSTANTIATE(af_add_t, uint , uint ) +INSTANTIATE(af_add_t, intl , intl ) +INSTANTIATE(af_add_t, uintl , uintl ) +INSTANTIATE(af_add_t, char , int ) +INSTANTIATE(af_add_t, uchar , uint ) +INSTANTIATE(af_add_t, short , int ) +INSTANTIATE(af_add_t, ushort , uint ) +INSTANTIATE(af_notzero_t, char , uint) } diff --git a/src/backend/cpu/set.cpp b/src/backend/cpu/set.cpp index 3215e6d5c2..d9ca0849c0 100644 --- 
a/src/backend/cpu/set.cpp +++ b/src/backend/cpu/set.cpp @@ -18,6 +18,8 @@ #include #include #include +#include +#include namespace cpu { @@ -28,6 +30,9 @@ namespace cpu Array setUnique(const Array &in, const bool is_sorted) { + in.eval(); + getQueue().sync(); + Array out = createEmptyArray(af::dim4()); if (is_sorted) out = copyArray(in); else out = sort(in, 0); @@ -46,6 +51,10 @@ namespace cpu const Array &second, const bool is_unique) { + first.eval(); + second.eval(); + getQueue().sync(); + Array uFirst = first; Array uSecond = second; @@ -78,6 +87,10 @@ namespace cpu const Array &second, const bool is_unique) { + first.eval(); + second.eval(); + getQueue().sync(); + Array uFirst = first; Array uSecond = second; From 6fc636fead1e3e14b9da100375c8a5651e2c1089 Mon Sep 17 00:00:00 2001 From: pradeep Date: Wed, 2 Dec 2015 11:40:47 -0500 Subject: [PATCH 040/288] fix for async sift cpu function Added input evaluation for sift cpu backend function to ensure the inputs have correct values before sift operation begins. --- src/backend/cpu/sift_nonfree.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/cpu/sift_nonfree.hpp b/src/backend/cpu/sift_nonfree.hpp index 514a134c7d..853f407f7f 100644 --- a/src/backend/cpu/sift_nonfree.hpp +++ b/src/backend/cpu/sift_nonfree.hpp @@ -968,6 +968,7 @@ namespace cpu const float img_scale, const float feature_ratio, const bool compute_GLOH) { + in.eval(); af::dim4 idims = in.dims(); const unsigned min_dim = (double_input) ? 
min(idims[0]*2, idims[1]*2) From 48a9e581d7f2b0ce1eb48171d5f1ceaaf7b4c712 Mon Sep 17 00:00:00 2001 From: pradeep Date: Wed, 2 Dec 2015 12:48:56 -0500 Subject: [PATCH 041/288] converted matchTemplate, meanShift & medfilt to async calls --- src/backend/cpu/match_template.cpp | 206 +++++++++++++++-------------- src/backend/cpu/meanshift.cpp | 174 ++++++++++++------------ src/backend/cpu/medfilt.cpp | 149 +++++++++++---------- 3 files changed, 275 insertions(+), 254 deletions(-) diff --git a/src/backend/cpu/match_template.cpp b/src/backend/cpu/match_template.cpp index 4d930145d5..02a4888864 100644 --- a/src/backend/cpu/match_template.cpp +++ b/src/backend/cpu/match_template.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include using af::dim4; @@ -22,122 +24,126 @@ namespace cpu template Array match_template(const Array &sImg, const Array &tImg) { - const dim4 sDims = sImg.dims(); - const dim4 tDims = tImg.dims(); - const dim4 sStrides = sImg.strides(); - const dim4 tStrides = tImg.strides(); - - const dim_t tDim0 = tDims[0]; - const dim_t tDim1 = tDims[1]; - const dim_t sDim0 = sDims[0]; - const dim_t sDim1 = sDims[1]; - - Array out = createEmptyArray(sDims); - const dim4 oStrides = out.strides(); - - outType tImgMean = outType(0); - dim_t winNumElements = tImg.elements(); - bool needMean = mType==AF_ZSAD || mType==AF_LSAD || - mType==AF_ZSSD || mType==AF_LSSD || - mType==AF_ZNCC; - const inType * tpl = tImg.get(); - - if (needMean) { - for(dim_t tj=0; tj out = createEmptyArray(sImg.dims()); + + auto func = [=](Array out, const Array sImg, const Array tImg) { + const dim4 sDims = sImg.dims(); + const dim4 tDims = tImg.dims(); + const dim4 sStrides = sImg.strides(); + const dim4 tStrides = tImg.strides(); + + const dim_t tDim0 = tDims[0]; + const dim_t tDim1 = tDims[1]; + const dim_t sDim0 = sDims[0]; + const dim_t sDim1 = sDims[1]; + + const dim4 oStrides = out.strides(); + + outType tImgMean = outType(0); + dim_t winNumElements = tImg.elements(); + bool 
needMean = mType==AF_ZSAD || mType==AF_LSAD || + mType==AF_ZSSD || mType==AF_LSSD || + mType==AF_ZNCC; + const inType * tpl = tImg.get(); + + if (needMean) { + for(dim_t tj=0; tj #include #include +#include +#include using af::dim4; using std::vector; @@ -31,117 +33,123 @@ inline dim_t clamp(dim_t a, dim_t mn, dim_t mx) template Array meanshift(const Array &in, const float &s_sigma, const float &c_sigma, const unsigned iter) { - const dim4 dims = in.dims(); - const dim4 istrides = in.strides(); - Array out = createEmptyArray(dims); - const dim4 ostrides = out.strides(); + Array out = createEmptyArray(in.dims()); - const dim_t bCount = (is_color ? 1 : dims[2]); - const dim_t channels = (is_color ? dims[2] : 1); + auto func = [=] (Array out, const Array in, const float s_sigma, + const float c_sigma, const unsigned iter) { + const dim4 dims = in.dims(); + const dim4 istrides = in.strides(); + const dim4 ostrides = out.strides(); - // clamp spatical and chromatic sigma's - float space_ = std::min(11.5f, s_sigma); - const dim_t radius = std::max((int)(space_ * 1.5f), 1); - const float cvar = c_sigma*c_sigma; + const dim_t bCount = (is_color ? 1 : dims[2]); + const dim_t channels = (is_color ? 
dims[2] : 1); - vector means; - vector centers; - vector tmpclrs; - means.reserve(channels); - centers.reserve(channels); - tmpclrs.reserve(channels); + // clamp spatical and chromatic sigma's + float space_ = std::min(11.5f, s_sigma); + const dim_t radius = std::max((int)(space_ * 1.5f), 1); + const float cvar = c_sigma*c_sigma; - T *outData = out.get(); - const T * inData = in.get(); + vector means; + vector centers; + vector tmpclrs; + means.reserve(channels); + centers.reserve(channels); + tmpclrs.reserve(channels); - for(dim_t b3=0; b31 - // i.e for color images where batch is along fourth dimension - centers[ch] = inData[j_in_off + i_in_off + ch*istrides[2]]; - } + dim_t i_in_off = i*istrides[0]; + dim_t i_out_off = i*ostrides[0]; - // scope of meanshift iterationd begin - for(unsigned it=0; it1 + // i.e for color images where batch is along fourth dimension + centers[ch] = inData[j_in_off + i_in_off + ch*istrides[2]]; + } - int count = 0; - int shift_x = 0; - int shift_y = 0; + // scope of meanshift iterationd begin + for(unsigned it=0; it #include #include +#include +#include using af::dim4; @@ -23,114 +25,119 @@ namespace cpu template Array medfilt(const Array &in, dim_t w_len, dim_t w_wid) { - const dim4 dims = in.dims(); - const dim4 istrides = in.strides(); - Array out = createEmptyArray(dims); - const dim4 ostrides = out.strides(); + Array out = createEmptyArray(in.dims()); - std::vector wind_vals; - wind_vals.reserve(w_len*w_wid); + auto func = [=] (Array out, const Array in, + dim_t w_len, dim_t w_wid) { + const dim4 dims = in.dims(); + const dim4 istrides = in.strides(); + const dim4 ostrides = out.strides(); - T const * in_ptr = in.get(); - T * out_ptr = out.get(); + std::vector wind_vals; + wind_vals.reserve(w_len*w_wid); - for(int b3=0; b3<(int)dims[3]; b3++) { + T const * in_ptr = in.get(); + T * out_ptr = out.get(); - for(int b2=0; b2<(int)dims[2]; b2++) { + for(int b3=0; b3<(int)dims[3]; b3++) { - for(int col=0; col<(int)dims[1]; col++) { + 
for(int b2=0; b2<(int)dims[2]; b2++) { - int ocol_off = col*ostrides[1]; + for(int col=0; col<(int)dims[1]; col++) { - for(int row=0; row<(int)dims[0]; row++) { + int ocol_off = col*ostrides[1]; - wind_vals.clear(); + for(int row=0; row<(int)dims[0]; row++) { - for(int wj=0; wj<(int)w_wid; ++wj) { + wind_vals.clear(); - bool isColOff = false; + for(int wj=0; wj<(int)w_wid; ++wj) { - int im_col = col + wj-w_wid/2; - int im_coff; - switch(pad) { - case AF_PAD_ZERO: - im_coff = im_col * istrides[1]; - if (im_col < 0 || im_col>=(int)dims[1]) - isColOff = true; - break; - case AF_PAD_SYM: - { - if (im_col < 0) { - im_col *= -1; - isColOff = true; - } + bool isColOff = false; - if (im_col>=(int)dims[1]) { - im_col = 2*((int)dims[1]-1) - im_col; - isColOff = true; - } - - im_coff = im_col * istrides[1]; - } - break; - } - - for(int wi=0; wi<(int)w_len; ++wi) { - - bool isRowOff = false; - - int im_row = row + wi-w_len/2; - int im_roff; + int im_col = col + wj-w_wid/2; + int im_coff; switch(pad) { case AF_PAD_ZERO: - im_roff = im_row * istrides[0]; - if (im_row < 0 || im_row>=(int)dims[0]) - isRowOff = true; + im_coff = im_col * istrides[1]; + if (im_col < 0 || im_col>=(int)dims[1]) + isColOff = true; break; case AF_PAD_SYM: { - if (im_row < 0) { - im_row *= -1; - isRowOff = true; + if (im_col < 0) { + im_col *= -1; + isColOff = true; } - if (im_row>=(int)dims[0]) { - im_row = 2*((int)dims[0]-1) - im_row; - isRowOff = true; + if (im_col>=(int)dims[1]) { + im_col = 2*((int)dims[1]-1) - im_col; + isColOff = true; } - im_roff = im_row * istrides[0]; + im_coff = im_col * istrides[1]; } break; } - if(isRowOff || isColOff) { + for(int wi=0; wi<(int)w_len; ++wi) { + + bool isRowOff = false; + + int im_row = row + wi-w_len/2; + int im_roff; switch(pad) { case AF_PAD_ZERO: - wind_vals.push_back(0); + im_roff = im_row * istrides[0]; + if (im_row < 0 || im_row>=(int)dims[0]) + isRowOff = true; break; case AF_PAD_SYM: - wind_vals.push_back(in_ptr[im_coff+im_roff]); + { + if (im_row < 
0) { + im_row *= -1; + isRowOff = true; + } + + if (im_row>=(int)dims[0]) { + im_row = 2*((int)dims[0]-1) - im_row; + isRowOff = true; + } + + im_roff = im_row * istrides[0]; + } break; } - } else - wind_vals.push_back(in_ptr[im_coff+im_roff]); + + if(isRowOff || isColOff) { + switch(pad) { + case AF_PAD_ZERO: + wind_vals.push_back(0); + break; + case AF_PAD_SYM: + wind_vals.push_back(in_ptr[im_coff+im_roff]); + break; + } + } else + wind_vals.push_back(in_ptr[im_coff+im_roff]); + } } - } - std::stable_sort(wind_vals.begin(),wind_vals.end()); - int off = wind_vals.size()/2; - if (wind_vals.size()%2==0) - out_ptr[ocol_off+row*ostrides[0]] = (wind_vals[off]+wind_vals[off-1])/2; - else { - out_ptr[ocol_off+row*ostrides[0]] = wind_vals[off]; + std::stable_sort(wind_vals.begin(),wind_vals.end()); + int off = wind_vals.size()/2; + if (wind_vals.size()%2==0) + out_ptr[ocol_off+row*ostrides[0]] = (wind_vals[off]+wind_vals[off-1])/2; + else { + out_ptr[ocol_off+row*ostrides[0]] = wind_vals[off]; + } } } + in_ptr += istrides[2]; + out_ptr += ostrides[2]; } - in_ptr += istrides[2]; - out_ptr += ostrides[2]; } - } + }; + getQueue().enqueue(func, out, in, w_len, w_wid); return out; } From b813fd4bf2f49bc5f9af4cae62321b2945b5f129 Mon Sep 17 00:00:00 2001 From: pradeep Date: Wed, 2 Dec 2015 13:51:42 -0500 Subject: [PATCH 042/288] nearest neighbour cpu func is asyn call now --- src/backend/cpu/nearest_neighbour.cpp | 46 ++++++++++++++++++--------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp index 79d41516e3..97f0e0a8f0 100644 --- a/src/backend/cpu/nearest_neighbour.cpp +++ b/src/backend/cpu/nearest_neighbour.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include using af::dim4; @@ -90,27 +92,18 @@ struct dist_op }; template -void nearest_neighbour_(Array& idx, Array& dist, - const Array& query, const Array& train, +void nearest_neighbour_(Array idx, Array dist, + const 
Array query, const Array train, const uint dist_dim, const uint n_dist) { uint sample_dim = (dist_dim == 0) ? 1 : 0; const dim4 qDims = query.dims(); const dim4 tDims = train.dims(); - if (n_dist > 1) { - CPU_NOT_SUPPORTED(); - } - const unsigned distLength = qDims[dist_dim]; const unsigned nQuery = qDims[sample_dim]; const unsigned nTrain = tDims[sample_dim]; - const dim4 outDims(n_dist, nQuery); - - idx = createEmptyArray(outDims); - dist = createEmptyArray(outDims); - const T* qPtr = query.get(); const T* tPtr = train.get(); uint* iPtr = idx.get(); @@ -157,11 +150,34 @@ void nearest_neighbour(Array& idx, Array& dist, const uint dist_dim, const uint n_dist, const af_match_type dist_type) { + if (n_dist > 1) { + CPU_NOT_SUPPORTED(); + } + + query.eval(); + train.eval(); + + uint sample_dim = (dist_dim == 0) ? 1 : 0; + const dim4 qDims = query.dims(); + const dim4 outDims(n_dist, qDims[sample_dim]); + + idx = createEmptyArray(outDims); + dist = createEmptyArray(outDims); + idx.eval(); + dist.eval(); + switch(dist_type) { - case AF_SAD: nearest_neighbour_(idx, dist, query, train, dist_dim, n_dist); break; - case AF_SSD: nearest_neighbour_(idx, dist, query, train, dist_dim, n_dist); break; - case AF_SHD: nearest_neighbour_(idx, dist, query, train, dist_dim, n_dist); break; - default: AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED); + case AF_SAD: + getQueue().enqueue(nearest_neighbour_, idx, dist, query, train, dist_dim, n_dist); + break; + case AF_SSD: + getQueue().enqueue(nearest_neighbour_, idx, dist, query, train, dist_dim, n_dist); + break; + case AF_SHD: + getQueue().enqueue(nearest_neighbour_, idx, dist, query, train, dist_dim, n_dist); + break; + default: + AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED); } } From 35a462c08b86dcb107f6df84428adb6dea749636 Mon Sep 17 00:00:00 2001 From: pradeep Date: Wed, 2 Dec 2015 14:38:49 -0500 Subject: [PATCH 043/288] conversion of listed functions to async calls * gradient * histogram * hsv2rgb * rgb2hsv 
* identity * inverse * iota * lookup --- src/backend/cpu/gradient.cpp | 26 +++-- src/backend/cpu/histogram.cpp | 44 +++++--- src/backend/cpu/hsv_rgb.cpp | 192 ++++++++++++++++++---------------- src/backend/cpu/identity.cpp | 44 ++++---- src/backend/cpu/inverse.cpp | 13 ++- src/backend/cpu/iota.cpp | 85 ++++++++------- src/backend/cpu/lookup.cpp | 50 +++++---- 7 files changed, 258 insertions(+), 196 deletions(-) diff --git a/src/backend/cpu/gradient.cpp b/src/backend/cpu/gradient.cpp index 8ab2fe46fc..504c02a29c 100644 --- a/src/backend/cpu/gradient.cpp +++ b/src/backend/cpu/gradient.cpp @@ -12,12 +12,20 @@ #include #include #include +#include +#include namespace cpu { - template - void gradient(Array &grad0, Array &grad1, const Array &in) - { + +template +void gradient(Array &grad0, Array &grad1, const Array &in) +{ + grad0.eval(); + grad1.eval(); + in.eval(); + + auto func = [=] (Array grad0, Array grad1, const Array in) { const af::dim4 dims = in.dims(); T *d_grad0 = grad0.get(); @@ -82,13 +90,15 @@ namespace cpu } } } - } + }; + getQueue().enqueue(func, grad0, grad1, in); +} #define INSTANTIATE(T) \ template void gradient(Array &grad0, Array &grad1, const Array &in); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) } diff --git a/src/backend/cpu/histogram.cpp b/src/backend/cpu/histogram.cpp index e382a0ee87..8fb3e43544 100644 --- a/src/backend/cpu/histogram.cpp +++ b/src/backend/cpu/histogram.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include using af::dim4; @@ -21,31 +23,39 @@ namespace cpu template Array histogram(const Array &in, const unsigned &nbins, const double &minval, const double &maxval) { - float step = (maxval - minval)/(float)nbins; + in.eval(); const dim4 inDims = in.dims(); - dim4 iStrides = in.strides(); dim4 outDims = dim4(nbins,1,inDims[2],inDims[3]); Array out = createValueArray(outDims, outType(0)); - 
dim4 oStrides = out.strides(); - dim_t nElems = inDims[0]*inDims[1]; + out.eval(); - outType *outData = out.get(); - const inType* inData= in.get(); + auto func = [=](Array out, const Array in, + const unsigned nbins, const double minval, const double maxval) { + const float step = (maxval - minval)/(float)nbins; + const dim4 inDims = in.dims(); + const dim4 iStrides = in.strides(); + const dim4 oStrides = out.strides(); + const dim_t nElems = inDims[0]*inDims[1]; - for(dim_t b3 = 0; b3 < outDims[3]; b3++) { - for(dim_t b2 = 0; b2 < outDims[2]; b2++) { - for(dim_t i=0; i #include #include +#include +#include using af::dim4; @@ -22,54 +24,60 @@ namespace cpu template Array hsv2rgb(const Array& in) { - const dim4 dims = in.dims(); - const dim4 strides = in.strides(); - Array out = createEmptyArray(dims); - dim_t obStride = out.strides()[3]; - dim_t coff = strides[2]; - dim_t bCount = dims[3]; - - for(dim_t b=0; b out = createEmptyArray(in.dims()); + + auto func = [=](Array out, const Array in) { + const dim4 dims = in.dims(); + const dim4 strides = in.strides(); + dim_t obStride = out.strides()[3]; + dim_t coff = strides[2]; + dim_t bCount = dims[3]; + + for(dim_t b=0; b hsv2rgb(const Array& in) template Array rgb2hsv(const Array& in) { - const dim4 dims = in.dims(); - const dim4 strides = in.strides(); - Array out = createEmptyArray(dims); - dim4 oStrides = out.strides(); - dim_t bCount = dims[3]; - - for(dim_t b=0; b out = createEmptyArray(in.dims()); + + auto func = [=](Array out, const Array in) { + const dim4 dims = in.dims(); + const dim4 strides = in.strides(); + dim4 oStrides = out.strides(); + dim_t bCount = dims[3]; + + for(dim_t b=0; b #include #include +#include +#include namespace cpu { - template - Array identity(const dim4& dims) - { - Array out = createEmptyArray(dims); +template +Array identity(const dim4& dims) +{ + Array out = createEmptyArray(dims); + + auto func = [=] (Array out) { T *ptr = out.get(); const dim_t *out_dims = out.dims().get(); @@ 
-31,23 +35,25 @@ namespace cpu } ptr += out_dims[0] * out_dims[1]; } - return out; - } + }; + getQueue().enqueue(func, out); + + return out; +} #define INSTANTIATE_IDENTITY(T) \ template Array identity (const af::dim4 &dims); - INSTANTIATE_IDENTITY(float) - INSTANTIATE_IDENTITY(double) - INSTANTIATE_IDENTITY(cfloat) - INSTANTIATE_IDENTITY(cdouble) - INSTANTIATE_IDENTITY(int) - INSTANTIATE_IDENTITY(uint) - INSTANTIATE_IDENTITY(intl) - INSTANTIATE_IDENTITY(uintl) - INSTANTIATE_IDENTITY(char) - INSTANTIATE_IDENTITY(uchar) - INSTANTIATE_IDENTITY(short) - INSTANTIATE_IDENTITY(ushort) - +INSTANTIATE_IDENTITY(float) +INSTANTIATE_IDENTITY(double) +INSTANTIATE_IDENTITY(cfloat) +INSTANTIATE_IDENTITY(cdouble) +INSTANTIATE_IDENTITY(int) +INSTANTIATE_IDENTITY(uint) +INSTANTIATE_IDENTITY(intl) +INSTANTIATE_IDENTITY(uintl) +INSTANTIATE_IDENTITY(char) +INSTANTIATE_IDENTITY(uchar) +INSTANTIATE_IDENTITY(short) +INSTANTIATE_IDENTITY(ushort) } diff --git a/src/backend/cpu/inverse.cpp b/src/backend/cpu/inverse.cpp index 129823b963..987ba01c53 100644 --- a/src/backend/cpu/inverse.cpp +++ b/src/backend/cpu/inverse.cpp @@ -23,6 +23,8 @@ #include #include #include +#include +#include namespace cpu { @@ -48,6 +50,7 @@ INV_FUNC(getri , cdouble, z) template Array inverse(const Array &in) { + in.eval(); int M = in.dims()[0]; int N = in.dims()[1]; @@ -58,12 +61,14 @@ Array inverse(const Array &in) } Array A = copyArray(in); - Array pivot = lu_inplace(A, false); - getri_func()(AF_LAPACK_COL_MAJOR, M, - A.get(), A.strides()[1], - pivot.get()); + auto func = [=] (Array A, Array pivot, int M) { + getri_func()(AF_LAPACK_COL_MAJOR, M, + A.get(), A.strides()[1], + pivot.get()); + }; + getQueue().enqueue(func, A, pivot, M); return A; } diff --git a/src/backend/cpu/iota.cpp b/src/backend/cpu/iota.cpp index 47bcb924e4..170b6a1570 100644 --- a/src/backend/cpu/iota.cpp +++ b/src/backend/cpu/iota.cpp @@ -14,59 +14,66 @@ #include #include #include +#include +#include using namespace std; namespace cpu { - 
/////////////////////////////////////////////////////////////////////////// - // Kernel Functions - /////////////////////////////////////////////////////////////////////////// - template - void iota(T *out, const dim4 &dims, const dim4 &strides, const dim4 &sdims, const dim4 &tdims) - { - for(dim_t w = 0; w < dims[3]; w++) { - dim_t offW = w * strides[3]; - T valW = (w % sdims[3]) * sdims[0] * sdims[1] * sdims[2]; - for(dim_t z = 0; z < dims[2]; z++) { - dim_t offWZ = offW + z * strides[2]; - T valZ = valW + (z % sdims[2]) * sdims[0] * sdims[1]; - for(dim_t y = 0; y < dims[1]; y++) { - dim_t offWZY = offWZ + y * strides[1]; - T valY = valZ + (y % sdims[1]) * sdims[0]; - for(dim_t x = 0; x < dims[0]; x++) { - dim_t id = offWZY + x; - out[id] = valY + (x % sdims[0]); - } +/////////////////////////////////////////////////////////////////////////// +// Kernel Functions +/////////////////////////////////////////////////////////////////////////// +template +void iota_(Array output, const dim4 &sdims, const dim4 &tdims) +{ + const dim4 dims = output.dims(); + T* out = output.get(); + const dim4 strides = output.strides(); + + for(dim_t w = 0; w < dims[3]; w++) { + dim_t offW = w * strides[3]; + T valW = (w % sdims[3]) * sdims[0] * sdims[1] * sdims[2]; + for(dim_t z = 0; z < dims[2]; z++) { + dim_t offWZ = offW + z * strides[2]; + T valZ = valW + (z % sdims[2]) * sdims[0] * sdims[1]; + for(dim_t y = 0; y < dims[1]; y++) { + dim_t offWZY = offWZ + y * strides[1]; + T valY = valZ + (y % sdims[1]) * sdims[0]; + for(dim_t x = 0; x < dims[0]; x++) { + dim_t id = offWZY + x; + out[id] = valY + (x % sdims[0]); } } } } +} - /////////////////////////////////////////////////////////////////////////// - // Wrapper Functions - /////////////////////////////////////////////////////////////////////////// - template - Array iota(const dim4 &dims, const dim4 &tile_dims) - { - dim4 outdims = dims * tile_dims; +/////////////////////////////////////////////////////////////////////////// +// 
Wrapper Functions +/////////////////////////////////////////////////////////////////////////// +template +Array iota(const dim4 &dims, const dim4 &tile_dims) +{ + dim4 outdims = dims * tile_dims; - Array out = createEmptyArray(outdims); - iota(out.get(), out.dims(), out.strides(), dims, tile_dims); + Array out = createEmptyArray(outdims); - return out; - } + getQueue().enqueue(iota_, out, dims, tile_dims); + + return out; +} #define INSTANTIATE(T) \ template Array iota(const af::dim4 &dims, const af::dim4 &tile_dims); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) } diff --git a/src/backend/cpu/lookup.cpp b/src/backend/cpu/lookup.cpp index 128cc02823..0aeee4dc81 100644 --- a/src/backend/cpu/lookup.cpp +++ b/src/backend/cpu/lookup.cpp @@ -10,6 +10,8 @@ #include #include #include +#include +#include namespace cpu { @@ -30,11 +32,10 @@ dim_t trimIndex(int idx, const dim_t &len) template Array lookup(const Array &input, const Array &indices, const unsigned dim) { - const dim4 iDims = input.dims(); - const dim4 iStrides = input.strides(); + input.eval(); + indices.eval(); - const in_t *inPtr = input.get(); - const idx_t *idxPtr = indices.get(); + const dim4 iDims = input.dims(); dim4 oDims(1); for (int d=0; d<4; ++d) @@ -42,35 +43,44 @@ Array lookup(const Array &input, const Array &indices, const Array out = createEmptyArray(oDims); - dim4 oStrides = out.strides(); + auto func = [=] (Array out, const Array input, + const Array indices, const unsigned dim) { + const dim4 iDims = input.dims(); + const dim4 oDims = out.dims(); + const dim4 iStrides = input.strides(); + const dim4 oStrides = out.strides(); + const in_t *inPtr = input.get(); + const idx_t 
*idxPtr = indices.get(); - in_t *outPtr = out.get(); + in_t *outPtr = out.get(); - for (dim_t l=0; l Date: Wed, 2 Dec 2015 16:18:11 -0500 Subject: [PATCH 044/288] converted join cpu func to async call --- src/backend/cpu/join.cpp | 373 ++++++++++++++++++++------------------- 1 file changed, 193 insertions(+), 180 deletions(-) diff --git a/src/backend/cpu/join.cpp b/src/backend/cpu/join.cpp index 78d2a51ab4..8af9c24f8d 100644 --- a/src/backend/cpu/join.cpp +++ b/src/backend/cpu/join.cpp @@ -11,241 +11,254 @@ #include #include #include +#include +#include namespace cpu { - template - void join_append(To *out, const Tx *X, const af::dim4 &offset, - const af::dim4 &odims, const af::dim4 &xdims, - const af::dim4 &ost, const af::dim4 &xst) - { - for(dim_t ow = 0; ow < xdims[3]; ow++) { - const dim_t xW = ow * xst[3]; - const dim_t oW = (ow + offset[3]) * ost[3]; - - for(dim_t oz = 0; oz < xdims[2]; oz++) { - const dim_t xZW = xW + oz * xst[2]; - const dim_t oZW = oW + (oz + offset[2]) * ost[2]; - - for(dim_t oy = 0; oy < xdims[1]; oy++) { - const dim_t xYZW = xZW + oy * xst[1]; - const dim_t oYZW = oZW + (oy + offset[1]) * ost[1]; - - for(dim_t ox = 0; ox < xdims[0]; ox++) { - const dim_t iMem = xYZW + ox; - const dim_t oMem = oYZW + (ox + offset[0]); - out[oMem] = X[iMem]; - } +template +void join_append(To *out, const Tx *X, const af::dim4 &offset, + const af::dim4 &odims, const af::dim4 &xdims, + const af::dim4 &ost, const af::dim4 &xst) +{ + for(dim_t ow = 0; ow < xdims[3]; ow++) { + const dim_t xW = ow * xst[3]; + const dim_t oW = (ow + offset[3]) * ost[3]; + + for(dim_t oz = 0; oz < xdims[2]; oz++) { + const dim_t xZW = xW + oz * xst[2]; + const dim_t oZW = oW + (oz + offset[2]) * ost[2]; + + for(dim_t oy = 0; oy < xdims[1]; oy++) { + const dim_t xYZW = xZW + oy * xst[1]; + const dim_t oYZW = oZW + (oy + offset[1]) * ost[1]; + + for(dim_t ox = 0; ox < xdims[0]; ox++) { + const dim_t iMem = xYZW + ox; + const dim_t oMem = oYZW + (ox + offset[0]); + out[oMem] = 
X[iMem]; } } } } +} - template - af::dim4 calcOffset(const af::dim4 dims) - { - af::dim4 offset; - offset[0] = (dim == 0) ? dims[0] : 0; - offset[1] = (dim == 1) ? dims[1] : 0; - offset[2] = (dim == 2) ? dims[2] : 0; - offset[3] = (dim == 3) ? dims[3] : 0; - return offset; - } +template +af::dim4 calcOffset(const af::dim4 dims) +{ + af::dim4 offset; + offset[0] = (dim == 0) ? dims[0] : 0; + offset[1] = (dim == 1) ? dims[1] : 0; + offset[2] = (dim == 2) ? dims[2] : 0; + offset[3] = (dim == 3) ? dims[3] : 0; + return offset; +} - template - Array join(const int dim, const Array &first, const Array &second) - { - // All dimensions except join dimension must be equal - // Compute output dims - af::dim4 odims; - af::dim4 fdims = first.dims(); - af::dim4 sdims = second.dims(); - - for(int i = 0; i < 4; i++) { - if(i == dim) { - odims[i] = fdims[i] + sdims[i]; - } else { - odims[i] = fdims[i]; - } +template +Array join(const int dim, const Array &first, const Array &second) +{ + first.eval(); + second.eval(); + + // All dimensions except join dimension must be equal + // Compute output dims + af::dim4 odims; + af::dim4 fdims = first.dims(); + af::dim4 sdims = second.dims(); + + for(int i = 0; i < 4; i++) { + if(i == dim) { + odims[i] = fdims[i] + sdims[i]; + } else { + odims[i] = fdims[i]; } + } - Array out = createEmptyArray(odims); + Array out = createEmptyArray(odims); + auto func = [=] (Array out, const Array first, const Array second) { Tx* outPtr = out.get(); const Tx* fptr = first.get(); const Ty* sptr = second.get(); af::dim4 zero(0,0,0,0); + const af::dim4 odims = out.dims(); + const af::dim4 fdims = first.dims(); + const af::dim4 sdims = second.dims(); switch(dim) { case 0: join_append(outPtr, fptr, zero, - odims, fdims, out.strides(), first.strides()); + odims, fdims, out.strides(), first.strides()); join_append(outPtr, sptr, calcOffset<0>(fdims), - odims, sdims, out.strides(), second.strides()); + odims, sdims, out.strides(), second.strides()); break; case 1: 
join_append(outPtr, fptr, zero, - odims, fdims, out.strides(), first.strides()); + odims, fdims, out.strides(), first.strides()); join_append(outPtr, sptr, calcOffset<1>(fdims), - odims, sdims, out.strides(), second.strides()); + odims, sdims, out.strides(), second.strides()); break; case 2: join_append(outPtr, fptr, zero, - odims, fdims, out.strides(), first.strides()); + odims, fdims, out.strides(), first.strides()); join_append(outPtr, sptr, calcOffset<2>(fdims), - odims, sdims, out.strides(), second.strides()); + odims, sdims, out.strides(), second.strides()); break; case 3: join_append(outPtr, fptr, zero, - odims, fdims, out.strides(), first.strides()); + odims, fdims, out.strides(), first.strides()); join_append(outPtr, sptr, calcOffset<3>(fdims), - odims, sdims, out.strides(), second.strides()); + odims, sdims, out.strides(), second.strides()); break; } + }; + getQueue().enqueue(func, out, first, second); - return out; - } + return out; +} - template - void join_wrapper(const int dim, Array &out, const std::vector> &inputs) - { - af::dim4 zero(0,0,0,0); - af::dim4 d = zero; - switch(dim) { - case 0: - join_append(out.get(), inputs[0].get(), zero, - out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); - for(int i = 1; i < n_arrays; i++) { - d += inputs[i - 1].dims(); - join_append(out.get(), inputs[i].get(), calcOffset<0>(d), - out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); - } - break; - case 1: - join_append(out.get(), inputs[0].get(), zero, - out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); - for(int i = 1; i < n_arrays; i++) { - d += inputs[i - 1].dims(); - join_append(out.get(), inputs[i].get(), calcOffset<1>(d), - out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); - } - break; - case 2: - join_append(out.get(), inputs[0].get(), zero, - out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); - for(int i = 1; i < n_arrays; i++) { - d += inputs[i - 1].dims(); - 
join_append(out.get(), inputs[i].get(), calcOffset<2>(d), - out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); - } - break; - case 3: - join_append(out.get(), inputs[0].get(), zero, - out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); - for(int i = 1; i < n_arrays; i++) { - d += inputs[i - 1].dims(); - join_append(out.get(), inputs[i].get(), calcOffset<3>(d), - out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); - } - break; - } +template +void join_wrapper(const int dim, Array out, const std::vector> inputs) +{ + af::dim4 zero(0,0,0,0); + af::dim4 d = zero; + switch(dim) { + case 0: + join_append(out.get(), inputs[0].get(), zero, + out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); + for(int i = 1; i < n_arrays; i++) { + d += inputs[i - 1].dims(); + join_append(out.get(), inputs[i].get(), calcOffset<0>(d), + out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); + } + break; + case 1: + join_append(out.get(), inputs[0].get(), zero, + out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); + for(int i = 1; i < n_arrays; i++) { + d += inputs[i - 1].dims(); + join_append(out.get(), inputs[i].get(), calcOffset<1>(d), + out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); + } + break; + case 2: + join_append(out.get(), inputs[0].get(), zero, + out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); + for(int i = 1; i < n_arrays; i++) { + d += inputs[i - 1].dims(); + join_append(out.get(), inputs[i].get(), calcOffset<2>(d), + out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); + } + break; + case 3: + join_append(out.get(), inputs[0].get(), zero, + out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); + for(int i = 1; i < n_arrays; i++) { + d += inputs[i - 1].dims(); + join_append(out.get(), inputs[i].get(), calcOffset<3>(d), + out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); + } + break; } +} - template - Array 
join(const int dim, const std::vector> &inputs) - { - // All dimensions except join dimension must be equal - // Compute output dims - af::dim4 odims; - const dim_t n_arrays = inputs.size(); - std::vector idims(n_arrays); - - dim_t dim_size = 0; - for(int i = 0; i < (int)idims.size(); i++) { - idims[i] = inputs[i].dims(); - dim_size += idims[i][dim]; - } - - for(int i = 0; i < 4; i++) { - if(i == dim) { - odims[i] = dim_size; - } else { - odims[i] = idims[0][i]; - } - } +template +Array join(const int dim, const std::vector> &inputs) +{ + for (int i=0; i idims(n_arrays); - Array out = createEmptyArray(odims); + dim_t dim_size = 0; + for(int i = 0; i < (int)idims.size(); i++) { + idims[i] = inputs[i].dims(); + dim_size += idims[i][dim]; + } - switch(n_arrays) { - case 1: - join_wrapper(dim, out, inputs); - break; - case 2: - join_wrapper(dim, out, inputs); - break; - case 3: - join_wrapper(dim, out, inputs); - break; - case 4: - join_wrapper(dim, out, inputs); - break; - case 5: - join_wrapper(dim, out, inputs); - break; - case 6: - join_wrapper(dim, out, inputs); - break; - case 7: - join_wrapper(dim, out, inputs); - break; - case 8: - join_wrapper(dim, out, inputs); - break; - case 9: - join_wrapper(dim, out, inputs); - break; - case 10: - join_wrapper(dim, out, inputs); - break; + for(int i = 0; i < 4; i++) { + if(i == dim) { + odims[i] = dim_size; + } else { + odims[i] = idims[0][i]; } + } - return out; + Array out = createEmptyArray(odims); + + switch(n_arrays) { + case 1: + getQueue().enqueue(join_wrapper, dim, out, inputs); + break; + case 2: + getQueue().enqueue(join_wrapper, dim, out, inputs); + break; + case 3: + getQueue().enqueue(join_wrapper, dim, out, inputs); + break; + case 4: + getQueue().enqueue(join_wrapper, dim, out, inputs); + break; + case 5: + getQueue().enqueue(join_wrapper, dim, out, inputs); + break; + case 6: + getQueue().enqueue(join_wrapper, dim, out, inputs); + break; + case 7: + getQueue().enqueue(join_wrapper, dim, out, inputs); + 
break; + case 8: + getQueue().enqueue(join_wrapper, dim, out, inputs); + break; + case 9: + getQueue().enqueue(join_wrapper, dim, out, inputs); + break; + case 10: + getQueue().enqueue(join_wrapper, dim, out, inputs); + break; } + return out; +} + #define INSTANTIATE(Tx, Ty) \ template Array join(const int dim, const Array &first, const Array &second); - INSTANTIATE(float, float) - INSTANTIATE(double, double) - INSTANTIATE(cfloat, cfloat) - INSTANTIATE(cdouble, cdouble) - INSTANTIATE(int, int) - INSTANTIATE(uint, uint) - INSTANTIATE(intl, intl) - INSTANTIATE(uintl, uintl) - INSTANTIATE(uchar, uchar) - INSTANTIATE(char, char) - INSTANTIATE(ushort, ushort) - INSTANTIATE(short, short) +INSTANTIATE(float, float) +INSTANTIATE(double, double) +INSTANTIATE(cfloat, cfloat) +INSTANTIATE(cdouble, cdouble) +INSTANTIATE(int, int) +INSTANTIATE(uint, uint) +INSTANTIATE(intl, intl) +INSTANTIATE(uintl, uintl) +INSTANTIATE(uchar, uchar) +INSTANTIATE(char, char) +INSTANTIATE(ushort, ushort) +INSTANTIATE(short, short) #undef INSTANTIATE #define INSTANTIATE(T) \ template Array join(const int dim, const std::vector> &inputs); - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(ushort) - INSTANTIATE(short) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(ushort) +INSTANTIATE(short) #undef INSTANTIATE } From 0c72451eb1ac629940a76f6f57890e4cba0d6df0 Mon Sep 17 00:00:00 2001 From: pradeep Date: Wed, 2 Dec 2015 16:22:42 -0500 Subject: [PATCH 045/288] converted cpu regions function to asynchronous call --- src/backend/cpu/regions.cpp | 157 +++++++++++++++++++----------------- 1 file changed, 83 insertions(+), 74 deletions(-) diff --git 
a/src/backend/cpu/regions.cpp b/src/backend/cpu/regions.cpp index b753fb5547..f7309c8dbe 100644 --- a/src/backend/cpu/regions.cpp +++ b/src/backend/cpu/regions.cpp @@ -17,6 +17,8 @@ #include #include #include +#include +#include using af::dim4; @@ -106,97 +108,104 @@ static void setUnion(LabelNode* x, LabelNode* y) template Array regions(const Array &in, af_connectivity connectivity) { - const dim4 in_dims = in.dims(); + in.eval(); // Create output placeholder - Array out = createValueArray(in_dims, (T)0); - - const char *in_ptr = in.get(); - T *out_ptr = out.get(); - - // Map labels - typedef typename std::map* > label_map_t; - typedef typename label_map_t::iterator label_map_iterator_t; - - label_map_t lmap; - - // Initial label - T label = (T)1; - - for (int j = 0; j < (int)in_dims[1]; j++) { - for (int i = 0; i < (int)in_dims[0]; i++) { - int idx = j * in_dims[0] + i; - if (in_ptr[idx] != 0) { - std::vector l; - - // Test neighbors - if (i > 0 && out_ptr[j * (int)in_dims[0] + i-1] > 0) - l.push_back(out_ptr[j * in_dims[0] + i-1]); - if (j > 0 && out_ptr[(j-1) * (int)in_dims[0] + i] > 0) - l.push_back(out_ptr[(j-1) * in_dims[0] + i]); - if (connectivity == AF_CONNECTIVITY_8 && i > 0 && j > 0 && out_ptr[(j-1) * in_dims[0] + i-1] > 0) - l.push_back(out_ptr[(j-1) * in_dims[0] + i-1]); - if (connectivity == AF_CONNECTIVITY_8 && i < (int)in_dims[0] - 1 && j > 0 && out_ptr[(j-1) * in_dims[0] + i+1] != 0) - l.push_back(out_ptr[(j-1) * in_dims[0] + i+1]); - - if (!l.empty()) { - T minl = l[0]; - for (size_t k = 0; k < l.size(); k++) { - minl = min(l[k], minl); - label_map_iterator_t cur_map = lmap.find(l[k]); - LabelNode *node = cur_map->second; - // Group labels of the same region under a disjoint set - for (size_t m = k+1; m < l.size(); m++) - setUnion(node, lmap.find(l[m])->second); + Array out = createValueArray(in.dims(), (T)0); + out.eval(); + + auto func = [=] (Array out, const Array in, af_connectivity connectivity) { + const dim4 in_dims = in.dims(); + const 
char *in_ptr = in.get(); + T *out_ptr = out.get(); + + // Map labels + typedef typename std::map* > label_map_t; + typedef typename label_map_t::iterator label_map_iterator_t; + + label_map_t lmap; + + // Initial label + T label = (T)1; + + for (int j = 0; j < (int)in_dims[1]; j++) { + for (int i = 0; i < (int)in_dims[0]; i++) { + int idx = j * in_dims[0] + i; + if (in_ptr[idx] != 0) { + std::vector l; + + // Test neighbors + if (i > 0 && out_ptr[j * (int)in_dims[0] + i-1] > 0) + l.push_back(out_ptr[j * in_dims[0] + i-1]); + if (j > 0 && out_ptr[(j-1) * (int)in_dims[0] + i] > 0) + l.push_back(out_ptr[(j-1) * in_dims[0] + i]); + if (connectivity == AF_CONNECTIVITY_8 && i > 0 && + j > 0 && out_ptr[(j-1) * in_dims[0] + i-1] > 0) + l.push_back(out_ptr[(j-1) * in_dims[0] + i-1]); + if (connectivity == AF_CONNECTIVITY_8 && + i < (int)in_dims[0] - 1 && j > 0 && out_ptr[(j-1) * in_dims[0] + i+1] != 0) + l.push_back(out_ptr[(j-1) * in_dims[0] + i+1]); + + if (!l.empty()) { + T minl = l[0]; + for (size_t k = 0; k < l.size(); k++) { + minl = min(l[k], minl); + label_map_iterator_t cur_map = lmap.find(l[k]); + LabelNode *node = cur_map->second; + // Group labels of the same region under a disjoint set + for (size_t m = k+1; m < l.size(); m++) + setUnion(node, lmap.find(l[m])->second); + } + // Set label to smallest neighbor label + out_ptr[idx] = minl; + } + else { + // Insert new label in map + LabelNode *node = new LabelNode(label); + lmap.insert(std::pair* >(label, node)); + out_ptr[idx] = label++; } - // Set label to smallest neighbor label - out_ptr[idx] = minl; - } - else { - // Insert new label in map - LabelNode *node = new LabelNode(label); - lmap.insert(std::pair* >(label, node)); - out_ptr[idx] = label++; } } } - } - std::set removed; + std::set removed; - for (int j = 0; j < (int)in_dims[1]; j++) { - for (int i = 0; i < (int)in_dims[0]; i++) { - int idx = j * (int)in_dims[0] + i; - if (in_ptr[idx] != 0) { - T l = out_ptr[idx]; - label_map_iterator_t cur_map = 
lmap.find(l); + for (int j = 0; j < (int)in_dims[1]; j++) { + for (int i = 0; i < (int)in_dims[0]; i++) { + int idx = j * (int)in_dims[0] + i; + if (in_ptr[idx] != 0) { + T l = out_ptr[idx]; + label_map_iterator_t cur_map = lmap.find(l); - if (cur_map != lmap.end()) { - LabelNode* node = cur_map->second; + if (cur_map != lmap.end()) { + LabelNode* node = cur_map->second; - LabelNode* node_root = find(node); - out_ptr[idx] = node_root->getMinLabel(); + LabelNode* node_root = find(node); + out_ptr[idx] = node_root->getMinLabel(); - // Mark removed labels (those that are part of a region - // that contains a smaller label) - if (node->getMinLabel() < l || node_root->getMinLabel() < l) - removed.insert(l); - if (node->getLabel() > node->getMinLabel()) - removed.insert(node->getLabel()); + // Mark removed labels (those that are part of a region + // that contains a smaller label) + if (node->getMinLabel() < l || node_root->getMinLabel() < l) + removed.insert(l); + if (node->getLabel() > node->getMinLabel()) + removed.insert(node->getLabel()); + } } } } - } - // Calculate final neighbors (ensure final labels are sequential) - for (int j = 0; j < (int)in_dims[1]; j++) { - for (int i = 0; i < (int)in_dims[0]; i++) { - int idx = j * (int)in_dims[0] + i; - if (out_ptr[idx] > 0) { - out_ptr[idx] -= distance(removed.begin(), removed.lower_bound(out_ptr[idx])); + // Calculate final neighbors (ensure final labels are sequential) + for (int j = 0; j < (int)in_dims[1]; j++) { + for (int i = 0; i < (int)in_dims[0]; i++) { + int idx = j * (int)in_dims[0] + i; + if (out_ptr[idx] > 0) { + out_ptr[idx] -= distance(removed.begin(), removed.lower_bound(out_ptr[idx])); + } } } - } + }; + getQueue().enqueue(func, out, in, connectivity); return out; } From 53de79030d346f54d7293e159441f3267747489a Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 3 Dec 2015 12:07:02 -0500 Subject: [PATCH 046/288] Removed dead code from opencl::DeviceManager class --- src/backend/opencl/platform.cpp | 5 ----- 
src/backend/opencl/platform.hpp | 2 -- 2 files changed, 7 deletions(-) diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 85364c4297..6f9ae99116 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -98,7 +98,6 @@ DeviceManager::~DeviceManager() for (auto q: mQueues) delete q; for (auto d : mDevices) delete d; for (auto c : mContexts) delete c; - for (auto p : mPlatforms) delete p; #endif } @@ -123,9 +122,6 @@ DeviceManager::DeviceManager() #endif }; - for (auto &platform : platforms) - mPlatforms.push_back(new Platform(platform)); - unsigned nDevices = 0; for (auto devType : DEVC_TYPES) { for (auto &platform : platforms) { @@ -150,7 +146,6 @@ DeviceManager::DeviceManager() mDevices.push_back(new Device(dev)); mContexts.push_back(ctx); mQueues.push_back(cq); - mCtxOffsets.push_back(nDevices); mIsGLSharingOn.push_back(false); } } diff --git a/src/backend/opencl/platform.hpp b/src/backend/opencl/platform.hpp index 90f57aed39..7f0dab6f94 100644 --- a/src/backend/opencl/platform.hpp +++ b/src/backend/opencl/platform.hpp @@ -70,8 +70,6 @@ class DeviceManager std::vector mQueues; std::vector mDevices; std::vector mContexts; - std::vector mPlatforms; - std::vector mCtxOffsets; std::vector mIsGLSharingOn; unsigned mActiveCtxId; From ce2d6a6c5d2fa20478f701aedd31ea291f3356f6 Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 3 Dec 2015 16:03:09 -0500 Subject: [PATCH 047/288] Enables use of ArrayFire with external context & command queue --- include/af/opencl.h | 47 +++++++++++++++ src/backend/opencl/platform.cpp | 103 +++++++++++++++++++++++++++++++- src/backend/opencl/platform.hpp | 17 +++++- 3 files changed, 164 insertions(+), 3 deletions(-) diff --git a/include/af/opencl.h b/include/af/opencl.h index 271879fdc9..99080a518e 100644 --- a/include/af/opencl.h +++ b/include/af/opencl.h @@ -63,6 +63,53 @@ AFAPI af_err afcl_get_device_id(cl_device_id *id); AFAPI af_err afcl_set_device_id(cl_device_id id); #endif 
+#if AF_API_VERSION >= 33 +/** + Push user provided device control constructs into the ArrayFire device manager pool + + This function should be used only when the user would like ArrayFire to use a + user generated OpenCL context and related objects for ArrayFire operations. + + \param[in] dev is the OpenCL device for which user provided context will be used by ArrayFire + \param[in] ctx is the user provided OpenCL cl_context to be used by ArrayFire + \param[in] que is the user provided OpenCL cl_command_queue to be used by ArrayFire. If this + parameter is NULL, then we create a command queue for the user using the OpenCL + context they provided us. + + \note The cl_* objects are passed onto c++ objects (cl::Device, cl::Context & cl::CommandQueue) + that are defined in the `cl.hpp` OpenCL c++ header provided by Khronos Group Inc. Therefore, please + be aware of the lifetime of the cl_* objects before passing them to ArrayFire. +*/ +AFAPI af_err afcl_push_device_context(cl_device_id dev, cl_context ctx, cl_command_queue que); +#endif + +#if AF_API_VERSION >= 33 +/** + Set active device using cl_context and cl_device_id + + \param[in] dev is the OpenCL device id that is to be set as Active device inside ArrayFire + \param[in] ctx is the OpenCL cl_context being used by ArrayFire +*/ +AFAPI af_err afcl_set_device_context(cl_device_id dev, cl_context ctx); +#endif + +#if AF_API_VERSION >= 33 +/** + Remove the user provided device control constructs from the ArrayFire device manager pool + + This function should be used only when the user would like ArrayFire to remove an already + pushed user generated OpenCL context and related objects.
+ + \param[in] dev is the OpenCL device id that has to be popped + \param[in] ctx is the cl_context object to be removed from ArrayFire pool + + \note Any reference counts incremented for cl_* objects by ArrayFire internally are decremented + by this func call and you won't be able to call `afcl_set_device_context` on these objects after + this function has been called. +*/ +AFAPI af_err afcl_pop_device_context(cl_device_id dev, cl_context ctx); +#endif + /** @} */ diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 6f9ae99116..32ba72d27b 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -108,7 +108,7 @@ void DeviceManager::setContext(int device) } DeviceManager::DeviceManager() - : mActiveCtxId(0), mActiveQId(0) + : mUserDeviceOffset(0), mActiveCtxId(0), mActiveQId(0) { try { std::vector platforms; @@ -181,6 +181,7 @@ DeviceManager::DeviceManager() } } #endif + mUserDeviceOffset = mDevices.size(); } @@ -472,6 +473,88 @@ void DeviceManager::markDeviceForInterop(const int device, const fg::Window* wHa } #endif +void pushDeviceContext(cl_device_id dev, cl_context ctx, cl_command_queue que) +{ + try { + DeviceManager& devMngr = DeviceManager::getInstance(); + cl::Device* tDevice = new cl::Device(dev); + cl::Context* tContext = new cl::Context(ctx); + cl::CommandQueue* tQueue = (que==NULL ? 
+ new cl::CommandQueue(*tContext, *tDevice) : new cl::CommandQueue(que)); + devMngr.mDevices.push_back(tDevice); + devMngr.mContexts.push_back(tContext); + devMngr.mQueues.push_back(tQueue); + // FIXME: add OpenGL Interop for user provided contexts later + devMngr.mIsGLSharingOn.push_back(false); + } catch (const cl::Error &ex) { + CL_TO_AF_ERROR(ex); + } +} + +void setDeviceContext(cl_device_id dev, cl_context ctx) +{ + // FIXME: add OpenGL Interop for user provided contexts later + try { + DeviceManager& devMngr = DeviceManager::getInstance(); + const int dCount = devMngr.mDevices.size(); + for (int i=0; ioperator()()==dev && + devMngr.mContexts[i]->operator()()==ctx) { + setDevice(i); + return; + } + } + } catch (const cl::Error &ex) { + CL_TO_AF_ERROR(ex); + } + AF_ERROR("No matching device found", AF_ERR_ARG); +} + +void popDeviceContext(cl_device_id dev, cl_context ctx) +{ + try { + if (getDevice()() == dev && getContext()()==ctx) { + AF_ERROR("Cannot pop the device currently in use", AF_ERR_ARG); + } + + DeviceManager& devMngr = DeviceManager::getInstance(); + const int dCount = devMngr.mDevices.size(); + int deleteIdx = -1; + for (int i = 0; ioperator()()==dev && + devMngr.mContexts[i]->operator()()==ctx) { + deleteIdx = i; + break; + } + } + if (deleteIdx < (int)devMngr.mUserDeviceOffset) { + AF_ERROR("Cannot pop ArrayFire internal devices", AF_ERR_ARG); + } else if (deleteIdx == -1) { + AF_ERROR("No matching device found", AF_ERR_ARG); + } else { + // FIXME: this case can potentially cause issues due to the + // modification of the device pool stl containers. + + // IF the current active device is enumerated at a position + // that lies ahead of the device that has been requested + // to be removed. We just pop the entries from pool since it + // has no side effects. 
+ devMngr.mDevices.erase(devMngr.mDevices.begin()+deleteIdx); + devMngr.mContexts.erase(devMngr.mContexts.begin()+deleteIdx); + devMngr.mQueues.erase(devMngr.mQueues.begin()+deleteIdx); + // FIXME: add OpenGL Interop for user provided contexts later + devMngr.mIsGLSharingOn.erase(devMngr.mIsGLSharingOn.begin()+deleteIdx); + // OTHERWISE, update(decrement) the `mActive*Id` variables + if (deleteIdx < (int)devMngr.mActiveCtxId) { + --devMngr.mActiveCtxId; + --devMngr.mActiveQId; + } + } + } catch (const cl::Error &ex) { + CL_TO_AF_ERROR(ex); + } +} + } using namespace opencl; @@ -502,3 +585,21 @@ af_err afcl_set_device_id(cl_device_id id) setDevice(getDeviceIdFromNativeId(id)); return AF_SUCCESS; } + +af_err afcl_push_device_context(cl_device_id dev, cl_context ctx, cl_command_queue que) +{ + pushDeviceContext(dev, ctx, que); + return AF_SUCCESS; +} + +af_err afcl_set_device_context(cl_device_id dev, cl_context ctx) +{ + setDeviceContext(dev, ctx); + return AF_SUCCESS; +} + +af_err afcl_pop_device_context(cl_device_id dev, cl_context ctx) +{ + popDeviceContext(dev, ctx); + return AF_SUCCESS; +} diff --git a/src/backend/opencl/platform.hpp b/src/backend/opencl/platform.hpp index 7f0dab6f94..022cd7e52d 100644 --- a/src/backend/opencl/platform.hpp +++ b/src/backend/opencl/platform.hpp @@ -43,8 +43,14 @@ class DeviceManager friend int setDevice(int device); + friend void pushDeviceContext(cl_device_id dev, cl_context cxt, cl_command_queue que); + + friend void setDeviceContext(cl_device_id dev, cl_context cxt); + + friend void popDeviceContext(cl_device_id dev, cl_context ctx); + public: - static const unsigned MAX_DEVICES = 16; + static const unsigned MAX_DEVICES = 32; static DeviceManager& getInstance(); @@ -67,10 +73,11 @@ class DeviceManager private: // Attributes - std::vector mQueues; std::vector mDevices; std::vector mContexts; + std::vector mQueues; std::vector mIsGLSharingOn; + unsigned mUserDeviceOffset; unsigned mActiveCtxId; unsigned mActiveQId; @@ -100,6 
+107,12 @@ std::string getPlatformName(const cl::Device &device); int setDevice(int device); +void pushDeviceContext(cl_device_id dev, cl_context cxt, cl_command_queue que); + +void setDeviceContext(cl_device_id dev, cl_context cxt); + +void popDeviceContext(cl_device_id dev, cl_context ctx); + void sync(int device); } From f65ee89baf9270bdd1b85c8317c87820f283d57f Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 4 Dec 2015 14:19:54 -0500 Subject: [PATCH 048/288] cpp wrappers for opencl external context related fns --- include/af/opencl.h | 55 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/include/af/opencl.h b/include/af/opencl.h index 99080a518e..0aa8981eeb 100644 --- a/include/af/opencl.h +++ b/include/af/opencl.h @@ -194,6 +194,61 @@ namespace afcl } #endif +#if AF_API_VERSION >= 33 +/** + Push user provided device control constructs into the ArrayFire device manager pool + + This function should be used only when the user would like ArrayFire to use a + user-generated OpenCL context and related objects for ArrayFire operations. + + \param[in] dev is the OpenCL device for which user provided context will be used by ArrayFire + \param[in] ctx is the user provided OpenCL cl_context to be used by ArrayFire + \param[in] que is the user provided OpenCL cl_command_queue to be used by ArrayFire. If this + parameter is NULL, then we create a command queue for the user using the OpenCL + context they provided us. + + \note The cl_* objects are passed onto c++ objects (cl::Device, cl::Context & cl::CommandQueue) + that are defined in the `cl.hpp` OpenCL c++ header provided by Khronos Group Inc. Therefore, please + be aware of the lifetime of the cl_* objects before passing them to ArrayFire. 
+*/ +static inline void pushDevice(cl_device_id dev, cl_context ctx, cl_command_queue que) +{ + af_err err = afcl_push_device_context(dev, ctx, que); + if (err!=AF_SUCCESS) throw af::exception("Failed to push user provided device/context to ArrayFire pool"); +} + +/** + Set active device using cl_context and cl_device_id + + \param[in] dev is the OpenCL device id that is to be set as Active device inside ArrayFire + \param[in] ctx is the OpenCL cl_context being used by ArrayFire +*/ +static inline void setDevice(cl_device_id dev, cl_context ctx) +{ + af_err err = afcl_set_device_context(dev, ctx); + if (err!=AF_SUCCESS) throw af::exception("Failed to set device based on cl_device_id & cl_context"); +} + +/** + Remove the user provided device control constructs from the ArrayFire device manager pool + + This function should be used only when the user would like ArrayFire to remove an already + pushed user generated OpenCL context and related objects. + + \param[in] dev is the OpenCL device id that has to be popped + \param[in] ctx is the cl_context object to be removed from ArrayFire pool + + \note Any reference counts incremented for cl_* objects by ArrayFire internally are decremented + by this func call and you won't be able to call `afcl_set_device_context` on these objects after + this function has been called. 
+*/ +static inline void popDevice(cl_device_id dev, cl_context ctx) +{ + af_err err = afcl_pop_device_context(dev, ctx); + if (err!=AF_SUCCESS) throw af::exception("Failed to remove the requested device from ArrayFire device pool"); +} +#endif + /** Create an af::array object from an OpenCL cl_mem buffer From 2bcc6de2932d9070863ee26b6a762020caef90eb Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 4 Dec 2015 14:20:27 -0500 Subject: [PATCH 049/288] unit tests for afcl::{pushDevice, setDevice, popDevice} fns --- test/ocl_ext_context.cpp | 112 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 test/ocl_ext_context.cpp diff --git a/test/ocl_ext_context.cpp b/test/ocl_ext_context.cpp new file mode 100644 index 0000000000..6b9b48086e --- /dev/null +++ b/test/ocl_ext_context.cpp @@ -0,0 +1,112 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#if defined(AF_OPENCL) +#include +#include + +using namespace std; + +inline void checkErr(cl_int err, const char * name) { + if (err != CL_SUCCESS) { + std::cerr << "ERROR: " << name << " (" << err << ")" << std::endl; + exit(EXIT_FAILURE); + } +} + +void getExternals(cl_device_id &deviceId, cl_context &context, cl_command_queue &queue) +{ + static cl_device_id dId = NULL; + static cl_context cId = NULL; + static cl_command_queue qId = NULL; + static bool call_once = true; + + if (call_once) { + cl_platform_id platformId = NULL; + cl_uint numPlatforms; + cl_uint numDevices; + cl_int errorCode = 0; + + checkErr(clGetPlatformIDs(1, &platformId, &numPlatforms), + "Get Platforms failed"); + + checkErr(clGetDeviceIDs(platformId, CL_DEVICE_TYPE_DEFAULT, 1, &dId, &numDevices), + "Get cl_device_id failed"); + + cId = clCreateContext(NULL, 1, &dId, NULL, NULL, &errorCode); + checkErr(errorCode, "Context creation failed"); + + qId = clCreateCommandQueue(cId, dId, 0, &errorCode); + checkErr(errorCode, "Command queue creation failed"); + call_once = false; + } + deviceId = dId; + context = cId; + queue = qId; +} + +TEST(OCLExtContext, push) +{ + cl_device_id deviceId = NULL; + cl_context context = NULL; + cl_command_queue queue = NULL; + + getExternals(deviceId, context, queue); + int dCount = af::getDeviceCount(); + printf("%d devices before afcl::pushDevice\n", dCount); + af::info(); + afcl::pushDevice(deviceId, context, queue); + ASSERT_EQ(true, dCount+1==af::getDeviceCount()); + printf("%d devices after afcl::pushDevice\n", af::getDeviceCount()); + af::info(); +} + +TEST(OCLExtContext, set) +{ + cl_device_id deviceId = NULL; + cl_context context = NULL; + cl_command_queue queue = NULL; + + getExternals(deviceId, context, queue); + afcl::setDevice(deviceId, context); + af::info(); + + const 
int x = 5; + const int y = 5; + const int s = x * y; + af::array a = af::constant(1, x, y); + vector host(s); + a.host((void*)host.data()); + for (int i=0; i Date: Fri, 4 Dec 2015 14:34:32 -0500 Subject: [PATCH 050/288] Style changes in opencl header --- include/af/opencl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/af/opencl.h b/include/af/opencl.h index 0aa8981eeb..6a811422db 100644 --- a/include/af/opencl.h +++ b/include/af/opencl.h @@ -216,7 +216,9 @@ static inline void pushDevice(cl_device_id dev, cl_context ctx, cl_command_queue af_err err = afcl_push_device_context(dev, ctx, que); if (err!=AF_SUCCESS) throw af::exception("Failed to push user provided device/context to ArrayFire pool"); } +#endif +#if AF_API_VERSION >= 33 /** Set active device using cl_context and cl_device_id @@ -228,7 +230,9 @@ static inline void setDevice(cl_device_id dev, cl_context ctx) af_err err = afcl_set_device_context(dev, ctx); if (err!=AF_SUCCESS) throw af::exception("Failed to set device based on cl_device_id & cl_context"); } +#endif +#if AF_API_VERSION >= 33 /** Remove the user provided device control constructs from the ArrayFire device manager pool From d41839f93a6177eff294192b12b37f52232fe6ff Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 4 Dec 2015 15:39:57 -0500 Subject: [PATCH 051/288] api name change for afcl external context functionality --- include/af/opencl.h | 12 ++++++------ src/backend/opencl/platform.cpp | 12 ++++++------ src/backend/opencl/platform.hpp | 8 ++++---- test/ocl_ext_context.cpp | 12 ++++++------ 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/af/opencl.h b/include/af/opencl.h index 6a811422db..88e47d2b16 100644 --- a/include/af/opencl.h +++ b/include/af/opencl.h @@ -80,7 +80,7 @@ AFAPI af_err afcl_set_device_id(cl_device_id id); that are defined in the `cl.hpp` OpenCL c++ header provided by Khronos Group Inc. Therefore, please be aware of the lifetime of the cl_* objects before passing them to ArrayFire. 
*/ -AFAPI af_err afcl_push_device_context(cl_device_id dev, cl_context ctx, cl_command_queue que); +AFAPI af_err afcl_add_device_context(cl_device_id dev, cl_context ctx, cl_command_queue que); #endif #if AF_API_VERSION >= 33 @@ -107,7 +107,7 @@ AFAPI af_err afcl_set_device_context(cl_device_id dev, cl_context ctx); by this func call and you won't be able to call `afcl_set_device_context` on these objects after this function has been called. */ -AFAPI af_err afcl_pop_device_context(cl_device_id dev, cl_context ctx); +AFAPI af_err afcl_delete_device_context(cl_device_id dev, cl_context ctx); #endif /** @@ -211,9 +211,9 @@ namespace afcl that are defined in the `cl.hpp` OpenCL c++ header provided by Khronos Group Inc. Therefore, please be aware of the lifetime of the cl_* objects before passing them to ArrayFire. */ -static inline void pushDevice(cl_device_id dev, cl_context ctx, cl_command_queue que) +static inline void addDevice(cl_device_id dev, cl_context ctx, cl_command_queue que) { - af_err err = afcl_push_device_context(dev, ctx, que); + af_err err = afcl_add_device_context(dev, ctx, que); if (err!=AF_SUCCESS) throw af::exception("Failed to push user provided device/context to ArrayFire pool"); } #endif @@ -246,9 +246,9 @@ static inline void setDevice(cl_device_id dev, cl_context ctx) by this func call and you won't be able to call `afcl_set_device_context` on these objects after this function has been called. 
*/ -static inline void popDevice(cl_device_id dev, cl_context ctx) +static inline void deleteDevice(cl_device_id dev, cl_context ctx) { - af_err err = afcl_pop_device_context(dev, ctx); + af_err err = afcl_delete_device_context(dev, ctx); if (err!=AF_SUCCESS) throw af::exception("Failed to remove the requested device from ArrayFire device pool"); } #endif diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 32ba72d27b..510cb50e48 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -473,7 +473,7 @@ void DeviceManager::markDeviceForInterop(const int device, const fg::Window* wHa } #endif -void pushDeviceContext(cl_device_id dev, cl_context ctx, cl_command_queue que) +void addDeviceContext(cl_device_id dev, cl_context ctx, cl_command_queue que) { try { DeviceManager& devMngr = DeviceManager::getInstance(); @@ -510,7 +510,7 @@ void setDeviceContext(cl_device_id dev, cl_context ctx) AF_ERROR("No matching device found", AF_ERR_ARG); } -void popDeviceContext(cl_device_id dev, cl_context ctx) +void removeDeviceContext(cl_device_id dev, cl_context ctx) { try { if (getDevice()() == dev && getContext()()==ctx) { @@ -586,9 +586,9 @@ af_err afcl_set_device_id(cl_device_id id) return AF_SUCCESS; } -af_err afcl_push_device_context(cl_device_id dev, cl_context ctx, cl_command_queue que) +af_err afcl_add_device_context(cl_device_id dev, cl_context ctx, cl_command_queue que) { - pushDeviceContext(dev, ctx, que); + addDeviceContext(dev, ctx, que); return AF_SUCCESS; } @@ -598,8 +598,8 @@ af_err afcl_set_device_context(cl_device_id dev, cl_context ctx) return AF_SUCCESS; } -af_err afcl_pop_device_context(cl_device_id dev, cl_context ctx) +af_err afcl_delete_device_context(cl_device_id dev, cl_context ctx) { - popDeviceContext(dev, ctx); + removeDeviceContext(dev, ctx); return AF_SUCCESS; } diff --git a/src/backend/opencl/platform.hpp b/src/backend/opencl/platform.hpp index 022cd7e52d..154d84bc8e 100644 --- 
a/src/backend/opencl/platform.hpp +++ b/src/backend/opencl/platform.hpp @@ -43,11 +43,11 @@ class DeviceManager friend int setDevice(int device); - friend void pushDeviceContext(cl_device_id dev, cl_context cxt, cl_command_queue que); + friend void addDeviceContext(cl_device_id dev, cl_context cxt, cl_command_queue que); friend void setDeviceContext(cl_device_id dev, cl_context cxt); - friend void popDeviceContext(cl_device_id dev, cl_context ctx); + friend void removeDeviceContext(cl_device_id dev, cl_context ctx); public: static const unsigned MAX_DEVICES = 32; @@ -107,11 +107,11 @@ std::string getPlatformName(const cl::Device &device); int setDevice(int device); -void pushDeviceContext(cl_device_id dev, cl_context cxt, cl_command_queue que); +void addDeviceContext(cl_device_id dev, cl_context cxt, cl_command_queue que); void setDeviceContext(cl_device_id dev, cl_context cxt); -void popDeviceContext(cl_device_id dev, cl_context ctx); +void removeDeviceContext(cl_device_id dev, cl_context ctx); void sync(int device); diff --git a/test/ocl_ext_context.cpp b/test/ocl_ext_context.cpp index 6b9b48086e..0d4f89b3fc 100644 --- a/test/ocl_ext_context.cpp +++ b/test/ocl_ext_context.cpp @@ -61,11 +61,11 @@ TEST(OCLExtContext, push) getExternals(deviceId, context, queue); int dCount = af::getDeviceCount(); - printf("%d devices before afcl::pushDevice\n", dCount); + printf("%d devices before afcl::addDevice\n", dCount); af::info(); - afcl::pushDevice(deviceId, context, queue); + afcl::addDevice(deviceId, context, queue); ASSERT_EQ(true, dCount+1==af::getDeviceCount()); - printf("%d devices after afcl::pushDevice\n", af::getDeviceCount()); + printf("%d devices after afcl::addDevice\n", af::getDeviceCount()); af::info(); } @@ -97,12 +97,12 @@ TEST(OCLExtContext, pop) getExternals(deviceId, context, queue); int dCount = af::getDeviceCount(); - printf("%d devices before afcl::popDevice\n", dCount); + printf("%d devices before afcl::deleteDevice\n", dCount); af::setDevice(0); 
af::info(); - afcl::popDevice(deviceId, context); + afcl::deleteDevice(deviceId, context); ASSERT_EQ(true, dCount-1==af::getDeviceCount()); - printf("%d devices after afcl::popDevice\n", af::getDeviceCount()); + printf("%d devices after afcl::deleteDevice\n", af::getDeviceCount()); af::info(); } #else From 227377d5d891b742ff181fa108b75ad5c5b1fce7 Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 4 Dec 2015 16:12:51 -0500 Subject: [PATCH 052/288] Added OpenCL include dir for unit tests This is required by the ocl_ext_context unit tests source file --- test/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 30907d3390..b1eb5521b3 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -81,7 +81,10 @@ ELSE(USE_SYSTEM_GTEST) INCLUDE("${CMAKE_MODULE_PATH}/build_gtest.cmake") ENDIF(USE_SYSTEM_GTEST) -INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIRS}) +INCLUDE_DIRECTORIES( + ${GTEST_INCLUDE_DIRS} + ${OpenCL_INCLUDE_DIRS} + ) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) FILE(GLOB FILES "*.cpp" "*.c") From 67ef0517d77bf5a367f22e2819f9091f7f2f0b94 Mon Sep 17 00:00:00 2001 From: Pradeep Date: Fri, 4 Dec 2015 16:47:28 -0500 Subject: [PATCH 053/288] additional style changes --- test/CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b1eb5521b3..3ae6ec07bd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -81,10 +81,7 @@ ELSE(USE_SYSTEM_GTEST) INCLUDE("${CMAKE_MODULE_PATH}/build_gtest.cmake") ENDIF(USE_SYSTEM_GTEST) -INCLUDE_DIRECTORIES( - ${GTEST_INCLUDE_DIRS} - ${OpenCL_INCLUDE_DIRS} - ) +INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIRS}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) FILE(GLOB FILES "*.cpp" "*.c") @@ -115,6 +112,7 @@ ENDIF() IF(${BUILD_OPENCL} AND ${OpenCL_FOUND}) MESSAGE(STATUS "TESTS: OPENCL backend is ON") + INCLUDE_DIRECTORIES(${OpenCL_INCLUDE_DIRS}) CREATE_TESTS(opencl opencl 
"${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}") ELSE() MESSAGE(STATUS "TESTS: OPENCL backend is OFF") From 365dc949deee9818a152f8c26ee8eab4baf17a15 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Sat, 5 Dec 2015 15:32:44 -0500 Subject: [PATCH 054/288] DOC resolve markerType enum in graphics --- include/af/graphics.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/af/graphics.h b/include/af/graphics.h index eeb3f09371..600d48a0ea 100644 --- a/include/af/graphics.h +++ b/include/af/graphics.h @@ -408,7 +408,7 @@ AFAPI af_err af_draw_plot(const af_window wind, const af_array X, const af_array \param[in] wind is the window handle \param[in] X is an \ref af_array with the x-axis data points \param[in] Y is an \ref af_array with the y-axis data points - \param[in] marker is an \ref markerType enum specifying which marker to use in the scatter plot + \param[in] marker is an \ref af_marker_type enum specifying which marker to use in the scatter plot \param[in] props is structure \ref af_cell that has the properties that are used for the current rendering. @@ -429,7 +429,7 @@ AFAPI af_err af_draw_scatter(const af_window wind, const af_array X, const af_ar \param[in] wind is the window handle \param[in] P is an \ref af_array or matrix with the xyz-values of the points - \param[in] marker is an \ref markerType enum specifying which marker to use in the scatter plot + \param[in] marker is an \ref af_marker_type enum specifying which marker to use in the scatter plot \param[in] props is structure \ref af_cell that has the properties that are used for the current rendering. 
From b878711223123de44a620019b43926e96a04b479 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Sat, 5 Dec 2015 16:28:38 -0500 Subject: [PATCH 055/288] Remove unused variable warning in homography cuda kernel --- src/backend/cuda/kernel/homography.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/cuda/kernel/homography.hpp b/src/backend/cuda/kernel/homography.hpp index 65d880e59b..90cc3ce46d 100644 --- a/src/backend/cuda/kernel/homography.hpp +++ b/src/backend/cuda/kernel/homography.hpp @@ -64,7 +64,7 @@ __device__ void JacobiSVD(int m, int n) int tid_x = threadIdx.x; int bsz_x = blockDim.x; int tid_y = threadIdx.y; - int gid_y = blockIdx.y * blockDim.y + tid_y; + //int gid_y = blockIdx.y * blockDim.y + tid_y; __shared__ T acc1[256]; __shared__ T acc2[256]; From cad4c2c67c2d34777155a1007f5309cde904a983 Mon Sep 17 00:00:00 2001 From: syurkevi Date: Wed, 9 Dec 2015 02:06:35 -0500 Subject: [PATCH 056/288] initial gravity example --- examples/graphics/gravity_sim.cpp | 126 ++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 examples/graphics/gravity_sim.cpp diff --git a/examples/graphics/gravity_sim.cpp b/examples/graphics/gravity_sim.cpp new file mode 100644 index 0000000000..77f662f4db --- /dev/null +++ b/examples/graphics/gravity_sim.cpp @@ -0,0 +1,126 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include + +using namespace af; +using namespace std; + +static const int width = 512, height = 512; + + +void simulate(af::array &parts, af::array &vels, af::array &forces){ + parts += vels; + + //calculate distance to center + float center_coors[2] = { width / 2, height / 2 }; + af::array col = tile(af::array(1, 2, center_coors), parts.dims(0)); + af::array diff = parts - col; + af::array dist = sqrt( diff.col(0)*diff.col(0) + diff.col(1)*diff.col(1) ); + + forces = -1 * diff; + forces.col(0) /= dist; //normalize force vectors + forces.col(1) /= dist; //normalize force vectors + + //update velocities from forces + vels += forces; + +} + +void collisions(af::array &parts, af::array &vels){ + //clamp particles inside screen border + parts.col(0) = min(width, max(0, parts.col(0))); + parts.col(1) = min(height - 1, max(0, parts.col(1))); + + //calculate distance to center + float center_coors[2] = { width / 2, height / 2 }; + af::array col = tile(af::array(1, 2, center_coors), parts.dims(0)); + af::array diff = parts - col; + af::array dist = sqrt( diff.col(0)*diff.col(0) + diff.col(1)*diff.col(1) ); + + /* + //collide with center sphere + int radius = 50; + af::array col_ids = dist(dist 0) { + //vels(col_ids, span) += -1 * parts(col_ids, span); + vels(col_ids, span) = 0; + } + */ + +} + +int main(int argc, char *argv[]) +{ + try { + const static int total_particles=200; + static const int reset = 500; + + af::info(); + + af::Window myWindow(width, height, "Gravity Simulation using ArrayFire"); + + int frame_count = 0; + + // Initialize the kernel array just once + const af::array draw_kernel = gaussianKernel(3, 3); + + // Generate a random starting state + af::array particles = af::randu(total_particles,2); + particles.col(0) *= width; + particles.col(1) *= height; + + af::array 
velocities = af::randn(total_particles, 2); + af::array forces = af::randn(total_particles, 2); + + af::array image = af::constant(0, width, height); + af::array ids(total_particles, u32); + + while(!myWindow.close()) { + + ids = (particles.col(0).as(u32) * height) + particles.col(1).as(u32); + image(ids) += 255; + image = convolve2(image, draw_kernel); + myWindow.image(image); + image(span, span) = 0; + frame_count++; + + // Generate a random starting state + if(frame_count % reset == 0) { + particles = af::randu(total_particles,2); + particles.col(0) *= width; + particles.col(1) *= height; + + velocities = af::randn(total_particles, 2); + } + + //run force simulation and update particles + simulate(particles, velocities, forces); + + //check for collisions and adjust velocities accordingly + collisions(particles, velocities); + + } + } catch (af::exception& e) { + fprintf(stderr, "%s\n", e.what()); + throw; + } + + #ifdef WIN32 // pause in Windows + if (!(argc == 2 && argv[1][0] == '-')) { + printf("hit [enter]..."); + fflush(stdout); + getchar(); + } + #endif + return 0; +} + From df2c09186386058aef56dc79ee06bd7103bdbae5 Mon Sep 17 00:00:00 2001 From: pradeep Date: Tue, 8 Dec 2015 15:03:38 -0500 Subject: [PATCH 057/288] Converted iir, fir, fftconvolve to async calls Added eval, sync statements to orb, fast to make them work with their asynchronous counter parts. Currently, one test of ORB is failing. Will fix it later. 
--- src/backend/cpu/convolve.cpp | 7 + src/backend/cpu/fast.cpp | 7 +- src/backend/cpu/fftconvolve.cpp | 280 ++++++++++++++++++-------------- src/backend/cpu/iir.cpp | 54 +++--- src/backend/cpu/orb.cpp | 18 +- 5 files changed, 217 insertions(+), 149 deletions(-) diff --git a/src/backend/cpu/convolve.cpp b/src/backend/cpu/convolve.cpp index e7533764c1..239b4f0924 100644 --- a/src/backend/cpu/convolve.cpp +++ b/src/backend/cpu/convolve.cpp @@ -183,6 +183,9 @@ void convolve_nd(T *optr, T const *iptr, accT const *fptr, template Array convolve(Array const& signal, Array const& filter, ConvolveBatchKind kind) { + signal.eval(); + filter.eval(); + auto sDims = signal.dims(); auto fDims = filter.dims(); auto sStrides = signal.strides(); @@ -255,6 +258,10 @@ void convolve2_separable(T *optr, T const *iptr, accT const *fptr, template Array convolve2(Array const& signal, Array const& c_filter, Array const& r_filter) { + signal.eval(); + c_filter.eval(); + r_filter.eval(); + auto sDims = signal.dims(); auto cfDims = c_filter.dims(); auto rfDims = r_filter.dims(); diff --git a/src/backend/cpu/fast.cpp b/src/backend/cpu/fast.cpp index 1c8069c24d..c8b0514610 100644 --- a/src/backend/cpu/fast.cpp +++ b/src/backend/cpu/fast.cpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include using af::dim4; @@ -248,6 +250,9 @@ unsigned fast(Array &x_out, Array &y_out, Array &score_out, const bool nonmax, const float feature_ratio, const unsigned edge) { + in.eval(); + getQueue().sync(); + dim4 in_dims = in.dims(); const unsigned max_feat = ceil(in.elements() * feature_ratio); @@ -257,6 +262,7 @@ unsigned fast(Array &x_out, Array &y_out, Array &score_out, if (nonmax == 1) { dim4 V_dims(in_dims[0], in_dims[1]); V = createValueArray(V_dims, (float)0); + V.eval(); } // Arrays containing all features detected before non-maximal suppression. 
@@ -282,7 +288,6 @@ unsigned fast(Array &x_out, Array &y_out, Array &score_out, Array score_total = createEmptyArray(af::dim4()); if (nonmax == 1) { - x_total = createEmptyArray(feat_found_dims); y_total = createEmptyArray(feat_found_dims); score_total = createEmptyArray(feat_found_dims); diff --git a/src/backend/cpu/fftconvolve.cpp b/src/backend/cpu/fftconvolve.cpp index f76f3a0d3f..6172af86a6 100644 --- a/src/backend/cpu/fftconvolve.cpp +++ b/src/backend/cpu/fftconvolve.cpp @@ -17,14 +17,17 @@ #include #include #include +#include +#include namespace cpu { template -void packData(To* out_ptr, const af::dim4& od, const af::dim4& os, - Array const& in) +void packData(Array out, const af::dim4 od, const af::dim4 os, Array const in) { + To* out_ptr = out.get(); + const af::dim4 id = in.dims(); const af::dim4 is = in.strides(); const Ti* in_ptr = in.get(); @@ -58,9 +61,10 @@ void packData(To* out_ptr, const af::dim4& od, const af::dim4& os, } template -void padArray(To* out_ptr, const af::dim4& od, const af::dim4& os, - Array const& in) +void padArray_(Array out, const af::dim4 od, const af::dim4 os, + Array const in, const dim_t offset) { + To* out_ptr = out.get() + offset; const af::dim4 id = in.dims(); const af::dim4 is = in.strides(); const Ti* in_ptr = in.get(); @@ -89,11 +93,21 @@ void padArray(To* out_ptr, const af::dim4& od, const af::dim4& os, } template -void complexMultiply(T* out_ptr, const af::dim4& od, const af::dim4& os, - T* in1_ptr, const af::dim4& i1d, const af::dim4& i1s, - T* in2_ptr, const af::dim4& i2d, const af::dim4& i2s, - ConvolveBatchKind kind) +void complexMultiply(Array packed, const af::dim4 sig_dims, const af::dim4 sig_strides, + const af::dim4 fit_dims, const af::dim4 fit_strides, + ConvolveBatchKind kind, const dim_t offset) { + T* out_ptr = packed.get() + (kind==CONVOLVE_BATCH_KERNEL? offset : 0); + T* in1_ptr = packed.get(); + T* in2_ptr = packed.get() + offset; + + const dim4& od = (kind==CONVOLVE_BATCH_KERNEL ? 
fit_dims : sig_dims); + const dim4& os = (kind==CONVOLVE_BATCH_KERNEL ? fit_strides : sig_strides); + const dim4& i1d = sig_dims; + const dim4& i2d = fit_dims; + const dim4& i1s = sig_strides; + const dim4& i2s = fit_strides; + for (int d3 = 0; d3 < (int)od[3]; d3++) { for (int d2 = 0; d2 < (int)od[2]; d2++) { for (int d1 = 0; d1 < (int)od[1]; d1++) { @@ -219,6 +233,9 @@ template fftconvolve(Array const& signal, Array const& filter, const bool expand, ConvolveBatchKind kind) { + signal.eval(); + filter.eval(); + const af::dim4 sd = signal.dims(); const af::dim4 fd = filter.dims(); @@ -249,9 +266,6 @@ Array fftconvolve(Array const& signal, Array const& filter, packed_dims[baseDim] = (sbatch + fbatch); Array packed = createEmptyArray(packed_dims); - convT *packed_ptr = packed.get(); - - const af::dim4 packed_strides = packed.strides(); sig_tmp_dims[0] = filter_tmp_dims[0] = packed_dims[0]; sig_tmp_strides[0] = filter_tmp_strides[0] = 1; @@ -270,107 +284,117 @@ Array fftconvolve(Array const& signal, Array const& filter, filter_tmp_strides[k] = filter_tmp_strides[k - 1] * filter_tmp_dims[k - 1]; } - // Calculate memory offsets for packed signal and filter - convT *sig_tmp_ptr = packed_ptr; - convT *filter_tmp_ptr = packed_ptr + sig_tmp_strides[3] * sig_tmp_dims[3]; - // Number of packed complex elements in dimension 0 dim_t sig_half_d0 = divup(sd[0], 2); // Pack signal in a complex matrix where first dimension is half the input // (allows faster FFT computation) and pad array to a power of 2 with 0s - packData(sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, signal); + getQueue().enqueue(packData, packed, sig_tmp_dims, sig_tmp_strides, signal); // Pad filter array with 0s - padArray(filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, filter); - - // Compute forward FFT - if (isDouble) { - fftw_plan plan = fftw_plan_many_dft(baseDim, - fft_dims, - packed_dims[baseDim], - (fftw_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - 
(fftw_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - FFTW_FORWARD, - FFTW_ESTIMATE); - - fftw_execute(plan); - fftw_destroy_plan(plan); - } - else { - fftwf_plan plan = fftwf_plan_many_dft(baseDim, - fft_dims, - packed_dims[baseDim], - (fftwf_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - (fftwf_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - FFTW_FORWARD, - FFTW_ESTIMATE); - - fftwf_execute(plan); - fftwf_destroy_plan(plan); - } + const dim_t offset = sig_tmp_strides[3]*sig_tmp_dims[3]; + getQueue().enqueue(padArray_, packed, filter_tmp_dims, filter_tmp_strides, + filter, offset); + + dim4 fftDims(1, 1, 1, 1); + for (int i=0; i packed, const dim4 fftDims) { + int fft_dims[baseDim]; + for (int i=0; i(filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, - sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, - filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, - kind); - else - complexMultiply(sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, - sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, - filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, - kind); - - // Compute inverse FFT - if (isDouble) { - fftw_plan plan = fftw_plan_many_dft(baseDim, - fft_dims, - packed_dims[baseDim], - (fftw_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - (fftw_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - FFTW_BACKWARD, - FFTW_ESTIMATE); - - fftw_execute(plan); - fftw_destroy_plan(plan); - } - else { - fftwf_plan plan = fftwf_plan_many_dft(baseDim, - fft_dims, - packed_dims[baseDim], - (fftwf_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - (fftwf_complex*)packed.get(), - NULL, - packed_strides[0], - packed_strides[baseDim] / 2, - FFTW_BACKWARD, - FFTW_ESTIMATE); - - fftwf_execute(plan); - fftwf_destroy_plan(plan); - } + getQueue().enqueue(complexMultiply, packed, + sig_tmp_dims, 
sig_tmp_strides, + filter_tmp_dims, filter_tmp_strides, + kind, offset); + + auto upstream_idft = [=] (Array packed, const dim4 fftDims) { + int fft_dims[baseDim]; + for (int i=0; i fftconvolve(Array const& signal, Array const& filter, } Array out = createEmptyArray(oDims); - T* out_ptr = out.get(); - const af::dim4 out_dims = out.dims(); - const af::dim4 out_strides = out.strides(); - - const af::dim4 filter_dims = filter.dims(); - - // Reorder the output - if (kind == CONVOLVE_BATCH_KERNEL) { - reorderOutput - (out_ptr, out_dims, out_strides, - filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, - filter_dims, sig_half_d0, baseDim, fftScale, expand); - } - else { - reorderOutput - (out_ptr, out_dims, out_strides, - sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, - filter_dims, sig_half_d0, baseDim, fftScale, expand); - } + + auto reorderFunc = [=] (Array out, Array packed, + const Array filter, const dim_t sig_hald_d0, const dim_t fftScale, + const dim4 sig_tmp_dims, const dim4 sig_tmp_strides, + const dim4 filter_tmp_dims, const dim4 filter_tmp_strides) { + T* out_ptr = out.get(); + const af::dim4 out_dims = out.dims(); + const af::dim4 out_strides = out.strides(); + + const af::dim4 filter_dims = filter.dims(); + + convT* packed_ptr = packed.get(); + convT* sig_tmp_ptr = packed_ptr; + convT* filter_tmp_ptr = packed_ptr + sig_tmp_strides[3] * sig_tmp_dims[3]; + + // Reorder the output + if (kind == CONVOLVE_BATCH_KERNEL) { + reorderOutput + (out_ptr, out_dims, out_strides, + filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, + filter_dims, sig_half_d0, baseDim, fftScale, expand); + } else { + reorderOutput + (out_ptr, out_dims, out_strides, + sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, + filter_dims, sig_half_d0, baseDim, fftScale, expand); + } + }; + getQueue().enqueue(reorderFunc, out, packed, filter, sig_half_d0, fftScale, + sig_tmp_dims, sig_tmp_strides, + filter_tmp_dims, filter_tmp_strides); return out; } diff --git a/src/backend/cpu/iir.cpp 
b/src/backend/cpu/iir.cpp index 615da2238d..3c06275f5a 100644 --- a/src/backend/cpu/iir.cpp +++ b/src/backend/cpu/iir.cpp @@ -16,32 +16,37 @@ #include #include #include +#include +#include using af::dim4; namespace cpu { - template - Array iir(const Array &b, const Array &a, const Array &x) - { - T h_a0 = a.get()[0]; - Array a0 = createValueArray(b.dims(), h_a0); - - ConvolveBatchKind type = x.ndims() == 1 ? CONVOLVE_BATCH_NONE : CONVOLVE_BATCH_SAME; - if (x.ndims() != b.ndims()) { - type = (x.ndims() < b.ndims()) ? CONVOLVE_BATCH_KERNEL : CONVOLVE_BATCH_SIGNAL; - } - // Extract the first N elements - Array c = convolve(x, b, type); - dim4 cdims = c.dims(); - cdims[0] = x.dims()[0]; - c.resetDims(cdims); +template +Array iir(const Array &b, const Array &a, const Array &x) +{ + b.eval(); + a.eval(); + x.eval(); - int num_a = a.dims()[0]; + ConvolveBatchKind type = x.ndims() == 1 ? CONVOLVE_BATCH_NONE : CONVOLVE_BATCH_SAME; + if (x.ndims() != b.ndims()) { + type = (x.ndims() < b.ndims()) ? 
CONVOLVE_BATCH_KERNEL : CONVOLVE_BATCH_SIGNAL; + } + + // Extract the first N elements + Array c = convolve(x, b, type); + dim4 cdims = c.dims(); + cdims[0] = x.dims()[0]; + c.resetDims(cdims); + Array y = createEmptyArray(c.dims()); + + auto func = [=] (Array y, Array c, const Array a) { dim4 ydims = c.dims(); - Array y = createEmptyArray(ydims); + int num_a = a.dims()[0]; for (int l = 0; l < (int)ydims[3]; l++) { dim_t yidx3 = l * y.strides()[3]; @@ -76,17 +81,20 @@ namespace cpu } } } + }; + getQueue().enqueue(func, y, c, a); - return y; - } + return y; +} #define INSTANTIATE(T) \ template Array iir(const Array &b, \ const Array &a, \ const Array &x); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) + } diff --git a/src/backend/cpu/orb.cpp b/src/backend/cpu/orb.cpp index d279ba514f..4b6629cb3f 100644 --- a/src/backend/cpu/orb.cpp +++ b/src/backend/cpu/orb.cpp @@ -19,6 +19,8 @@ #include #include #include +#include +#include using af::dim4; @@ -542,6 +544,8 @@ unsigned orb(Array &x, Array &y, const float scl_fctr, const unsigned levels, const bool blur_img) { + image.eval(); + getQueue().sync(); unsigned patch_size = REF_PAT_SIZE; @@ -607,6 +611,8 @@ unsigned orb(Array &x, Array &y, ldims[1] = round(idims[1] / lvl_scl); lvl_img = resize(prev_img, ldims[0], ldims[1], AF_INTERP_BILINEAR); + lvl_img.eval(); + getQueue().sync(); prev_img = lvl_img; prev_ldims = lvl_img.dims(); @@ -627,7 +633,10 @@ unsigned orb(Array &x, Array &y, unsigned lvl_feat = fast(x_feat, y_feat, score_feat, lvl_img, fast_thr, 9, 1, 0.15f, edge); - + x_feat.eval(); + y_feat.eval(); + score_feat.eval(); + getQueue().sync(); if (lvl_feat == 0) { continue; @@ -653,7 +662,6 @@ unsigned orb(Array &x, Array &y, memFree(h_x_harris); memFree(h_y_harris); memFree(h_score_harris); - continue; } @@ -664,13 +672,15 @@ unsigned orb(Array &x, Array &y, Array harris_idx = 
createEmptyArray(af::dim4()); sort_index(harris_sorted, harris_idx, score_harris, 0); + harris_sorted.eval(); + harris_idx.eval(); + getQueue().sync(); usable_feat = std::min(usable_feat, lvl_best[i]); if (usable_feat == 0) { memFree(h_x_harris); memFree(h_y_harris); - continue; } @@ -706,6 +716,8 @@ unsigned orb(Array &x, Array &y, // Filter level image with Gaussian kernel to reduce noise sensitivity lvl_filt = convolve2(lvl_img, gauss_filter, gauss_filter); } + lvl_filt.eval(); + getQueue().sync(); // Compute ORB descriptors unsigned* h_desc_lvl = memAlloc(usable_feat * 8); From c06f24d585f67989629a0f7eff7c845334981531 Mon Sep 17 00:00:00 2001 From: pradeep Date: Wed, 9 Dec 2015 17:02:07 -0500 Subject: [PATCH 058/288] Change to gfx to handle Arrays created by async calls --- src/backend/cpu/hist_graphics.cpp | 4 +++ src/backend/cpu/image.cpp | 58 +++++++++++++++++-------------- src/backend/cpu/plot.cpp | 53 +++++++++++++++------------- src/backend/cpu/plot3.cpp | 53 +++++++++++++++------------- src/backend/cpu/surface.cpp | 53 +++++++++++++++------------- 5 files changed, 119 insertions(+), 102 deletions(-) diff --git a/src/backend/cpu/hist_graphics.cpp b/src/backend/cpu/hist_graphics.cpp index 21d3fdf941..56f7646b61 100644 --- a/src/backend/cpu/hist_graphics.cpp +++ b/src/backend/cpu/hist_graphics.cpp @@ -11,6 +11,8 @@ #include #include +#include +#include namespace cpu { @@ -18,6 +20,8 @@ namespace cpu template void copy_histogram(const Array &data, const fg::Histogram* hist) { + data.eval(); + getQueue().sync(); CheckGL("Begin copy_histogram"); glBindBuffer(GL_ARRAY_BUFFER, hist->vbo()); diff --git a/src/backend/cpu/image.cpp b/src/backend/cpu/image.cpp index 947afa2351..767f9d42f1 100644 --- a/src/backend/cpu/image.cpp +++ b/src/backend/cpu/image.cpp @@ -15,39 +15,43 @@ #include #include #include -#include -#include #include +#include +#include using af::dim4; namespace cpu { - template - void copy_image(const Array &in, const fg::Image* image) - { - 
CheckGL("Before CopyArrayToPBO"); - const T *d_X = in.get(); - size_t data_size = image->size(); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, image->pbo()); - glBufferSubData(GL_PIXEL_UNPACK_BUFFER, 0, data_size, d_X); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - CheckGL("In CopyArrayToPBO"); - } - - #define INSTANTIATE(T) \ - template void copy_image(const Array &in, const fg::Image* image); - - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(ushort) - INSTANTIATE(short) + +template +void copy_image(const Array &in, const fg::Image* image) +{ + in.eval(); + getQueue().sync(); + CheckGL("Before CopyArrayToPBO"); + const T *d_X = in.get(); + size_t data_size = image->size(); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, image->pbo()); + glBufferSubData(GL_PIXEL_UNPACK_BUFFER, 0, data_size, d_X); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + CheckGL("In CopyArrayToPBO"); +} + +#define INSTANTIATE(T) \ + template void copy_image(const Array &in, const fg::Image* image); + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(ushort) +INSTANTIATE(short) + } #endif // WITH_GRAPHICS diff --git a/src/backend/cpu/plot.cpp b/src/backend/cpu/plot.cpp index 9de1993f2d..9cc7d9d2b9 100644 --- a/src/backend/cpu/plot.cpp +++ b/src/backend/cpu/plot.cpp @@ -12,37 +12,40 @@ #include #include #include -#include #include -#include -#include +#include +#include using af::dim4; namespace cpu { - template - void copy_plot(const Array &P, fg::Plot* plot) - { - CheckGL("Before CopyArrayToVBO"); - - glBindBuffer(GL_ARRAY_BUFFER, plot->vbo()); - glBufferSubData(GL_ARRAY_BUFFER, 0, plot->size(), P.get()); - glBindBuffer(GL_ARRAY_BUFFER, 0); - - CheckGL("In CopyArrayToVBO"); - } - - #define INSTANTIATE(T) \ - template void copy_plot(const Array &P, fg::Plot* plot); - - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - 
INSTANTIATE(uint) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) + +template +void copy_plot(const Array &P, fg::Plot* plot) +{ + P.eval(); + getQueue().sync(); + CheckGL("Before CopyArrayToVBO"); + + glBindBuffer(GL_ARRAY_BUFFER, plot->vbo()); + glBufferSubData(GL_ARRAY_BUFFER, 0, plot->size(), P.get()); + glBindBuffer(GL_ARRAY_BUFFER, 0); + + CheckGL("In CopyArrayToVBO"); +} + +#define INSTANTIATE(T) \ + template void copy_plot(const Array &P, fg::Plot* plot); + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) + } #endif // WITH_GRAPHICS diff --git a/src/backend/cpu/plot3.cpp b/src/backend/cpu/plot3.cpp index c0e26aaa34..35a7b2500d 100644 --- a/src/backend/cpu/plot3.cpp +++ b/src/backend/cpu/plot3.cpp @@ -12,37 +12,40 @@ #include #include #include -#include #include -#include -#include +#include +#include using af::dim4; namespace cpu { - template - void copy_plot3(const Array &P, fg::Plot3* plot3) - { - CheckGL("Before CopyArrayToVBO"); - - glBindBuffer(GL_ARRAY_BUFFER, plot3->vbo()); - glBufferSubData(GL_ARRAY_BUFFER, 0, plot3->size(), P.get()); - glBindBuffer(GL_ARRAY_BUFFER, 0); - - CheckGL("In CopyArrayToVBO"); - } - - #define INSTANTIATE(T) \ - template void copy_plot3(const Array &P, fg::Plot3* plot3); - - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) + +template +void copy_plot3(const Array &P, fg::Plot3* plot3) +{ + P.eval(); + getQueue().sync(); + CheckGL("Before CopyArrayToVBO"); + + glBindBuffer(GL_ARRAY_BUFFER, plot3->vbo()); + glBufferSubData(GL_ARRAY_BUFFER, 0, plot3->size(), P.get()); + glBindBuffer(GL_ARRAY_BUFFER, 0); + + CheckGL("In CopyArrayToVBO"); +} + +#define INSTANTIATE(T) \ + template void copy_plot3(const Array &P, fg::Plot3* plot3); + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(uchar) 
+INSTANTIATE(short) +INSTANTIATE(ushort) + } #endif // WITH_GRAPHICS diff --git a/src/backend/cpu/surface.cpp b/src/backend/cpu/surface.cpp index 39f375a6fe..116c784d89 100644 --- a/src/backend/cpu/surface.cpp +++ b/src/backend/cpu/surface.cpp @@ -12,37 +12,40 @@ #include #include #include -#include #include -#include -#include +#include +#include using af::dim4; namespace cpu { - template - void copy_surface(const Array &P, fg::Surface* surface) - { - CheckGL("Before CopyArrayToVBO"); - - glBindBuffer(GL_ARRAY_BUFFER, surface->vbo()); - glBufferSubData(GL_ARRAY_BUFFER, 0, surface->size(), P.get()); - glBindBuffer(GL_ARRAY_BUFFER, 0); - - CheckGL("In CopyArrayToVBO"); - } - - #define INSTANTIATE(T) \ - template void copy_surface(const Array &P, fg::Surface* surface); - - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) + +template +void copy_surface(const Array &P, fg::Surface* surface) +{ + P.eval(); + getQueue().sync(); + CheckGL("Before CopyArrayToVBO"); + + glBindBuffer(GL_ARRAY_BUFFER, surface->vbo()); + glBufferSubData(GL_ARRAY_BUFFER, 0, surface->size(), P.get()); + glBindBuffer(GL_ARRAY_BUFFER, 0); + + CheckGL("In CopyArrayToVBO"); +} + +#define INSTANTIATE(T) \ + template void copy_surface(const Array &P, fg::Surface* surface); + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) + } #endif // WITH_GRAPHICS From 21f74eb706752c10901b3988cb709c865904cb72 Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 10 Dec 2015 13:47:01 -0500 Subject: [PATCH 059/288] Fixed harris & homography cpu fns to work with async fns --- src/backend/cpu/harris.cpp | 143 +++++++++++++++++---------------- src/backend/cpu/homography.cpp | 32 +++++--- 2 files changed, 91 insertions(+), 84 deletions(-) diff --git a/src/backend/cpu/harris.cpp b/src/backend/cpu/harris.cpp index d16c56a8b2..b57b94025d 100644 
--- a/src/backend/cpu/harris.cpp +++ b/src/backend/cpu/harris.cpp @@ -19,6 +19,8 @@ #include #include #include +#include +#include using af::dim4; @@ -44,14 +46,14 @@ void gaussian1D(T* out, const int dim, double sigma=0.0) } template -void second_order_deriv( - T* ixx_out, - T* ixy_out, - T* iyy_out, - const unsigned in_len, - const T* ix_in, - const T* iy_in) +void second_order_deriv(Array ixx, Array ixy, Array iyy, + const unsigned in_len, const Array ix, const Array iy) { + T* ixx_out = ixx.get(); + T* ixy_out = ixy.get(); + T* iyy_out = iyy.get(); + const T* ix_in = ix.get(); + const T* iy_in = iy.get(); for (unsigned x = 0; x < in_len; x++) { ixx_out[x] = ix_in[x] * ix_in[x]; ixy_out[x] = ix_in[x] * iy_in[x]; @@ -60,16 +62,14 @@ void second_order_deriv( } template -void harris_responses( - T* resp_out, - const unsigned idim0, - const unsigned idim1, - const T* ixx_in, - const T* ixy_in, - const T* iyy_in, - const float k_thr, - const unsigned border_len) +void harris_responses(Array resp, const unsigned idim0, const unsigned idim1, + const Array ixx, const Array ixy, const Array iyy, + const float k_thr, const unsigned border_len) { + T* resp_out = resp.get(); + const T* ixx_in = ixx.get(); + const T* ixy_in = ixy.get(); + const T* iyy_in = iyy.get(); const unsigned r = border_len; for (unsigned x = r; x < idim1 - r; x++) { @@ -87,18 +87,14 @@ void harris_responses( } template -void non_maximal( - float* x_out, - float* y_out, - float* resp_out, - unsigned* count, - const unsigned idim0, - const unsigned idim1, - const T* resp_in, - const float min_resp, - const unsigned border_len, - const unsigned max_corners) +void non_maximal(Array xOut, Array yOut, Array respOut, unsigned* count, + const unsigned idim0, const unsigned idim1, const Array respIn, + const float min_resp, const unsigned border_len, const unsigned max_corners) { + float* x_out = xOut.get(); + float* y_out = yOut.get(); + float* resp_out = respOut.get(); + const T* resp_in = respIn.get(); // 
Responses on the border don't have 8-neighbors to compare, discard them const unsigned r = border_len + 1; @@ -131,10 +127,19 @@ void non_maximal( } } -static void keep_corners(float* x_out, float* y_out, float* resp_out, - const float* x_in, const float* y_in, const float* resp_in, - const unsigned* resp_idx, const unsigned n_corners) +static void keep_corners(Array xOut, Array yOut, Array respOut, + const Array xIn, const Array yIn, + const Array respIn, const Array respIdx, + const unsigned n_corners) { + float* x_out = xOut.get(); + float* y_out = yOut.get(); + float* resp_out = respOut.get(); + const float* x_in = xIn.get(); + const float* y_in = yIn.get(); + const float* resp_in = respIn.get(); + const uint* resp_idx = respIdx.get(); + // Keep only the first n_feat features for (unsigned f = 0; f < n_corners; f++) { x_out[f] = x_in[resp_idx[f]]; @@ -148,6 +153,8 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out const Array &in, const unsigned max_corners, const float min_response, const float sigma, const unsigned filter_len, const float k_thr) { + in.eval(); + dim4 idims = in.dims(); // Window filter @@ -156,8 +163,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out if (sigma < 0.5f) { for (unsigned i = 0; i < filter_len; i++) h_filter[i] = (T)1.f / (filter_len); - } - else { + } else { gaussian1D(h_filter, (int)filter_len, sigma); } Array filter = createDeviceDataArray(dim4(filter_len), (const void*)h_filter); @@ -168,15 +174,14 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out Array iy = createEmptyArray(idims); // Compute first order derivatives - gradient(iy, ix, in); + getQueue().enqueue(gradient, iy, ix, in); Array ixx = createEmptyArray(idims); Array ixy = createEmptyArray(idims); Array iyy = createEmptyArray(idims); // Compute second-order derivatives - second_order_deriv(ixx.get(), ixy.get(), iyy.get(), - in.elements(), ix.get(), iy.get()); + getQueue().enqueue(second_order_deriv, ixx, ixy, iyy, in.elements(), 
ix, iy); // Convolve second-order derivatives with proper window filter ixx = convolve2(ixx, filter, filter); @@ -185,26 +190,22 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out const unsigned corner_lim = in.elements() * 0.2f; - float* x_corners = memAlloc(corner_lim); - float* y_corners = memAlloc(corner_lim); - float* resp_corners = memAlloc(corner_lim); + Array responses = createEmptyArray(dim4(in.elements())); - T* resp = memAlloc(in.elements()); + getQueue().enqueue(harris_responses, responses, idims[0], idims[1], + ixx, ixy, iyy, k_thr, border_len); - // Calculate Harris responses for all pixels - harris_responses(resp, - idims[0], idims[1], - ixx.get(), ixy.get(), iyy.get(), - k_thr, border_len); + Array xCorners = createEmptyArray(dim4(corner_lim)); + Array yCorners = createEmptyArray(dim4(corner_lim)); + Array respCorners = createEmptyArray(dim4(corner_lim)); const unsigned min_r = (max_corners > 0) ? 0.f : min_response; - unsigned corners_found = 0; // Performs non-maximal suppression - non_maximal(x_corners, y_corners, resp_corners, &corners_found, - idims[0], idims[1], resp, min_r, border_len, corner_lim); - - memFree(resp); + getQueue().sync(); + unsigned corners_found = 0; + non_maximal(xCorners, yCorners, respCorners, &corners_found, + idims[0], idims[1], responses, min_r, border_len, corner_lim); const unsigned corners_out = (max_corners > 0) ? 
min(corners_found, max_corners) : @@ -213,42 +214,42 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out return 0; if (max_corners > 0 && corners_found > corners_out) { - Array harris_responses = createDeviceDataArray(dim4(corners_found), (void*)resp_corners); + respCorners.resetDims(dim4(corners_found)); Array harris_sorted = createEmptyArray(dim4(corners_found)); Array harris_idx = createEmptyArray(dim4(corners_found)); // Sort Harris responses - sort_index(harris_sorted, harris_idx, harris_responses, 0); + sort_index(harris_sorted, harris_idx, respCorners, 0); x_out = createEmptyArray(dim4(corners_out)); y_out = createEmptyArray(dim4(corners_out)); resp_out = createEmptyArray(dim4(corners_out)); // Keep only the corners with higher Harris responses - keep_corners(x_out.get(), y_out.get(), resp_out.get(), - x_corners, y_corners, harris_sorted.get(), harris_idx.get(), - corners_out); - - memFree(x_corners); - memFree(y_corners); - } - else if (max_corners == 0 && corners_found < corner_lim) { + getQueue().enqueue(keep_corners, x_out, y_out, resp_out, xCorners, yCorners, + harris_sorted, harris_idx, corners_out); + } else if (max_corners == 0 && corners_found < corner_lim) { x_out = createEmptyArray(dim4(corners_out)); y_out = createEmptyArray(dim4(corners_out)); resp_out = createEmptyArray(dim4(corners_out)); - memcpy(x_out.get(), x_corners, corners_out * sizeof(float)); - memcpy(y_out.get(), y_corners, corners_out * sizeof(float)); - memcpy(resp_out.get(), resp_corners, corners_out * sizeof(float)); - - memFree(x_corners); - memFree(y_corners); - memFree(resp_corners); - } - else { - x_out = createDeviceDataArray(dim4(corners_out), (void*)x_corners); - y_out = createDeviceDataArray(dim4(corners_out), (void*)y_corners); - resp_out = createDeviceDataArray(dim4(corners_out), (void*)resp_corners); + auto copyFunc = [=](Array x_out, Array y_out, + Array outResponses, const Array x_crnrs, + const Array y_crnrs, const Array inResponses, + const unsigned 
corners_out) { + memcpy(x_out.get(), x_crnrs.get(), corners_out * sizeof(float)); + memcpy(y_out.get(), y_crnrs.get(), corners_out * sizeof(float)); + memcpy(outResponses.get(), inResponses.get(), corners_out * sizeof(float)); + }; + getQueue().enqueue(copyFunc, x_out, y_out, resp_out, + xCorners, yCorners, respCorners, corners_out); + } else { + x_out = xCorners; + y_out = yCorners; + resp_out = respCorners; + x_out.resetDims(dim4(corners_out)); + y_out.resetDims(dim4(corners_out)); + resp_out.resetDims(dim4(corners_out)); } return corners_out; diff --git a/src/backend/cpu/homography.cpp b/src/backend/cpu/homography.cpp index d20f0ca00c..d936e21b4c 100644 --- a/src/backend/cpu/homography.cpp +++ b/src/backend/cpu/homography.cpp @@ -15,13 +15,11 @@ #include #include #include -#include #include -#include -#include #include - #include +#include +#include using af::dim4; @@ -154,12 +152,9 @@ unsigned updateIterations(float inlier_ratio, unsigned iter) } template -int computeHomography(T* H_ptr, - const float* rnd_ptr, - const float* x_src_ptr, - const float* y_src_ptr, - const float* x_dst_ptr, - const float* y_dst_ptr) +int computeHomography(T* H_ptr, const float* rnd_ptr, + const float* x_src_ptr, const float* y_src_ptr, + const float* x_dst_ptr, const float* y_dst_ptr) { if ((unsigned)rnd_ptr[0] == (unsigned)rnd_ptr[1] || (unsigned)rnd_ptr[0] == (unsigned)rnd_ptr[2] || (unsigned)rnd_ptr[0] == (unsigned)rnd_ptr[3] || (unsigned)rnd_ptr[1] == (unsigned)rnd_ptr[2] || @@ -192,6 +187,8 @@ int computeHomography(T* H_ptr, float dst_scale = sqrt(2.0f) / sqrt(dst_var); Array A = createValueArray(af::dim4(9, 9), (T)0); + A.eval(); + getQueue().sync(); af::dim4 Adims = A.dims(); T* A_ptr = A.get(); @@ -217,6 +214,8 @@ int computeHomography(T* H_ptr, } Array V = createValueArray(af::dim4(Adims[1], Adims[1]), (T)0); + V.eval(); + getQueue().sync(); JacobiSVD(A.get(), V.get(), 9, 9); af::dim4 Vdims = V.dims(); @@ -262,6 +261,8 @@ int findBestHomography(Array &bestH, const float* 
y_dst_ptr = y_dst.get(); Array H = createValueArray(af::dim4(9, iterations), (T)0); + H.eval(); + getQueue().sync(); const af::dim4 rdims = rnd.dims(); const af::dim4 Hdims = H.dims(); @@ -278,8 +279,7 @@ int findBestHomography(Array &bestH, const unsigned ridx = rdims[0] * i; const float* rnd_ptr = rnd.get() + ridx; - if (computeHomography(H_ptr, rnd_ptr, x_src_ptr, y_src_ptr, - x_dst_ptr, y_dst_ptr)) + if (computeHomography(H_ptr, rnd_ptr, x_src_ptr, y_src_ptr, x_dst_ptr, y_dst_ptr)) continue; if (htype == AF_HOMOGRAPHY_RANSAC) { @@ -320,7 +320,6 @@ int findBestHomography(Array &bestH, minMedian = median; bestIdx = i; } - } } @@ -355,6 +354,11 @@ int homography(Array &bestH, const float inlier_thr, const unsigned iterations) { + x_src.eval(); + y_src.eval(); + x_dst.eval(); + y_dst.eval(); + const af::dim4 idims = x_src.dims(); const unsigned nsamples = idims[0]; @@ -366,6 +370,8 @@ int homography(Array &bestH, Array frnd = randu(rdims); Array fctr = createValueArray(rdims, (float)nsamples); Array rnd = arithOp(frnd, fctr, rdims); + rnd.eval(); + getQueue().sync(); return findBestHomography(bestH, x_src, y_src, x_dst, y_dst, rnd, iter, nsamples, inlier_thr, htype); } From 2217014ba231b8e7ceed9ed3072d2104e3ffb243 Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 11 Dec 2015 14:59:57 -0500 Subject: [PATCH 060/288] Fix in Array::device method When lambda functions are enqueued in cpu backend, the pointer that is shared by Array objects has > 1 reference count making it seem like it is referenced Array inside ::device member function. This is now fixed by syncing the operations before fetching the device pointer. 
--- src/api/c/assign.cpp | 2 +- src/backend/cpu/Array.hpp | 3 +++ src/backend/cpu/platform.hpp | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/api/c/assign.cpp b/src/api/c/assign.cpp index 13fa179da8..b8fcb12234 100644 --- a/src/api/c/assign.cpp +++ b/src/api/c/assign.cpp @@ -39,7 +39,7 @@ void assign(Array &out, const unsigned &ndims, const af_seq *index, const DIM_ASSERT(0, (outDs.ndims()>=iDims.ndims())); DIM_ASSERT(0, (outDs.ndims()>=(dim_t)ndims)); - evalArray(out); + out.eval(); vector index_(index, index+ndims); diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp index 471a6741ea..2b9cbb4fed 100644 --- a/src/backend/cpu/Array.hpp +++ b/src/backend/cpu/Array.hpp @@ -20,6 +20,8 @@ #include #include #include +#include +#include namespace cpu { @@ -162,6 +164,7 @@ namespace cpu T* device() { + getQueue().sync(); if (!isOwner() || data.use_count() > 1) { *this = Array(dims(), get(), true, true); } diff --git a/src/backend/cpu/platform.hpp b/src/backend/cpu/platform.hpp index 9abf0755d0..10575520b5 100644 --- a/src/backend/cpu/platform.hpp +++ b/src/backend/cpu/platform.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once + #include class async_queue; From 919333eb379776267d1728b0747172112a484190 Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 11 Dec 2015 17:36:18 -0500 Subject: [PATCH 061/288] Fix for getDeviceMemInfo function in cpu This change is needed after converting the functions to be asynchronous --- src/backend/cpu/memory.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index ac10643c9b..73120b9171 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include namespace cpu { @@ -205,6 +207,7 @@ namespace cpu void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t
*lock_buffers) { + getQueue().sync(); if (alloc_bytes ) *alloc_bytes = total_bytes; if (alloc_buffers ) *alloc_buffers = memory_map.size(); if (lock_bytes ) *lock_bytes = used_bytes; From 6b9c157bc3e8124b45146270789d48d13e0bba20 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 14 Dec 2015 13:27:23 -0500 Subject: [PATCH 062/288] FEAT added allocHost and freeHost functions * Added corresponding C/unified API * Documentation --- docs/details/device.dox | 43 ++++++++++++++++++++++-- include/af/device.h | 68 ++++++++++++++++++++++++++++++++------ src/api/c/device.cpp | 18 ++++++++++ src/api/cpp/device.cpp | 37 +++++++++++++++------ src/api/unified/device.cpp | 10 ++++++ 5 files changed, 152 insertions(+), 24 deletions(-) diff --git a/docs/details/device.dox b/docs/details/device.dox index 230199d583..c89d2a17f0 100644 --- a/docs/details/device.dox +++ b/docs/details/device.dox @@ -62,6 +62,16 @@ allocation =============================================================================== +\defgroup device_func_free free +\ingroup device_mat + +\brief Free device memory allocated by ArrayFire's memory manager + +These calls free the device memory. These functions need to be called on +pointers allocated using alloc function. + +=============================================================================== + \defgroup device_func_pinned pinned \ingroup device_mat @@ -73,12 +83,39 @@ a limited resource. =============================================================================== -\defgroup device_func_free free +\defgroup device_func_free_pinned freePinned \ingroup device_mat -\brief Free device memory allocated by ArrayFire's memory manager +\brief Free pinned memory allocated by ArrayFire's memory manager + +These calls free the pinned memory on host. These functions need to be called on +pointers allocated using pinned function. 
+ +=============================================================================== + +\defgroup device_func_alloc_host allocHost +\ingroup device_mat + +\brief Allocate memory on host + +This function is used for allocating regular memory on host. This is useful +where the compiler version of ArrayFire library is different from the +executable's compiler version. + +It does not use ArrayFire's memory manager. + +=============================================================================== + +\defgroup device_func_free_host freeHost +\ingroup device_mat + +\brief Free memory allocated on host internally by ArrayFire + +This function is used for freeing memory on host that was allocated within +ArrayFire. This is useful where the compiler version of ArrayFire library is +different from the executable's compiler version. -These calls free the device or pinned memory. These functions need to be called +It does not use ArrayFire's memory manager. =============================================================================== diff --git a/include/af/device.h b/include/af/device.h index 826863e6d8..03800c3ffd 100644 --- a/include/af/device.h +++ b/include/af/device.h @@ -101,6 +101,12 @@ namespace af T* alloc(const size_t elements); /// @} + /// \ingroup device_func_free + /// + /// \copydoc device_func_free + /// \param[in] ptr the memory to free + AFAPI void free(const void *ptr); + /// \ingroup device_func_pinned /// @{ /// @@ -119,15 +125,45 @@ namespace af T* pinned(const size_t elements); /// @} - /// \ingroup device_func_free - /// @{ - /// \copydoc device_func_free + /// \ingroup device_func_free_pinned + /// + /// \copydoc device_func_free_pinned /// \param[in] ptr the memory to free - AFAPI void free(const void *ptr); - - /// \copydoc free() AFAPI void freePinned(const void *ptr); - ///@} + + /// \brief Allocate memory on host + /// + /// \copydoc device_func_alloc_host + /// + /// \param[in] elements the number of elements to allocate + /// \param[in] type is 
the type of the elements to allocate + /// \returns the pointer to the memory + /// + /// \ingroup device_func_alloc_host + AFAPI void *allocHost(const size_t elements, const dtype type); + + /// \brief Allocate memory on host + /// + /// \copydoc device_func_alloc_host + /// + /// \param[in] elements the number of elements to allocate + /// \returns the pointer to the memory + /// + /// \note the size of the memory allocated is the number of \p elements * + /// sizeof(type) + /// + /// \ingroup device_func_alloc_host + template + AFAPI T* allocHost(const size_t elements); + + /// \brief Free memory allocated internally by ArrayFire + // + /// \copydoc device_func_free_host + /// + /// \param[in] ptr the memory to free + /// + /// \ingroup device_func_free_host + AFAPI void freeHost(const void *ptr); /// \ingroup device_func_mem /// @{ @@ -207,20 +243,30 @@ extern "C" { AFAPI af_err af_alloc_device(void **ptr, const dim_t bytes); /** - \ingroup device_func_pinned + \ingroup device_func_free */ - AFAPI af_err af_alloc_pinned(void **ptr, const dim_t bytes); + AFAPI af_err af_free_device(void *ptr); /** - \ingroup device_func_free + \ingroup device_func_pinned */ - AFAPI af_err af_free_device(void *ptr); + AFAPI af_err af_alloc_pinned(void **ptr, const dim_t bytes); /** \ingroup device_func_free_pinned */ AFAPI af_err af_free_pinned(void *ptr); + /** + \ingroup device_func_alloc_host + */ + AFAPI af_err af_alloc_host(void **ptr, const dim_t bytes); + + /** + \ingroup device_func_free_host + */ + AFAPI af_err af_free_host(void *ptr); + /** Create array from device memory \ingroup construct_mat diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index 28b4cc2c49..39ad217939 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -297,6 +297,24 @@ af_err af_free_pinned(void *ptr) return AF_SUCCESS; } +af_err af_alloc_host(void **ptr, const dim_t bytes) +{ + try { + AF_CHECK(af_init()); + *ptr = malloc(bytes); + } CATCHALL; + return AF_SUCCESS; +} + +af_err 
af_free_host(void *ptr) +{ + try { + AF_CHECK(af_init()); + free(ptr); + } CATCHALL; + return AF_SUCCESS; +} + af_err af_device_gc() { try { diff --git a/src/api/cpp/device.cpp b/src/api/cpp/device.cpp index bec0a60d59..622809ed06 100644 --- a/src/api/cpp/device.cpp +++ b/src/api/cpp/device.cpp @@ -140,6 +140,18 @@ namespace af AF_THROW(af_free_pinned((void *)ptr)); } + void *allocHost(const size_t elements, const af::dtype type) + { + void *ptr; + AF_THROW(af_alloc_host(&ptr, elements * size_of(type))); + return ptr; + } + + void freeHost(const void *ptr) + { + AF_THROW(af_free_host((void *)ptr)); + } + void deviceGC() { AF_THROW(af_device_gc()); @@ -164,16 +176,21 @@ namespace af return size_bytes; } -#define INSTANTIATE(T) \ - template<> AFAPI \ - T* alloc(const size_t elements) \ - { \ - return (T*)alloc(elements, (af::dtype)dtype_traits::af_type); \ - } \ - template<> AFAPI \ - T* pinned(const size_t elements) \ - { \ - return (T*)pinned(elements, (af::dtype)dtype_traits::af_type); \ +#define INSTANTIATE(T) \ + template<> AFAPI \ + T* alloc(const size_t elements) \ + { \ + return (T*)alloc(elements, (af::dtype)dtype_traits::af_type); \ + } \ + template<> AFAPI \ + T* pinned(const size_t elements) \ + { \ + return (T*)pinned(elements, (af::dtype)dtype_traits::af_type); \ + } \ + template<> AFAPI \ + T* allocHost(const size_t elements) \ + { \ + return (T*)allocHost(elements, (af::dtype)dtype_traits::af_type);\ } INSTANTIATE(float) diff --git a/src/api/unified/device.cpp b/src/api/unified/device.cpp index 43559a077a..4f07788ada 100644 --- a/src/api/unified/device.cpp +++ b/src/api/unified/device.cpp @@ -95,6 +95,16 @@ af_err af_free_pinned(void *ptr) return CALL(ptr); } +af_err af_alloc_host(void **ptr, const dim_t bytes) +{ + return CALL(ptr, bytes); +} + +af_err af_free_host(void *ptr) +{ + return CALL(ptr); +} + af_err af_device_array(af_array *arr, const void *data, const unsigned ndims, const dim_t * const dims, const af_dtype type) { return CALL(arr, data, 
ndims, dims, type); From 3c9d69d209c815c441978dfe0a9a404972d069a5 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 14 Dec 2015 17:12:54 -0500 Subject: [PATCH 063/288] FEAT added infoString function to return info as string --- include/af/device.h | 36 +++++++++++++++++++++++++++++++++++- src/api/c/device.cpp | 13 +++++++++++++ src/api/cpp/device.cpp | 7 +++++++ src/api/unified/device.cpp | 5 +++++ 4 files changed, 60 insertions(+), 1 deletion(-) diff --git a/include/af/device.h b/include/af/device.h index 03800c3ffd..d3585c619c 100644 --- a/include/af/device.h +++ b/include/af/device.h @@ -28,6 +28,27 @@ namespace af @} */ + /** + \defgroup device_func_info_string infoString + + Get af::info() as a string + + @{ + + \brief Returns the output of af::info() as a string + + \param[in] verbose flag to return verbose info + + \returns string containing output of af::info() + + \ingroup arrayfire_func + \ingroup device_mat + */ + AFAPI const char* infoString(const bool verbose = false); + /** + @} + */ + /** \defgroup device_func_prop deviceInfo @@ -205,10 +226,23 @@ extern "C" { */ AFAPI af_err af_info(); + /** + \ingroup device_func_info + */ AFAPI af_err af_init(); /** - \ingroup device_func_info + \brief Gets the output of af_info() as a string + + \param[out] str contains the string + \param[in] verbose flag to return verbose info + + \ingroup device_func_info_string + */ + AFAPI af_err af_info_string(char** str, const bool verbose); + + /** + \ingroup device_func_prop */ AFAPI af_err af_device_info(char* d_name, char* d_platform, char *d_toolkit, char* d_compute); diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index 39ad217939..365ccbe580 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -70,6 +70,19 @@ af_err af_info() return AF_SUCCESS; } +af_err af_info_string(char **str, const bool verbose) +{ + std::string infoStr = getInfo(); + *str = (char*)malloc(sizeof(char) * (infoStr.size() + 1)); + + // Need to do a deep copy + // 
str.c_str wont cut it + infoStr.copy(*str, infoStr.size()); + (*str)[infoStr.size()] = '\0'; + + return AF_SUCCESS; +} + af_err af_get_version(int *major, int *minor, int *patch) { *major = AF_VERSION_MAJOR; diff --git a/src/api/cpp/device.cpp b/src/api/cpp/device.cpp index 622809ed06..3f2441732d 100644 --- a/src/api/cpp/device.cpp +++ b/src/api/cpp/device.cpp @@ -47,6 +47,13 @@ namespace af AF_THROW(af_info()); } + const char* infoString(const bool verbose) + { + char *str = NULL; + AF_THROW(af_info_string(&str, verbose)); + return (const char *)str; + } + void deviceprop(char* d_name, char* d_platform, char *d_toolkit, char* d_compute) { deviceInfo(d_name, d_platform, d_toolkit, d_compute); diff --git a/src/api/unified/device.cpp b/src/api/unified/device.cpp index 4f07788ada..8f04bf6ea1 100644 --- a/src/api/unified/device.cpp +++ b/src/api/unified/device.cpp @@ -45,6 +45,11 @@ af_err af_init() return CALL_NO_PARAMS(); } +af_err af_info_string(char **str, const bool verbose) +{ + return CALL(str, verbose); +} + af_err af_device_info(char* d_name, char* d_platform, char *d_toolkit, char* d_compute) { return CALL(d_name, d_platform, d_toolkit, d_compute); From 1de97de812499e6b969e9e23e0e35306246b96f0 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 14 Dec 2015 17:33:06 -0500 Subject: [PATCH 064/288] Using af_alloc_host when allocating user-return string --- src/api/c/device.cpp | 14 ++++++++------ src/api/c/err_common.cpp | 5 +++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index 365ccbe580..84cd246a60 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -72,13 +72,15 @@ af_err af_info() af_err af_info_string(char **str, const bool verbose) { - std::string infoStr = getInfo(); - *str = (char*)malloc(sizeof(char) * (infoStr.size() + 1)); + try { + std::string infoStr = getInfo(); + af_alloc_host((void**)str, sizeof(char) * (infoStr.size() + 1)); - // Need to do a deep copy - // 
str.c_str wont cut it - infoStr.copy(*str, infoStr.size()); - (*str)[infoStr.size()] = '\0'; + // Need to do a deep copy + // str.c_str wont cut it + infoStr.copy(*str, infoStr.size()); + (*str)[infoStr.size()] = '\0'; + } CATCHALL; return AF_SUCCESS; } diff --git a/src/api/c/err_common.cpp b/src/api/c/err_common.cpp index 371bbd95fa..3271423289 100644 --- a/src/api/c/err_common.cpp +++ b/src/api/c/err_common.cpp @@ -8,6 +8,7 @@ ********************************************************/ #include +#include #include #include #include @@ -156,8 +157,8 @@ void af_get_last_error(char **str, dim_t *len) *str = NULL; } - *str = new char[*len + 1]; - memcpy(*str, global_err_string.c_str(), *len * sizeof(char)); + af_alloc_host((void**)str, sizeof(char) * (*len + 1)); + global_err_string.copy(*str, *len); (*str)[*len] = '\0'; global_err_string = std::string(""); From f628fbe535163508328d18836b77c0db72275197 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 14 Dec 2015 17:49:00 -0500 Subject: [PATCH 065/288] toString now uses af_alloc_host to allocate memory --- src/api/c/print.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api/c/print.cpp b/src/api/c/print.cpp index a5c178cf0c..ea84cd61a9 100644 --- a/src/api/c/print.cpp +++ b/src/api/c/print.cpp @@ -172,8 +172,8 @@ af_err af_array_to_string(char **output, const char *exp, const af_array arr, default: TYPE_ERROR(1, type); } std::string str = ss.str(); - *output = new char[str.size() + 1]; - std::copy(str.begin(), str.end(), *output); + af_alloc_host((void**)output, sizeof(char) * (str.size() + 1)); + str.copy(*output, str.size()); (*output)[str.size()] = '\0'; // don't forget the terminating 0 } CATCHALL; From 72060282fc4b94133ddb7afd2a12abdb956c7eea Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 14 Dec 2015 18:07:12 -0500 Subject: [PATCH 066/288] Add overload of toString that returns a string --- include/af/util.h | 21 ++++++++++++++++++++- src/api/cpp/util.cpp | 6 ++++++ 
2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/include/af/util.h b/include/af/util.h index c1fd96ab24..a56337653d 100644 --- a/include/af/util.h +++ b/include/af/util.h @@ -95,7 +95,8 @@ namespace af #if AF_API_VERSION >= 31 /** \param[out] output is the pointer to the c-string that will hold the data. The memory for - output is allocated by the function. The user is responsible for deleting the memory. + output is allocated by the function. The user is responsible for deleting the memory using + af::freeHost() or af_free_host(). \param[in] exp is an expression, generally the name of the array \param[in] arr is the input array \param[in] precision is the precision length for display @@ -108,6 +109,24 @@ namespace af const int precision = 4, const bool transpose = true); #endif +#if AF_API_VERSION >= 33 + /** + \param[in] exp is an expression, generally the name of the array + \param[in] arr is the input array + \param[in] precision is the precision length for display + \param[in] transpose determines whether or not to transpose the array before storing it in + the string + + \return output is the pointer to the c-string that will hold the data. The memory for + output is allocated by the function. The user is responsible for deleting the memory using + af::freeHost() or af_free_host(). 
+ + \ingroup print_func_tostring + */ + AFAPI const char* toString(const char *exp, const array &arr, + const int precision = 4, const bool transpose = true); +#endif + // Purpose of Addition: "How to add Function" documentation AFAPI array exampleFunction(const array& in, const af_someenum_t param); } diff --git a/src/api/cpp/util.cpp b/src/api/cpp/util.cpp index a99b8567e0..895d347d92 100644 --- a/src/api/cpp/util.cpp +++ b/src/api/cpp/util.cpp @@ -62,4 +62,10 @@ namespace af return; } + const char* toString(const char *exp, const array &arr, const int precision, const bool transpose) + { + char *output = NULL; + AF_THROW(af_array_to_string(&output, exp, arr.get(), precision, transpose)); + return output; + } } From 06d4befbd5282b47b5731b3664ee951e3fbcbace Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 14 Dec 2015 18:44:37 -0500 Subject: [PATCH 067/288] FEAT add af_get_revision to get commit instead of AF_REVISION --- include/af/util.h | 12 +++++++++++- src/api/c/version.cpp | 16 ++++++++++++++++ src/api/unified/util.cpp | 5 +++++ 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 src/api/c/version.cpp diff --git a/include/af/util.h b/include/af/util.h index a56337653d..eef46f47c9 100644 --- a/include/af/util.h +++ b/include/af/util.h @@ -248,10 +248,20 @@ extern "C" { AFAPI af_err af_example_function(af_array* out, const af_array in, const af_someenum_t param); /// - ///Get the version information of the library + /// Get the version information of the library /// AFAPI af_err af_get_version(int *major, int *minor, int *patch); + +#if AF_API_VERSION >= 33 + /// + /// Get the revision (commit) information of the library. + /// This returns a constant string from compile time and should not be + /// freed by the user. 
+ /// + AFAPI const char *af_get_revision(); +#endif + #ifdef __cplusplus } #endif diff --git a/src/api/c/version.cpp b/src/api/c/version.cpp new file mode 100644 index 0000000000..4eb7883a41 --- /dev/null +++ b/src/api/c/version.cpp @@ -0,0 +1,16 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include + +const char *af_get_revision() +{ + return AF_REVISION; +} diff --git a/src/api/unified/util.cpp b/src/api/unified/util.cpp index 155c4f81b9..1a4dcf54a1 100644 --- a/src/api/unified/util.cpp +++ b/src/api/unified/util.cpp @@ -61,3 +61,8 @@ af_err af_get_version(int *major, int *minor, int *patch) { return CALL(major, minor, patch); } + +const char *af_get_revision() +{ + return CALL_NO_PARAMS(); +} From b3c28b6560147f93d84d1b2102074c47f8f4bbf9 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Tue, 15 Dec 2015 13:27:25 -0500 Subject: [PATCH 068/288] Using c/version.cpp in unified --- src/api/unified/CMakeLists.txt | 1 + src/api/unified/util.cpp | 5 ----- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/api/unified/CMakeLists.txt b/src/api/unified/CMakeLists.txt index 917c6dce42..a4843bb49c 100644 --- a/src/api/unified/CMakeLists.txt +++ b/src/api/unified/CMakeLists.txt @@ -18,6 +18,7 @@ FILE(GLOB common_sources "../c/util.cpp" "../c/err_common.cpp" "../c/type_util.cpp" + "../c/version.cpp" "../../backend/dim4.cpp" ) diff --git a/src/api/unified/util.cpp b/src/api/unified/util.cpp index 1a4dcf54a1..155c4f81b9 100644 --- a/src/api/unified/util.cpp +++ b/src/api/unified/util.cpp @@ -61,8 +61,3 @@ af_err af_get_version(int *major, int *minor, int *patch) { return CALL(major, minor, patch); } - -const char *af_get_revision() -{ - return 
CALL_NO_PARAMS(); -} From 5507717ce82024f42d1a8c9bba1514215afced5c Mon Sep 17 00:00:00 2001 From: syurkevi Date: Tue, 15 Dec 2015 17:07:43 -0500 Subject: [PATCH 069/288] add collisions, split vectors into components for performance --- examples/graphics/gravity_sim.cpp | 106 ++++++++++++++++++------------ 1 file changed, 65 insertions(+), 41 deletions(-) diff --git a/examples/graphics/gravity_sim.cpp b/examples/graphics/gravity_sim.cpp index 77f662f4db..94d321ba7c 100644 --- a/examples/graphics/gravity_sim.cpp +++ b/examples/graphics/gravity_sim.cpp @@ -15,53 +15,72 @@ using namespace af; using namespace std; static const int width = 512, height = 512; +static const int pixels_per_unit = 20; +af::array p_x; +af::array p_y; +af::array vels_x; +af::array vels_y; +af::array forces_x; +af::array forces_y; -void simulate(af::array &parts, af::array &vels, af::array &forces){ - parts += vels; +void simulate(float dt){ + p_x += vels_x * pixels_per_unit * dt; + p_y += vels_y * pixels_per_unit * dt; //calculate distance to center - float center_coors[2] = { width / 2, height / 2 }; - af::array col = tile(af::array(1, 2, center_coors), parts.dims(0)); - af::array diff = parts - col; - af::array dist = sqrt( diff.col(0)*diff.col(0) + diff.col(1)*diff.col(1) ); + af::array diff_x = p_x - width/2; + af::array diff_y = p_y - height/2; + af::array dist = sqrt( diff_x*diff_x + diff_y*diff_y ); - forces = -1 * diff; - forces.col(0) /= dist; //normalize force vectors - forces.col(1) /= dist; //normalize force vectors + //calculate normalised force vectors + forces_x = -1 * diff_x / dist; + forces_y = -1 * diff_y / dist; + //update force scaled to time and magnitude constant + forces_x *= pixels_per_unit * dt; + forces_y *= pixels_per_unit * dt; + + //dampening + vels_x *= 1 - (0.005*dt); + vels_y *= 1 - (0.005*dt); //update velocities from forces - vels += forces; + vels_x += forces_x; + vels_y += forces_y; } -void collisions(af::array &parts, af::array &vels){ +void collisions(){ 
//clamp particles inside screen border - parts.col(0) = min(width, max(0, parts.col(0))); - parts.col(1) = min(height - 1, max(0, parts.col(1))); + af::array projected_px = min(width, max(0, p_x)); + af::array projected_py = min(height - 1, max(0, p_y)); //calculate distance to center - float center_coors[2] = { width / 2, height / 2 }; - af::array col = tile(af::array(1, 2, center_coors), parts.dims(0)); - af::array diff = parts - col; - af::array dist = sqrt( diff.col(0)*diff.col(0) + diff.col(1)*diff.col(1) ); + af::array diff_x = projected_px - width/2; + af::array diff_y = projected_py - height/2; + af::array dist = sqrt( diff_x*diff_x + diff_y*diff_y ); - /* //collide with center sphere - int radius = 50; - af::array col_ids = dist(dist 0) { - //vels(col_ids, span) += -1 * parts(col_ids, span); - vels(col_ids, span) = 0; + const int radius = 50; + const float elastic_constant = 0.91f; + if(sum(dist 0) { + vels_x(dist Date: Wed, 16 Dec 2015 19:29:14 -0500 Subject: [PATCH 070/288] Fixes for asynchronous cpu copy && set functions Also, added a check in Array::eval to throw exception if Array::eval is being called from a queue thread. This change also includes all the regression fixes for other functions regarding this eval change. 
--- src/backend/cpu/Array.cpp | 2 + src/backend/cpu/approx.cpp | 7 + src/backend/cpu/assign.cpp | 4 + src/backend/cpu/blas.cpp | 6 + src/backend/cpu/copy.cpp | 12 +- src/backend/cpu/diagonal.cpp | 9 +- src/backend/cpu/index.cpp | 3 + src/backend/cpu/ireduce.cpp | 54 ++--- src/backend/cpu/morph.cpp | 6 + src/backend/cpu/reduce.cpp | 381 ++++++++++++++++---------------- src/backend/cpu/reorder.cpp | 2 + src/backend/cpu/set.cpp | 5 +- src/backend/cpu/sort_by_key.cpp | 173 ++++++++------- src/backend/cpu/svd.cpp | 5 + src/backend/cpu/tile.cpp | 2 + src/backend/cpu/transpose.cpp | 47 ++-- 16 files changed, 392 insertions(+), 326 deletions(-) diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 8577374d6e..456f4c8b1f 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -8,6 +8,7 @@ ********************************************************/ #include +#include #include #include #include @@ -69,6 +70,7 @@ namespace cpu void Array::eval() { if (isReady()) return; + if (getQueue().is_worker()) AF_ERROR("Array not evaluated", AF_ERR_INTERNAL); this->setId(getActiveDeviceId()); diff --git a/src/backend/cpu/approx.cpp b/src/backend/cpu/approx.cpp index 87ae56f44d..4d3c8803ff 100644 --- a/src/backend/cpu/approx.cpp +++ b/src/backend/cpu/approx.cpp @@ -136,6 +136,9 @@ namespace cpu Array approx1(const Array &in, const Array &pos, const af_interp_type method, const float offGrid) { + in.eval(); + pos.eval(); + af::dim4 odims = in.dims(); odims[0] = pos.dims()[0]; @@ -305,6 +308,10 @@ namespace cpu Array approx2(const Array &in, const Array &pos0, const Array &pos1, const af_interp_type method, const float offGrid) { + in.eval(); + pos0.eval(); + pos1.eval(); + af::dim4 odims = in.dims(); odims[0] = pos0.dims()[0]; odims[1] = pos0.dims()[1]; diff --git a/src/backend/cpu/assign.cpp b/src/backend/cpu/assign.cpp index b1578d49f6..c5d733bb17 100644 --- a/src/backend/cpu/assign.cpp +++ b/src/backend/cpu/assign.cpp @@ -41,6 +41,9 @@ dim_t 
trimIndex(int idx, const dim_t &len) template void assign(Array& out, const af_index_t idxrs[], const Array& rhs) { + out.eval(); + rhs.eval(); + vector isSeq(4); vector seqs(4, af_span); // create seq vector to retrieve output dimensions, offsets & offsets @@ -56,6 +59,7 @@ void assign(Array& out, const af_index_t idxrs[], const Array& rhs) for (dim_t x=0; x<4; ++x) { if (!isSeq[x]) { idxArrs[x] = castArray(idxrs[x].idx.arr); + idxArrs[x].eval(); } } diff --git a/src/backend/cpu/blas.cpp b/src/backend/cpu/blas.cpp index 3326241f10..26ec8b488b 100644 --- a/src/backend/cpu/blas.cpp +++ b/src/backend/cpu/blas.cpp @@ -147,6 +147,9 @@ template Array matmul(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { + lhs.eval(); + rhs.eval(); + CBLAS_TRANSPOSE lOpts = toCblasTranspose(optLhs); CBLAS_TRANSPOSE rOpts = toCblasTranspose(optRhs); @@ -225,6 +228,9 @@ template Array dot(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { + lhs.eval(); + rhs.eval(); + Array out = createEmptyArray(af::dim4(1)); if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) { getQueue().enqueue(dot_, out, lhs, rhs, optLhs, optRhs); diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index 80f28dae13..52403605ca 100644 --- a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp @@ -48,7 +48,7 @@ namespace cpu template void copyData(T *to, const Array &from) { - evalArray(from); + from.eval(); getQueue().sync(); if(from.isOwner()) { // FIXME: Check for errors / exceptions @@ -118,16 +118,18 @@ namespace cpu template void multiply_inplace(Array &in, double val) { + in.eval(); getQueue().enqueue(copy, in, in, 0, val); } template - Array - padArray(Array const &in, dim4 const &dims, - outType default_value, double factor) + Array padArray(Array const &in, dim4 const &dims, + outType default_value, double factor) { Array ret = createValueArray(dims, default_value); ret.eval(); + in.eval(); + // FIXME: getQueue().sync(); 
getQueue().enqueue(copy, ret, in, outType(default_value), factor); return ret; @@ -136,6 +138,8 @@ namespace cpu template void copyArray(Array &out, Array const &in) { + out.eval(); + in.eval(); getQueue().enqueue(copy, out, in, scalar(0), 1.0); } diff --git a/src/backend/cpu/diagonal.cpp b/src/backend/cpu/diagonal.cpp index 182027d8e7..856ed6ed44 100644 --- a/src/backend/cpu/diagonal.cpp +++ b/src/backend/cpu/diagonal.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +23,8 @@ namespace cpu template Array diagCreate(const Array &in, const int num) { + in.eval(); + int size = in.dims()[0] + std::abs(num); int batch = in.dims()[1]; Array out = createEmptyArray(dim4(size, size, batch)); @@ -52,12 +55,14 @@ namespace cpu template Array diagExtract(const Array &in, const int num) { - const dim_t *idims = in.dims().get(); + in.eval(); + + const dim4 idims = in.dims(); dim_t size = std::max(idims[0], idims[1]) - std::abs(num); Array out = createEmptyArray(dim4(size, 1, idims[2], idims[3])); auto func = [=] (Array out, const Array in) { - const dim_t *odims = out.dims().get(); + const dim4 odims = out.dims(); const int i_off = (num > 0) ? 
(num * in.strides()[1]) : (-num); diff --git a/src/backend/cpu/index.cpp b/src/backend/cpu/index.cpp index c1beeea9c0..68c2f16a23 100644 --- a/src/backend/cpu/index.cpp +++ b/src/backend/cpu/index.cpp @@ -41,6 +41,8 @@ dim_t trimIndex(dim_t idx, const dim_t &len) template Array index(const Array& in, const af_index_t idxrs[]) { + in.eval(); + vector isSeq(4); vector seqs(4, af_span); // create seq vector to retrieve output @@ -60,6 +62,7 @@ Array index(const Array& in, const af_index_t idxrs[]) for (dim_t x=0; x(idxrs[x].idx.arr); + idxArrs[x].eval(); // set output array ith dimension value oDims[x] = idxArrs[x].elements(); } diff --git a/src/backend/cpu/ireduce.cpp b/src/backend/cpu/ireduce.cpp index 7f4b03c2c5..e562bae068 100644 --- a/src/backend/cpu/ireduce.cpp +++ b/src/backend/cpu/ireduce.cpp @@ -71,19 +71,16 @@ namespace cpu template struct ireduce_dim { - void operator()(T *out, const dim4 ostrides, const dim4 odims, - uint *loc, - const T *in , const dim4 istrides, const dim4 idims, - const int dim) + void operator()(Array output, Array locArray, const dim_t outOffset, + const Array input, const dim_t inOffset, const int dim) { + const dim4 odims = output.dims(); + const dim4 ostrides = output.strides(); + const dim4 istrides = input.strides(); const int D1 = D - 1; for (dim_t i = 0; i < odims[D1]; i++) { - ireduce_dim()(out + i * ostrides[D1], - ostrides, odims, - loc + i * ostrides[D1], - in + i * istrides[D1], - istrides, idims, - dim); + ireduce_dim()(output, locArray, outOffset + i * ostrides[D1], + input, inOffset + i * istrides[D1], dim); } } }; @@ -91,33 +88,38 @@ namespace cpu template struct ireduce_dim { - void operator()(T *out, const dim4 ostrides, const dim4 odims, - uint *loc, - const T *in , const dim4 istrides, const dim4 idims, - const int dim) + void operator()(Array output, Array locArray, const dim_t outOffset, + const Array input, const dim_t inOffset, const int dim) { + const dim4 idims = input.dims(); + const dim4 istrides = 
input.strides(); + + T const * const in = input.get(); + T * out = output.get(); + uint * loc = locArray.get(); dim_t stride = istrides[dim]; MinMaxOp Op(in[0], 0); for (dim_t i = 0; i < idims[dim]; i++) { - Op(in[i * stride], i); + Op(in[inOffset + i * stride], i); } - *out = Op.m_val; - *loc = Op.m_idx; + *(out+outOffset) = Op.m_val; + *(loc+outOffset) = Op.m_idx; } }; template - using ireduce_dim_func = std::function; + using ireduce_dim_func = std::function, Array, const dim_t, + const Array, const dim_t, const int)>; template - void ireduce(Array &out, Array &loc, - const Array &in, const int dim) + void ireduce(Array &out, Array &loc, const Array &in, const int dim) { + out.eval(); + loc.eval(); + in.eval(); + dim4 odims = in.dims(); odims[dim] = 1; static const ireduce_dim_func ireduce_funcs[] = { ireduce_dim() @@ -125,15 +127,15 @@ namespace cpu , ireduce_dim() , ireduce_dim()}; - getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out.get(), out.strides(), out.dims(), - loc.get(), in.get(), in.strides(), in.dims(), dim); + getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim); } template T ireduce_all(unsigned *loc, const Array &in) { - evalArray(in); + in.eval(); getQueue().sync(); + af::dim4 dims = in.dims(); af::dim4 strides = in.strides(); const T *inPtr = in.get(); diff --git a/src/backend/cpu/morph.cpp b/src/backend/cpu/morph.cpp index c64d09be30..945c32b310 100644 --- a/src/backend/cpu/morph.cpp +++ b/src/backend/cpu/morph.cpp @@ -33,6 +33,9 @@ static inline unsigned getIdx(const dim4 &strides, template Array morph(const Array &in, const Array &mask) { + in.eval(); + mask.eval(); + Array out = createEmptyArray(in.dims()); auto func = [=] (Array out, const Array in, const Array mask) { @@ -96,6 +99,9 @@ Array morph(const Array &in, const Array &mask) template Array morph3d(const Array &in, const Array &mask) { + in.eval(); + mask.eval(); + Array out = createEmptyArray(in.dims()); auto func = [=] (Array out, const Array in, const 
Array mask) { diff --git a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp index e01f0c51f1..cce12268e8 100644 --- a/src/backend/cpu/reduce.cpp +++ b/src/backend/cpu/reduce.cpp @@ -37,220 +37,229 @@ struct Binary namespace cpu { - template - struct reduce_dim - { - void operator()(To *out, const dim4 &ostrides, const dim4 &odims, - const Ti *in , const dim4 &istrides, const dim4 &idims, - const int dim, bool change_nan, double nanval) - { - static const int D1 = D - 1; - static reduce_dim reduce_dim_next; - for (dim_t i = 0; i < odims[D1]; i++) { - reduce_dim_next(out + i * ostrides[D1], - ostrides, odims, - in + i * istrides[D1], - istrides, idims, - dim, change_nan, nanval); - } - } - }; - template - struct reduce_dim +template +struct reduce_dim +{ + void operator()(Array out, const dim_t outOffset, + const Array in, const dim_t inOffset, + const int dim, bool change_nan, double nanval) { + static const int D1 = D - 1; + static reduce_dim reduce_dim_next; - Transform transform; - Binary reduce; - void operator()(To *out, const dim4 &ostrides, const dim4 &odims, - const Ti *in , const dim4 &istrides, const dim4 &idims, - const int dim, bool change_nan, double nanval) - { - dim_t stride = istrides[dim]; - - To out_val = reduce.init(); - for (dim_t i = 0; i < idims[dim]; i++) { - To in_val = transform(in[i * stride]); - if (change_nan) in_val = IS_NAN(in_val) ? 
nanval : in_val; - out_val = reduce(in_val, out_val); - } + const dim4 ostrides = out.strides(); + const dim4 istrides = in.strides(); + const dim4 odims = out.dims(); - *out = out_val; + for (dim_t i = 0; i < odims[D1]; i++) { + reduce_dim_next(out, outOffset + i * ostrides[D1], + in, inOffset + i * istrides[D1], + dim, change_nan, nanval); } - }; + } +}; - template - using reduce_dim_func = std::function; +template +struct reduce_dim +{ - template - Array reduce(const Array &in, const int dim, bool change_nan, double nanval) + Transform transform; + Binary reduce; + void operator()(Array out, const dim_t outOffset, + const Array in, const dim_t inOffset, + const int dim, bool change_nan, double nanval) { - dim4 odims = in.dims(); - odims[dim] = 1; - in.eval(); + const dim4 istrides = in.strides(); + const dim4 idims = in.dims(); + + To * const outPtr = out.get() + outOffset; + Ti const * const inPtr = in.get() + inOffset; + dim_t stride = istrides[dim]; + + To out_val = reduce.init(); + for (dim_t i = 0; i < idims[dim]; i++) { + To in_val = transform(inPtr[i * stride]); + if (change_nan) in_val = IS_NAN(in_val) ? 
nanval : in_val; + out_val = reduce(in_val, out_val); + } - Array out = createEmptyArray(odims); - static const reduce_dim_func reduce_funcs[4] = { reduce_dim() - , reduce_dim() - , reduce_dim() - , reduce_dim()}; + *outPtr = out_val; + } +}; - getQueue().enqueue(reduce_funcs[in.ndims() - 1],out.get(), out.strides(), out.dims(), - in.get(), in.strides(), in.dims(), dim, - change_nan, nanval); +template +using reduce_dim_func = std::function, const dim_t, + const Array, const dim_t, + const int, bool, double)>; - return out; - } +template +Array reduce(const Array &in, const int dim, bool change_nan, double nanval) +{ + dim4 odims = in.dims(); + odims[dim] = 1; + in.eval(); - template - To reduce_all(const Array &in, bool change_nan, double nanval) - { - evalArray(in); - getQueue().sync(); - Transform transform; - Binary reduce; + Array out = createEmptyArray(odims); + static const reduce_dim_func reduce_funcs[4] = { reduce_dim() + , reduce_dim() + , reduce_dim() + , reduce_dim()}; + + getQueue().enqueue(reduce_funcs[in.ndims() - 1], out, 0, in, 0, dim, change_nan, nanval); + + return out; +} + +template +To reduce_all(const Array &in, bool change_nan, double nanval) +{ + in.eval(); + getQueue().sync(); + + Transform transform; + Binary reduce; - To out = reduce.init(); + To out = reduce.init(); - // Decrement dimension of select dimension - af::dim4 dims = in.dims(); - af::dim4 strides = in.strides(); - const Ti *inPtr = in.get(); + // Decrement dimension of select dimension + af::dim4 dims = in.dims(); + af::dim4 strides = in.strides(); + const Ti *inPtr = in.get(); - for(dim_t l = 0; l < dims[3]; l++) { - dim_t off3 = l * strides[3]; + for(dim_t l = 0; l < dims[3]; l++) { + dim_t off3 = l * strides[3]; - for(dim_t k = 0; k < dims[2]; k++) { - dim_t off2 = k * strides[2]; + for(dim_t k = 0; k < dims[2]; k++) { + dim_t off2 = k * strides[2]; - for(dim_t j = 0; j < dims[1]; j++) { - dim_t off1 = j * strides[1]; + for(dim_t j = 0; j < dims[1]; j++) { + dim_t off1 = j 
* strides[1]; - for(dim_t i = 0; i < dims[0]; i++) { - dim_t idx = i + off1 + off2 + off3; + for(dim_t i = 0; i < dims[0]; i++) { + dim_t idx = i + off1 + off2 + off3; - To in_val = transform(inPtr[idx]); - if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val; - out = reduce(in_val, out); - } + To in_val = transform(inPtr[idx]); + if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val; + out = reduce(in_val, out); } } } - - return out; } + return out; +} + #define INSTANTIATE(ROp, Ti, To) \ template Array reduce(const Array &in, const int dim, \ bool change_nan, double nanval); \ template To reduce_all(const Array &in, \ bool change_nan, double nanval); - //min - INSTANTIATE(af_min_t, float , float ) - INSTANTIATE(af_min_t, double , double ) - INSTANTIATE(af_min_t, cfloat , cfloat ) - INSTANTIATE(af_min_t, cdouble, cdouble) - INSTANTIATE(af_min_t, int , int ) - INSTANTIATE(af_min_t, uint , uint ) - INSTANTIATE(af_min_t, intl , intl ) - INSTANTIATE(af_min_t, uintl , uintl ) - INSTANTIATE(af_min_t, char , char ) - INSTANTIATE(af_min_t, uchar , uchar ) - INSTANTIATE(af_min_t, short , short ) - INSTANTIATE(af_min_t, ushort , ushort ) - - //max - INSTANTIATE(af_max_t, float , float ) - INSTANTIATE(af_max_t, double , double ) - INSTANTIATE(af_max_t, cfloat , cfloat ) - INSTANTIATE(af_max_t, cdouble, cdouble) - INSTANTIATE(af_max_t, int , int ) - INSTANTIATE(af_max_t, uint , uint ) - INSTANTIATE(af_max_t, intl , intl ) - INSTANTIATE(af_max_t, uintl , uintl ) - INSTANTIATE(af_max_t, char , char ) - INSTANTIATE(af_max_t, uchar , uchar ) - INSTANTIATE(af_max_t, short , short ) - INSTANTIATE(af_max_t, ushort , ushort ) - - //sum - INSTANTIATE(af_add_t, float , float ) - INSTANTIATE(af_add_t, double , double ) - INSTANTIATE(af_add_t, cfloat , cfloat ) - INSTANTIATE(af_add_t, cdouble, cdouble) - INSTANTIATE(af_add_t, int , int ) - INSTANTIATE(af_add_t, int , float ) - INSTANTIATE(af_add_t, uint , uint ) - INSTANTIATE(af_add_t, uint , float ) - INSTANTIATE(af_add_t, intl 
, intl ) - INSTANTIATE(af_add_t, intl , double ) - INSTANTIATE(af_add_t, uintl , uintl ) - INSTANTIATE(af_add_t, uintl , double ) - INSTANTIATE(af_add_t, char , int ) - INSTANTIATE(af_add_t, char , float ) - INSTANTIATE(af_add_t, uchar , uint ) - INSTANTIATE(af_add_t, uchar , float ) - INSTANTIATE(af_add_t, short , int ) - INSTANTIATE(af_add_t, short , float ) - INSTANTIATE(af_add_t, ushort , uint ) - INSTANTIATE(af_add_t, ushort , float ) - - //mul - INSTANTIATE(af_mul_t, float , float ) - INSTANTIATE(af_mul_t, double , double ) - INSTANTIATE(af_mul_t, cfloat , cfloat ) - INSTANTIATE(af_mul_t, cdouble, cdouble) - INSTANTIATE(af_mul_t, int , int ) - INSTANTIATE(af_mul_t, uint , uint ) - INSTANTIATE(af_mul_t, intl , intl ) - INSTANTIATE(af_mul_t, uintl , uintl ) - INSTANTIATE(af_mul_t, char , int ) - INSTANTIATE(af_mul_t, uchar , uint ) - INSTANTIATE(af_mul_t, short , int ) - INSTANTIATE(af_mul_t, ushort , uint ) - - // count - INSTANTIATE(af_notzero_t, float , uint) - INSTANTIATE(af_notzero_t, double , uint) - INSTANTIATE(af_notzero_t, cfloat , uint) - INSTANTIATE(af_notzero_t, cdouble, uint) - INSTANTIATE(af_notzero_t, int , uint) - INSTANTIATE(af_notzero_t, uint , uint) - INSTANTIATE(af_notzero_t, intl , uint) - INSTANTIATE(af_notzero_t, uintl , uint) - INSTANTIATE(af_notzero_t, char , uint) - INSTANTIATE(af_notzero_t, uchar , uint) - INSTANTIATE(af_notzero_t, short , uint) - INSTANTIATE(af_notzero_t, ushort , uint) - - //anytrue - INSTANTIATE(af_or_t, float , char) - INSTANTIATE(af_or_t, double , char) - INSTANTIATE(af_or_t, cfloat , char) - INSTANTIATE(af_or_t, cdouble, char) - INSTANTIATE(af_or_t, int , char) - INSTANTIATE(af_or_t, uint , char) - INSTANTIATE(af_or_t, intl , char) - INSTANTIATE(af_or_t, uintl , char) - INSTANTIATE(af_or_t, char , char) - INSTANTIATE(af_or_t, uchar , char) - INSTANTIATE(af_or_t, short , char) - INSTANTIATE(af_or_t, ushort , char) - - //alltrue - INSTANTIATE(af_and_t, float , char) - INSTANTIATE(af_and_t, double , char) - 
INSTANTIATE(af_and_t, cfloat , char) - INSTANTIATE(af_and_t, cdouble, char) - INSTANTIATE(af_and_t, int , char) - INSTANTIATE(af_and_t, uint , char) - INSTANTIATE(af_and_t, intl , char) - INSTANTIATE(af_and_t, uintl , char) - INSTANTIATE(af_and_t, char , char) - INSTANTIATE(af_and_t, uchar , char) - INSTANTIATE(af_and_t, short , char) - INSTANTIATE(af_and_t, ushort , char) +//min +INSTANTIATE(af_min_t, float , float ) +INSTANTIATE(af_min_t, double , double ) +INSTANTIATE(af_min_t, cfloat , cfloat ) +INSTANTIATE(af_min_t, cdouble, cdouble) +INSTANTIATE(af_min_t, int , int ) +INSTANTIATE(af_min_t, uint , uint ) +INSTANTIATE(af_min_t, intl , intl ) +INSTANTIATE(af_min_t, uintl , uintl ) +INSTANTIATE(af_min_t, char , char ) +INSTANTIATE(af_min_t, uchar , uchar ) +INSTANTIATE(af_min_t, short , short ) +INSTANTIATE(af_min_t, ushort , ushort ) + +//max +INSTANTIATE(af_max_t, float , float ) +INSTANTIATE(af_max_t, double , double ) +INSTANTIATE(af_max_t, cfloat , cfloat ) +INSTANTIATE(af_max_t, cdouble, cdouble) +INSTANTIATE(af_max_t, int , int ) +INSTANTIATE(af_max_t, uint , uint ) +INSTANTIATE(af_max_t, intl , intl ) +INSTANTIATE(af_max_t, uintl , uintl ) +INSTANTIATE(af_max_t, char , char ) +INSTANTIATE(af_max_t, uchar , uchar ) +INSTANTIATE(af_max_t, short , short ) +INSTANTIATE(af_max_t, ushort , ushort ) + +//sum +INSTANTIATE(af_add_t, float , float ) +INSTANTIATE(af_add_t, double , double ) +INSTANTIATE(af_add_t, cfloat , cfloat ) +INSTANTIATE(af_add_t, cdouble, cdouble) +INSTANTIATE(af_add_t, int , int ) +INSTANTIATE(af_add_t, int , float ) +INSTANTIATE(af_add_t, uint , uint ) +INSTANTIATE(af_add_t, uint , float ) +INSTANTIATE(af_add_t, intl , intl ) +INSTANTIATE(af_add_t, intl , double ) +INSTANTIATE(af_add_t, uintl , uintl ) +INSTANTIATE(af_add_t, uintl , double ) +INSTANTIATE(af_add_t, char , int ) +INSTANTIATE(af_add_t, char , float ) +INSTANTIATE(af_add_t, uchar , uint ) +INSTANTIATE(af_add_t, uchar , float ) +INSTANTIATE(af_add_t, short , int ) 
+INSTANTIATE(af_add_t, short , float ) +INSTANTIATE(af_add_t, ushort , uint ) +INSTANTIATE(af_add_t, ushort , float ) + +//mul +INSTANTIATE(af_mul_t, float , float ) +INSTANTIATE(af_mul_t, double , double ) +INSTANTIATE(af_mul_t, cfloat , cfloat ) +INSTANTIATE(af_mul_t, cdouble, cdouble) +INSTANTIATE(af_mul_t, int , int ) +INSTANTIATE(af_mul_t, uint , uint ) +INSTANTIATE(af_mul_t, intl , intl ) +INSTANTIATE(af_mul_t, uintl , uintl ) +INSTANTIATE(af_mul_t, char , int ) +INSTANTIATE(af_mul_t, uchar , uint ) +INSTANTIATE(af_mul_t, short , int ) +INSTANTIATE(af_mul_t, ushort , uint ) + +// count +INSTANTIATE(af_notzero_t, float , uint) +INSTANTIATE(af_notzero_t, double , uint) +INSTANTIATE(af_notzero_t, cfloat , uint) +INSTANTIATE(af_notzero_t, cdouble, uint) +INSTANTIATE(af_notzero_t, int , uint) +INSTANTIATE(af_notzero_t, uint , uint) +INSTANTIATE(af_notzero_t, intl , uint) +INSTANTIATE(af_notzero_t, uintl , uint) +INSTANTIATE(af_notzero_t, char , uint) +INSTANTIATE(af_notzero_t, uchar , uint) +INSTANTIATE(af_notzero_t, short , uint) +INSTANTIATE(af_notzero_t, ushort , uint) + +//anytrue +INSTANTIATE(af_or_t, float , char) +INSTANTIATE(af_or_t, double , char) +INSTANTIATE(af_or_t, cfloat , char) +INSTANTIATE(af_or_t, cdouble, char) +INSTANTIATE(af_or_t, int , char) +INSTANTIATE(af_or_t, uint , char) +INSTANTIATE(af_or_t, intl , char) +INSTANTIATE(af_or_t, uintl , char) +INSTANTIATE(af_or_t, char , char) +INSTANTIATE(af_or_t, uchar , char) +INSTANTIATE(af_or_t, short , char) +INSTANTIATE(af_or_t, ushort , char) + +//alltrue +INSTANTIATE(af_and_t, float , char) +INSTANTIATE(af_and_t, double , char) +INSTANTIATE(af_and_t, cfloat , char) +INSTANTIATE(af_and_t, cdouble, char) +INSTANTIATE(af_and_t, int , char) +INSTANTIATE(af_and_t, uint , char) +INSTANTIATE(af_and_t, intl , char) +INSTANTIATE(af_and_t, uintl , char) +INSTANTIATE(af_and_t, char , char) +INSTANTIATE(af_and_t, uchar , char) +INSTANTIATE(af_and_t, short , char) +INSTANTIATE(af_and_t, ushort , char) + } diff 
--git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp index 7d7558265c..afe562001d 100644 --- a/src/backend/cpu/reorder.cpp +++ b/src/backend/cpu/reorder.cpp @@ -53,6 +53,8 @@ namespace cpu template Array reorder(const Array &in, const af::dim4 &rdims) { + in.eval(); + const af::dim4 iDims = in.dims(); af::dim4 oDims(0); for(int i = 0; i < 4; i++) diff --git a/src/backend/cpu/set.cpp b/src/backend/cpu/set.cpp index d9ca0849c0..67aa5863ea 100644 --- a/src/backend/cpu/set.cpp +++ b/src/backend/cpu/set.cpp @@ -31,12 +31,15 @@ namespace cpu const bool is_sorted) { in.eval(); - getQueue().sync(); Array out = createEmptyArray(af::dim4()); if (is_sorted) out = copyArray(in); else out = sort(in, 0); + // Need to sync old jobs since we need to + // operator on pointers directly in std::unique + getQueue().sync(); + T *ptr = out.get(); T *last = std::unique(ptr, ptr + in.elements()); dim_t dist = (dim_t)std::distance(ptr, last); diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp index 684b9bac58..d2ebd4296d 100644 --- a/src/backend/cpu/sort_by_key.cpp +++ b/src/backend/cpu/sort_by_key.cpp @@ -27,84 +27,92 @@ using std::async; namespace cpu { - /////////////////////////////////////////////////////////////////////////// - // Kernel Functions - /////////////////////////////////////////////////////////////////////////// - - template - void sort0_by_key(Array okey, Array oval, const Array ikey, const Array ival) - { - function op = greater(); - if(isAscending) { op = less(); } - - // Get pointers and initialize original index locations - Array oidx = createValueArray(ikey.dims(), 0u); - uint *oidx_ptr = oidx.get(); - Tk *okey_ptr = okey.get(); - Tv *oval_ptr = oval.get(); - const Tk *ikey_ptr = ikey.get(); - const Tv *ival_ptr = ival.get(); - - std::vector seq_vec(oidx.dims()[0]); - std::iota(seq_vec.begin(), seq_vec.end(), 0); - - const Tk *comp_ptr = nullptr; - auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return 
op(comp_ptr[i1], comp_ptr[i2]);}; - - for(dim_t w = 0; w < ikey.dims()[3]; w++) { - dim_t okeyW = w * okey.strides()[3]; - dim_t ovalW = w * oval.strides()[3]; - dim_t oidxW = w * oidx.strides()[3]; - dim_t ikeyW = w * ikey.strides()[3]; - dim_t ivalW = w * ival.strides()[3]; - - for(dim_t z = 0; z < ikey.dims()[2]; z++) { - dim_t okeyWZ = okeyW + z * okey.strides()[2]; - dim_t ovalWZ = ovalW + z * oval.strides()[2]; - dim_t oidxWZ = oidxW + z * oidx.strides()[2]; - dim_t ikeyWZ = ikeyW + z * ikey.strides()[2]; - dim_t ivalWZ = ivalW + z * ival.strides()[2]; - - for(dim_t y = 0; y < ikey.dims()[1]; y++) { - - dim_t okeyOffset = okeyWZ + y * okey.strides()[1]; - dim_t ovalOffset = ovalWZ + y * oval.strides()[1]; - dim_t oidxOffset = oidxWZ + y * oidx.strides()[1]; - dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1]; - dim_t ivalOffset = ivalWZ + y * ival.strides()[1]; - - uint *ptr = oidx_ptr + oidxOffset; - std::copy(seq_vec.begin(), seq_vec.end(), ptr); - - comp_ptr = ikey_ptr + ikeyOffset; - std::stable_sort(ptr, ptr + ikey.dims()[0], comparator); - - for (dim_t i = 0; i < oval.dims()[0]; ++i){ - uint sortIdx = oidx_ptr[oidxOffset + i]; - okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx]; - oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx]; - } + +/////////////////////////////////////////////////////////////////////////// +// Kernel Functions +/////////////////////////////////////////////////////////////////////////// + +template +void sort0_by_key(Array okey, Array oval, Array oidx, + const Array ikey, const Array ival) +{ + function op = greater(); + if(isAscending) { op = less(); } + + // Get pointers and initialize original index locations + uint *oidx_ptr = oidx.get(); + Tk *okey_ptr = okey.get(); + Tv *oval_ptr = oval.get(); + const Tk *ikey_ptr = ikey.get(); + const Tv *ival_ptr = ival.get(); + + std::vector seq_vec(oidx.dims()[0]); + std::iota(seq_vec.begin(), seq_vec.end(), 0); + + const Tk *comp_ptr = nullptr; + auto comparator = 
[&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);}; + + for(dim_t w = 0; w < ikey.dims()[3]; w++) { + dim_t okeyW = w * okey.strides()[3]; + dim_t ovalW = w * oval.strides()[3]; + dim_t oidxW = w * oidx.strides()[3]; + dim_t ikeyW = w * ikey.strides()[3]; + dim_t ivalW = w * ival.strides()[3]; + + for(dim_t z = 0; z < ikey.dims()[2]; z++) { + dim_t okeyWZ = okeyW + z * okey.strides()[2]; + dim_t ovalWZ = ovalW + z * oval.strides()[2]; + dim_t oidxWZ = oidxW + z * oidx.strides()[2]; + dim_t ikeyWZ = ikeyW + z * ikey.strides()[2]; + dim_t ivalWZ = ivalW + z * ival.strides()[2]; + + for(dim_t y = 0; y < ikey.dims()[1]; y++) { + + dim_t okeyOffset = okeyWZ + y * okey.strides()[1]; + dim_t ovalOffset = ovalWZ + y * oval.strides()[1]; + dim_t oidxOffset = oidxWZ + y * oidx.strides()[1]; + dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1]; + dim_t ivalOffset = ivalWZ + y * ival.strides()[1]; + + uint *ptr = oidx_ptr + oidxOffset; + std::copy(seq_vec.begin(), seq_vec.end(), ptr); + + comp_ptr = ikey_ptr + ikeyOffset; + std::stable_sort(ptr, ptr + ikey.dims()[0], comparator); + + for (dim_t i = 0; i < oval.dims()[0]; ++i){ + uint sortIdx = oidx_ptr[oidxOffset + i]; + okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx]; + oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx]; } } } - - return; } - /////////////////////////////////////////////////////////////////////////// - // Wrapper Functions - /////////////////////////////////////////////////////////////////////////// - template - void sort_by_key(Array &okey, Array &oval, - const Array &ikey, const Array &ival, const uint dim) - { - okey = createEmptyArray(ikey.dims()); - oval = createEmptyArray(ival.dims()); - switch(dim) { - case 0: getQueue().enqueue(sort0_by_key, okey, oval, ikey, ival); break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); - } + return; +} + +/////////////////////////////////////////////////////////////////////////// +// Wrapper Functions 
+/////////////////////////////////////////////////////////////////////////// +template +void sort_by_key(Array &okey, Array &oval, + const Array &ikey, const Array &ival, const uint dim) +{ + ikey.eval(); + ival.eval(); + + okey = createEmptyArray(ikey.dims()); + oval = createEmptyArray(ival.dims()); + Array oidx = createValueArray(ikey.dims(), 0u); + oidx.eval(); + + switch(dim) { + case 0: getQueue().enqueue(sort0_by_key, + okey, oval, oidx, ikey, ival); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } +} #define INSTANTIATE(Tk, Tv) \ template void \ @@ -127,14 +135,15 @@ namespace cpu INSTANTIATE(Tk, uintl) \ - INSTANTIATE1(float) - INSTANTIATE1(double) - INSTANTIATE1(int) - INSTANTIATE1(uint) - INSTANTIATE1(char) - INSTANTIATE1(uchar) - INSTANTIATE1(short) - INSTANTIATE1(ushort) - INSTANTIATE1(intl) - INSTANTIATE1(uintl) +INSTANTIATE1(float) +INSTANTIATE1(double) +INSTANTIATE1(int) +INSTANTIATE1(uint) +INSTANTIATE1(char) +INSTANTIATE1(uchar) +INSTANTIATE1(short) +INSTANTIATE1(ushort) +INSTANTIATE1(intl) +INSTANTIATE1(uintl) + } diff --git a/src/backend/cpu/svd.cpp b/src/backend/cpu/svd.cpp index 33bfab75aa..39cbb66343 100644 --- a/src/backend/cpu/svd.cpp +++ b/src/backend/cpu/svd.cpp @@ -68,6 +68,11 @@ namespace cpu template void svdInPlace(Array &s, Array &u, Array &vt, Array &in) { + s.eval(); + u.eval(); + vt.eval(); + in.eval(); + auto func = [=] (Array s, Array u, Array vt, Array in) { dim4 iDims = in.dims(); int M = iDims[0]; diff --git a/src/backend/cpu/tile.cpp b/src/backend/cpu/tile.cpp index f7560121f4..4f035450ae 100644 --- a/src/backend/cpu/tile.cpp +++ b/src/backend/cpu/tile.cpp @@ -20,6 +20,8 @@ namespace cpu template Array tile(const Array &in, const af::dim4 &tileDims) { + in.eval(); + const af::dim4 iDims = in.dims(); af::dim4 oDims = iDims; oDims *= tileDims; diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp index c89243bffd..c3a8a37a72 100644 --- a/src/backend/cpu/transpose.cpp +++ 
b/src/backend/cpu/transpose.cpp @@ -52,9 +52,15 @@ cdouble getConjugate(const cdouble &in) } template -void transpose_(T *out, const T *in, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides) +void transpose_(Array output, const Array input) { + const dim4 odims = output.dims(); + const dim4 ostrides = output.strides(); + const dim4 istrides = input.strides(); + + T * out = output.get(); + T const * const in = input.get(); + for (dim_t l = 0; l < odims[3]; ++l) { for (dim_t k = 0; k < odims[2]; ++k) { // Outermost loop handles batch mode @@ -82,35 +88,32 @@ void transpose_(T *out, const T *in, const af::dim4 &odims, const af::dim4 &idim template void transpose_(Array out, const Array in, const bool conjugate) { - // get data pointers for input and output Arrays - T* outData = out.get(); - const T* inData = in.get(); - - if(conjugate) { - transpose_(outData, inData, - out.dims(), in.dims(), out.strides(), in.strides()); - } else { - transpose_(outData, inData, - out.dims(), in.dims(), out.strides(), in.strides()); - } + return (conjugate ? 
transpose_(out, in) : transpose_(out, in)); } template Array transpose(const Array &in, const bool conjugate) { - const dim4 inDims = in.dims(); - - dim4 outDims = dim4(inDims[1],inDims[0],inDims[2],inDims[3]); + in.eval(); + const dim4 inDims = in.dims(); + const dim4 outDims = dim4(inDims[1],inDims[0],inDims[2],inDims[3]); // create an array with first two dimensions swapped Array out = createEmptyArray(outDims); + getQueue().enqueue(transpose_, out, in, conjugate); + return out; } template -void transpose_inplace(T *in, const af::dim4 &idims, const af::dim4 &istrides) +void transpose_inplace(Array input) { + const dim4 idims = input.dims(); + const dim4 istrides = input.strides(); + + T * in = input.get(); + for (dim_t l = 0; l < idims[3]; ++l) { for (dim_t k = 0; k < idims[2]; ++k) { // Outermost loop handles batch mode @@ -141,19 +144,13 @@ void transpose_inplace(T *in, const af::dim4 &idims, const af::dim4 &istrides) template void transpose_inplace_(Array in, const bool conjugate) { - // get data pointers for input and output Arrays - T* inData = in.get(); - - if(conjugate) { - transpose_inplace(inData, in.dims(), in.strides()); - } else { - transpose_inplace(inData, in.dims(), in.strides()); - } + return (conjugate ? 
transpose_inplace(in) : transpose_inplace(in)); } template void transpose_inplace(Array &in, const bool conjugate) { + in.eval(); getQueue().enqueue(transpose_inplace_, in, conjugate); } From 3ba9633e8d1dba645ba29695ef99e11a616ef2f2 Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 17 Dec 2015 14:22:46 -0500 Subject: [PATCH 071/288] Added missing eval for input Array's in cpu backend fns This change has some style fixes related to cpu namespace --- src/backend/cpu/Array.cpp | 402 +++--- src/backend/cpu/approx.cpp | 584 ++++---- src/backend/cpu/bilateral.cpp | 1 + src/backend/cpu/cholesky.cpp | 4 + src/backend/cpu/copy.cpp | 278 ++-- src/backend/cpu/diagonal.cpp | 125 +- src/backend/cpu/diff.cpp | 205 +-- src/backend/cpu/exampleFunction.cpp | 7 + src/backend/cpu/fft.cpp | 6 + src/backend/cpu/gradient.cpp | 1 + src/backend/cpu/identity.cpp | 2 + src/backend/cpu/iota.cpp | 2 + src/backend/cpu/ireduce.cpp | 282 ++-- src/backend/cpu/lu.cpp | 6 + src/backend/cpu/match_template.cpp | 3 + src/backend/cpu/math.cpp | 72 +- src/backend/cpu/meanshift.cpp | 2 + src/backend/cpu/medfilt.cpp | 2 + src/backend/cpu/memory.cpp | 333 ++--- src/backend/cpu/nearest_neighbour.cpp | 2 - src/backend/cpu/qr.cpp | 6 + src/backend/cpu/range.cpp | 2 + src/backend/cpu/reorder.cpp | 98 +- src/backend/cpu/resize.cpp | 2 + src/backend/cpu/rotate.cpp | 178 +-- src/backend/cpu/set.cpp | 178 +-- src/backend/cpu/shift.cpp | 6 +- src/backend/cpu/sift_nonfree.hpp | 1811 +++++++++++++------------ src/backend/cpu/sobel.cpp | 1 + src/backend/cpu/solve.cpp | 11 +- src/backend/cpu/sort.cpp | 96 +- src/backend/cpu/sort_index.cpp | 140 +- src/backend/cpu/susan.cpp | 2 + src/backend/cpu/svd.cpp | 147 +- src/backend/cpu/transform.cpp | 4 +- src/backend/cpu/transpose.cpp | 1 - src/backend/cpu/triangle.cpp | 25 +- src/backend/cpu/unwrap.cpp | 3 +- src/backend/cpu/where.cpp | 81 +- 39 files changed, 2606 insertions(+), 2505 deletions(-) diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 
456f4c8b1f..9c15bc46c6 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -21,250 +21,251 @@ namespace cpu { - const int MAX_TNJ_LEN = 20; - using TNJ::BufferNode; - using TNJ::Node; - using TNJ::Node_ptr; - - using af::dim4; - - template - Array::Array(dim4 dims): - info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), - data(memAlloc(dims.elements()), memFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) - { } - - template - Array::Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device): - info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), - data((is_device & !copy_device) ? (T*)in_data : memAlloc(dims.elements()), memFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) - { - static_assert(std::is_standard_layout>::value, "Array must be a standard layout type"); - static_assert(offsetof(Array, info) == 0, "Array::info must be the first member variable of Array"); - if (!is_device || copy_device) { - std::copy(in_data, in_data + dims.elements(), data.get()); - } - } - template - Array::Array(af::dim4 dims, TNJ::Node_ptr n) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), - data(), data_dims(dims), - node(n), offset(0), ready(false), owner(true) - { +const int MAX_TNJ_LEN = 20; +using TNJ::BufferNode; +using TNJ::Node; +using TNJ::Node_ptr; + +using af::dim4; + +template +Array::Array(dim4 dims): + info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + data(memAlloc(dims.elements()), memFree), data_dims(dims), + node(), offset(0), ready(true), owner(true) +{ } + +template +Array::Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device): + info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + data((is_device & !copy_device) ? 
(T*)in_data : memAlloc(dims.elements()), memFree), data_dims(dims), + node(), offset(0), ready(true), owner(true) +{ + static_assert(std::is_standard_layout>::value, "Array must be a standard layout type"); + static_assert(offsetof(Array, info) == 0, "Array::info must be the first member variable of Array"); + if (!is_device || copy_device) { + std::copy(in_data, in_data + dims.elements(), data.get()); } +} - template - Array::Array(const Array& parent, const dim4 &dims, const dim4 &offsets, const dim4 &strides) : - info(parent.getDevId(), dims, offsets, strides, (af_dtype)dtype_traits::af_type), - data(parent.getData()), data_dims(parent.getDataDims()), - node(), - offset(parent.getOffset() + calcOffset(parent.strides(), offsets)), - ready(true), owner(false) - { } +template +Array::Array(af::dim4 dims, TNJ::Node_ptr n) : + info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + data(), data_dims(dims), + node(n), offset(0), ready(false), owner(true) +{ +} +template +Array::Array(const Array& parent, const dim4 &dims, const dim4 &offsets, const dim4 &strides) : + info(parent.getDevId(), dims, offsets, strides, (af_dtype)dtype_traits::af_type), + data(parent.getData()), data_dims(parent.getDataDims()), + node(), + offset(parent.getOffset() + calcOffset(parent.strides(), offsets)), + ready(true), owner(false) +{ } - template - void Array::eval() - { - if (isReady()) return; - if (getQueue().is_worker()) AF_ERROR("Array not evaluated", AF_ERR_INTERNAL); - this->setId(getActiveDeviceId()); +template +void Array::eval() +{ + if (isReady()) return; + if (getQueue().is_worker()) AF_ERROR("Array not evaluated", AF_ERR_INTERNAL); - data = std::shared_ptr(memAlloc(elements()), memFree); + this->setId(getActiveDeviceId()); - auto func = [] (Array in) { - in.setId(getActiveDeviceId()); - T *ptr = in.data.get(); + data = std::shared_ptr(memAlloc(elements()), memFree); - dim4 odims = in.dims(); - dim4 ostrs = in.strides(); + 
auto func = [] (Array in) { + in.setId(getActiveDeviceId()); + T *ptr = in.data.get(); - bool is_linear = in.node->isLinear(odims.get()); + dim4 odims = in.dims(); + dim4 ostrs = in.strides(); - if (is_linear) { - int num = in.elements(); - for (int i = 0; i < num; i++) { - ptr[i] = *(T *)in.node->calc(i); - } - } else { - for (int w = 0; w < (int)odims[3]; w++) { - dim_t offw = w * ostrs[3]; + bool is_linear = in.node->isLinear(odims.get()); + + if (is_linear) { + int num = in.elements(); + for (int i = 0; i < num; i++) { + ptr[i] = *(T *)in.node->calc(i); + } + } else { + for (int w = 0; w < (int)odims[3]; w++) { + dim_t offw = w * ostrs[3]; - for (int z = 0; z < (int)odims[2]; z++) { - dim_t offz = z * ostrs[2] + offw; + for (int z = 0; z < (int)odims[2]; z++) { + dim_t offz = z * ostrs[2] + offw; - for (int y = 0; y < (int)odims[1]; y++) { - dim_t offy = y * ostrs[1] + offz; + for (int y = 0; y < (int)odims[1]; y++) { + dim_t offy = y * ostrs[1] + offz; - for (int x = 0; x < (int)odims[0]; x++) { - dim_t id = x + offy; + for (int x = 0; x < (int)odims[0]; x++) { + dim_t id = x + offy; - ptr[id] = *(T *)in.node->calc(x, y, z, w); - } + ptr[id] = *(T *)in.node->calc(x, y, z, w); } } } } - }; - - getQueue().enqueue(func, *this); + } + }; - ready = true; - Node_ptr prev = node; - prev->reset(); - // FIXME: Replace the current node in any JIT possible trees with the new BufferNode - node.reset(); - } + getQueue().enqueue(func, *this); - template - void Array::eval() const - { - if (isReady()) return; - const_cast *>(this)->eval(); - } + ready = true; + Node_ptr prev = node; + prev->reset(); + // FIXME: Replace the current node in any JIT possible trees with the new BufferNode + node.reset(); +} - template - Node_ptr Array::getNode() const - { - if (!node) { +template +void Array::eval() const +{ + if (isReady()) return; + const_cast *>(this)->eval(); +} - unsigned bytes = this->getDataDims().elements() * sizeof(T); +template +Node_ptr Array::getNode() const +{ + if 
(!node) { - BufferNode *buf_node = new BufferNode(data, - bytes, - offset, - dims().get(), - strides().get(), - isLinear()); + unsigned bytes = this->getDataDims().elements() * sizeof(T); - const_cast *>(this)->node = Node_ptr(reinterpret_cast(buf_node)); - } + BufferNode *buf_node = new BufferNode(data, + bytes, + offset, + dims().get(), + strides().get(), + isLinear()); - return node; + const_cast *>(this)->node = Node_ptr(reinterpret_cast(buf_node)); } - template - Array - createHostDataArray(const dim4 &size, const T * const data) - { - return Array(size, data, false); - } + return node; +} - template - Array - createDeviceDataArray(const dim4 &size, const void *data) - { - return Array(size, (const T * const) data, true); - } +template +Array +createHostDataArray(const dim4 &size, const T * const data) +{ + return Array(size, data, false); +} - template - Array - createValueArray(const dim4 &size, const T& value) - { - TNJ::ScalarNode *node = new TNJ::ScalarNode(value); - return createNodeArray(size, TNJ::Node_ptr( - reinterpret_cast(node))); - } +template +Array +createDeviceDataArray(const dim4 &size, const void *data) +{ + return Array(size, (const T * const) data, true); +} - template - Array - createEmptyArray(const dim4 &size) - { - return Array(size); - } +template +Array +createValueArray(const dim4 &size, const T& value) +{ + TNJ::ScalarNode *node = new TNJ::ScalarNode(value); + return createNodeArray(size, TNJ::Node_ptr( + reinterpret_cast(node))); +} - template - Array *initArray() { return new Array(dim4(0, 0, 0, 0)); } +template +Array +createEmptyArray(const dim4 &size) +{ + return Array(size); +} +template +Array *initArray() { return new Array(dim4(0, 0, 0, 0)); } - template - Array - createNodeArray(const dim4 &dims, Node_ptr node) - { - Array out = Array(dims, node); - unsigned length =0, buf_count = 0, bytes = 0; +template +Array +createNodeArray(const dim4 &dims, Node_ptr node) +{ + Array out = Array(dims, node); - Node *n = node.get(); - 
n->getInfo(length, buf_count, bytes); - n->reset(); + unsigned length =0, buf_count = 0, bytes = 0; - if (length > MAX_TNJ_LEN || - buf_count >= MAX_BUFFERS || - bytes >= MAX_BYTES) { - out.eval(); - } + Node *n = node.get(); + n->getInfo(length, buf_count, bytes); + n->reset(); - return out; + if (length > MAX_TNJ_LEN || + buf_count >= MAX_BUFFERS || + bytes >= MAX_BYTES) { + out.eval(); } + return out; +} - template - Array createSubArray(const Array& parent, - const std::vector &index, - bool copy) - { - parent.eval(); - dim4 dDims = parent.getDataDims(); - dim4 pDims = parent.dims(); +template +Array createSubArray(const Array& parent, + const std::vector &index, + bool copy) +{ + parent.eval(); - dim4 dims = toDims (index, pDims); - dim4 offset = toOffset(index, dDims); - dim4 stride = toStride (index, dDims); + dim4 dDims = parent.getDataDims(); + dim4 pDims = parent.dims(); - Array out = Array(parent, dims, offset, stride); + dim4 dims = toDims (index, pDims); + dim4 offset = toOffset(index, dDims); + dim4 stride = toStride (index, dDims); - if (!copy) return out; + Array out = Array(parent, dims, offset, stride); - if (stride[0] != 1 || - stride[1] < 0 || - stride[2] < 0 || - stride[3] < 0) { + if (!copy) return out; - out = copyArray(out); - } + if (stride[0] != 1 || + stride[1] < 0 || + stride[2] < 0 || + stride[3] < 0) { - return out; + out = copyArray(out); } - template - void - destroyArray(Array *A) - { - delete A; - } + return out; +} +template +void +destroyArray(Array *A) +{ + delete A; +} - template - void evalArray(const Array &A) - { - A.eval(); - } - template - void - writeHostDataArray(Array &arr, const T * const data, const size_t bytes) - { - if(!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); - } - memcpy(arr.get() + arr.getOffset(), data, bytes); +template +void evalArray(const Array &A) +{ + A.eval(); +} + +template +void +writeHostDataArray(Array &arr, const T * const data, const size_t bytes) +{ + if(!arr.isOwner()) { + arr = 
createEmptyArray(arr.dims()); } + memcpy(arr.get() + arr.getOffset(), data, bytes); +} - template - void - writeDeviceDataArray(Array &arr, const void * const data, const size_t bytes) - { - if(!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); - } - memcpy(arr.get() + arr.getOffset(), (const T * const)data, bytes); +template +void +writeDeviceDataArray(Array &arr, const void * const data, const size_t bytes) +{ + if(!arr.isOwner()) { + arr = createEmptyArray(arr.dims()); } + memcpy(arr.get() + arr.getOffset(), (const T * const)data, bytes); +} #define INSTANTIATE(T) \ template Array createHostDataArray (const dim4 &size, const T * const data); \ @@ -286,16 +287,17 @@ namespace cpu template void writeHostDataArray (Array &arr, const T * const data, const size_t bytes); \ template void writeDeviceDataArray (Array &arr, const void * const data, const size_t bytes); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(short) - INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(short) +INSTANTIATE(ushort) + } diff --git a/src/backend/cpu/approx.cpp b/src/backend/cpu/approx.cpp index 4d3c8803ff..7988863d4d 100644 --- a/src/backend/cpu/approx.cpp +++ b/src/backend/cpu/approx.cpp @@ -17,330 +17,339 @@ namespace cpu { - /////////////////////////////////////////////////////////////////////////// - // Approx1 - /////////////////////////////////////////////////////////////////////////// - template - struct approx1_op + +/////////////////////////////////////////////////////////////////////////// +// Approx1 +/////////////////////////////////////////////////////////////////////////// +template +struct approx1_op +{ + void 
operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, + const Ty *in, const af::dim4 &idims, const dim_t iElems, + const Tp *pos, const af::dim4 &pdims, + const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, + const float offGrid, const bool pBatch, + const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - return; - } - }; + return; + } +}; - template - struct approx1_op +template +struct approx1_op +{ + void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, + const Ty *in, const af::dim4 &idims, const dim_t iElems, + const Tp *pos, const af::dim4 &pdims, + const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, + const float offGrid, const bool pBatch, + const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - dim_t pmId = idx; - if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1]; - - const Tp x = pos[pmId]; - bool gFlag = false; - if (x < 0 || idims[0] < x+1) { // No need to check y - gFlag = true; - } + dim_t pmId = idx; + if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1]; - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] - + idy * ostrides[1] + idx; - if(gFlag) { - out[omId] = 
scalar(offGrid); - } else { - dim_t ioff = idw * istrides[3] + idz * istrides[2] - + idy * istrides[1]; - const dim_t iMem = round(x) + ioff; + const Tp x = pos[pmId]; + bool gFlag = false; + if (x < 0 || idims[0] < x+1) { // No need to check y + gFlag = true; + } - out[omId] = in[iMem]; - } + const dim_t omId = idw * ostrides[3] + idz * ostrides[2] + + idy * ostrides[1] + idx; + if(gFlag) { + out[omId] = scalar(offGrid); + } else { + dim_t ioff = idw * istrides[3] + idz * istrides[2] + + idy * istrides[1]; + const dim_t iMem = round(x) + ioff; + + out[omId] = in[iMem]; } - }; + } +}; - template - struct approx1_op +template +struct approx1_op +{ + void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, + const Ty *in, const af::dim4 &idims, const dim_t iElems, + const Tp *pos, const af::dim4 &pdims, + const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, + const float offGrid, const bool pBatch, + const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - dim_t pmId = idx; - if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1]; - - const Tp x = pos[pmId]; - bool gFlag = false; - if (x < 0 || idims[0] < x+1) { - gFlag = true; - } + dim_t pmId = idx; + if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1]; - const dim_t grid_x = floor(x); // nearest grid - const Tp off_x = x - grid_x; // fractional offset - - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] - + idy * ostrides[1] + idx; - if(gFlag) { - out[omId] = scalar(offGrid); - } else { - dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * 
istrides[1] + grid_x; - - // Check if x and x + 1 are both valid indices - bool cond = (x < idims[0] - 1); - // Compute Left and Right Weighted Values - Ty yl = ((Tp)1.0 - off_x) * in[ioff]; - Ty yr = cond ? (off_x) * in[ioff + 1] : scalar(0); - Ty yo = yl + yr; - // Compute Weight used - Tp wt = cond ? (Tp)1.0 : (Tp)(1.0 - off_x); - // Write final value - out[omId] = (yo / wt); - } + const Tp x = pos[pmId]; + bool gFlag = false; + if (x < 0 || idims[0] < x+1) { + gFlag = true; } - }; - - template - void approx1_(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid) - { - approx1_op op; - bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1); - - for(dim_t w = 0; w < odims[3]; w++) { - for(dim_t z = 0; z < odims[2]; z++) { - for(dim_t y = 0; y < odims[1]; y++) { - for(dim_t x = 0; x < odims[0]; x++) { - op(out, odims, oElems, in, idims, iElems, pos, pdims, - ostrides, istrides, pstrides, offGrid, pBatch, x, y, z, w); - } + + const dim_t grid_x = floor(x); // nearest grid + const Tp off_x = x - grid_x; // fractional offset + + const dim_t omId = idw * ostrides[3] + idz * ostrides[2] + + idy * ostrides[1] + idx; + if(gFlag) { + out[omId] = scalar(offGrid); + } else { + dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x; + + // Check if x and x + 1 are both valid indices + bool cond = (x < idims[0] - 1); + // Compute Left and Right Weighted Values + Ty yl = ((Tp)1.0 - off_x) * in[ioff]; + Ty yr = cond ? (off_x) * in[ioff + 1] : scalar(0); + Ty yo = yl + yr; + // Compute Weight used + Tp wt = cond ? 
(Tp)1.0 : (Tp)(1.0 - off_x); + // Write final value + out[omId] = (yo / wt); + } + } +}; + +template +void approx1_(Array output, Array const input, + Array const position, float const offGrid) +{ + Ty * out = output.get(); + Ty const * const in = input.get(); + Tp const * const pos = position.get(); + dim4 const odims = output.dims(); + dim4 const idims = input.dims(); + dim4 const pdims = position.dims(); + dim4 const ostrides = output.strides(); + dim4 const istrides = input.strides(); + dim4 const pstrides = position.strides(); + dim_t const oElems = output.elements(); + dim_t const iElems = input.elements(); + + approx1_op op; + bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1); + + for(dim_t w = 0; w < odims[3]; w++) { + for(dim_t z = 0; z < odims[2]; z++) { + for(dim_t y = 0; y < odims[1]; y++) { + for(dim_t x = 0; x < odims[0]; x++) { + op(out, odims, oElems, in, idims, iElems, pos, pdims, + ostrides, istrides, pstrides, offGrid, pBatch, x, y, z, w); } } } } +} - template - Array approx1(const Array &in, const Array &pos, - const af_interp_type method, const float offGrid) - { - in.eval(); - pos.eval(); - - af::dim4 odims = in.dims(); - odims[0] = pos.dims()[0]; - - // Create output placeholder - Array out = createEmptyArray(odims); - - switch(method) { - case AF_INTERP_NEAREST: - getQueue().enqueue(approx1_, - out.get(), out.dims(), out.elements(), - in.get(), in.dims(), in.elements(), pos.get(), pos.dims(), - out.strides(), in.strides(), pos.strides(), offGrid); - break; - case AF_INTERP_LINEAR: - getQueue().enqueue(approx1_, - out.get(), out.dims(), out.elements(), - in.get(), in.dims(), in.elements(), pos.get(), pos.dims(), - out.strides(), in.strides(), pos.strides(), offGrid); - break; - default: - break; - } - return out; +template +Array approx1(const Array &in, const Array &pos, + const af_interp_type method, const float offGrid) +{ + in.eval(); + pos.eval(); + + af::dim4 odims = in.dims(); + odims[0] = pos.dims()[0]; + + // Create 
output placeholder + Array out = createEmptyArray(odims); + + switch(method) { + case AF_INTERP_NEAREST: + getQueue().enqueue(approx1_, + out, in, pos, offGrid); + break; + case AF_INTERP_LINEAR: + getQueue().enqueue(approx1_, + out, in, pos, offGrid); + break; + default: + break; } + return out; +} - /////////////////////////////////////////////////////////////////////////// - // Approx2 - /////////////////////////////////////////////////////////////////////////// - template - struct approx2_op +/////////////////////////////////////////////////////////////////////////// +// Approx2 +/////////////////////////////////////////////////////////////////////////// +template +struct approx2_op +{ + void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, + const Ty *in, const af::dim4 &idims, const dim_t iElems, + const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const af::dim4 &pstrides, const af::dim4 &qstrides, + const float offGrid, const bool pBatch, + const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 &qstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - return; - } - }; + return; + } +}; - template - struct approx2_op +template +struct approx2_op +{ + void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, + const Ty *in, const af::dim4 &idims, const dim_t iElems, + const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const af::dim4 &pstrides, const af::dim4 &qstrides, + const float offGrid, 
const bool pBatch, + const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 &qstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - dim_t pmId = idy * pstrides[1] + idx; - dim_t qmId = idy * qstrides[1] + idx; - if(pBatch) { - pmId += idw * pstrides[3] + idz * pstrides[2]; - qmId += idw * qstrides[3] + idz * qstrides[2]; - } + dim_t pmId = idy * pstrides[1] + idx; + dim_t qmId = idy * qstrides[1] + idx; + if(pBatch) { + pmId += idw * pstrides[3] + idz * pstrides[2]; + qmId += idw * qstrides[3] + idz * qstrides[2]; + } - bool gFlag = false; - const Tp x = pos[pmId], y = qos[qmId]; - if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { - gFlag = true; - } + bool gFlag = false; + const Tp x = pos[pmId], y = qos[qmId]; + if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { + gFlag = true; + } - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] - + idy * ostrides[1] + idx; - if(gFlag) { - out[omId] = scalar(offGrid); - } else { - const dim_t grid_x = round(x), grid_y = round(y); // nearest grid - const dim_t imId = idw * istrides[3] + idz * istrides[2] + - grid_y * istrides[1] + grid_x; - out[omId] = in[imId]; - } + const dim_t omId = idw * ostrides[3] + idz * ostrides[2] + + idy * ostrides[1] + idx; + if(gFlag) { + out[omId] = scalar(offGrid); + } else { + const dim_t grid_x = round(x), grid_y = round(y); // nearest grid + const dim_t imId = idw * istrides[3] + idz * istrides[2] + + grid_y * istrides[1] + grid_x; + out[omId] = in[imId]; } - }; + } +}; - template - struct approx2_op +template +struct approx2_op +{ + void operator()(Ty *out, const af::dim4 &odims, const 
dim_t oElems, + const Ty *in, const af::dim4 &idims, const dim_t iElems, + const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const af::dim4 &pstrides, const af::dim4 &qstrides, + const float offGrid, const bool pBatch, + const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 &qstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - dim_t pmId = idy * pstrides[1] + idx; - dim_t qmId = idy * qstrides[1] + idx; - if(pBatch) { - pmId += idw * pstrides[3] + idz * pstrides[2]; - qmId += idw * qstrides[3] + idz * qstrides[2]; - } + dim_t pmId = idy * pstrides[1] + idx; + dim_t qmId = idy * qstrides[1] + idx; + if(pBatch) { + pmId += idw * pstrides[3] + idz * pstrides[2]; + qmId += idw * qstrides[3] + idz * qstrides[2]; + } - bool gFlag = false; - const Tp x = pos[pmId], y = qos[qmId]; - if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { - gFlag = true; - } + bool gFlag = false; + const Tp x = pos[pmId], y = qos[qmId]; + if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { + gFlag = true; + } - const dim_t grid_x = floor(x), grid_y = floor(y); // nearest grid - const Tp off_x = x - grid_x, off_y = y - grid_y; // fractional offset + const dim_t grid_x = floor(x), grid_y = floor(y); // nearest grid + const Tp off_x = x - grid_x, off_y = y - grid_y; // fractional offset - // Check if pVal and pVal + 1 are both valid indices - bool condY = (y < idims[1] - 1); - bool condX = (x < idims[0] - 1); + // Check if pVal and pVal + 1 are both valid indices + bool condY = (y < idims[1] - 1); + bool condX = (x < 
idims[0] - 1); - // Compute wieghts used - Tp wt00 = ((Tp)1.0 - off_x) * ((Tp)1.0 - off_y); - Tp wt10 = (condY) ? ((Tp)1.0 - off_x) * (off_y) : 0; - Tp wt01 = (condX) ? (off_x) * ((Tp)1.0 - off_y) : 0; - Tp wt11 = (condX && condY) ? (off_x) * (off_y) : 0; + // Compute wieghts used + Tp wt00 = ((Tp)1.0 - off_x) * ((Tp)1.0 - off_y); + Tp wt10 = (condY) ? ((Tp)1.0 - off_x) * (off_y) : 0; + Tp wt01 = (condX) ? (off_x) * ((Tp)1.0 - off_y) : 0; + Tp wt11 = (condX && condY) ? (off_x) * (off_y) : 0; - Tp wt = wt00 + wt10 + wt01 + wt11; - Ty zero = scalar(0); + Tp wt = wt00 + wt10 + wt01 + wt11; + Ty zero = scalar(0); - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] - + idy * ostrides[1] + idx; - if(gFlag) { - out[omId] = scalar(offGrid); - } else { - dim_t ioff = idw * istrides[3] + idz * istrides[2] - + grid_y * istrides[1] + grid_x; + const dim_t omId = idw * ostrides[3] + idz * ostrides[2] + + idy * ostrides[1] + idx; + if(gFlag) { + out[omId] = scalar(offGrid); + } else { + dim_t ioff = idw * istrides[3] + idz * istrides[2] + + grid_y * istrides[1] + grid_x; - // Compute Weighted Values - Ty y00 = wt00 * in[ioff]; - Ty y10 = (condY) ? wt10 * in[ioff + istrides[1]] : zero; - Ty y01 = (condX) ? wt01 * in[ioff + 1] : zero; - Ty y11 = (condX && condY) ? wt11 * in[ioff + istrides[1] + 1] : zero; + // Compute Weighted Values + Ty y00 = wt00 * in[ioff]; + Ty y10 = (condY) ? wt10 * in[ioff + istrides[1]] : zero; + Ty y01 = (condX) ? wt01 * in[ioff + 1] : zero; + Ty y11 = (condX && condY) ? 
wt11 * in[ioff + istrides[1] + 1] : zero; - Ty yo = y00 + y10 + y01 + y11; + Ty yo = y00 + y10 + y01 + y11; - // Write Final Value - out[omId] = (yo / wt); - } + // Write Final Value + out[omId] = (yo / wt); } - }; - - template - void approx2_(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 &qstrides, - const float offGrid) - { - approx2_op op; - bool pBatch = !(pdims[2] == 1 && pdims[3] == 1); - - for(dim_t w = 0; w < odims[3]; w++) { - for(dim_t z = 0; z < odims[2]; z++) { - for(dim_t y = 0; y < odims[1]; y++) { - for(dim_t x = 0; x < odims[0]; x++) { - op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims, - ostrides, istrides, pstrides, qstrides, offGrid, pBatch, x, y, z, w); - } + } +}; + +template +void approx2_(Array output, Array const input, + Array const position, Array const qosition, + float const offGrid) +{ + Ty * out = output.get(); + Ty const * const in = input.get(); + Tp const * const pos = position.get(); + Tp const * const qos = qosition.get(); + dim4 const odims = output.dims(); + dim4 const idims = input.dims(); + dim4 const pdims = position.dims(); + dim4 const qdims = qosition.dims(); + dim4 const ostrides = output.strides(); + dim4 const istrides = input.strides(); + dim4 const pstrides = position.strides(); + dim4 const qstrides = qosition.strides(); + dim_t const oElems = output.elements(); + dim_t const iElems = input.elements(); + + approx2_op op; + bool pBatch = !(pdims[2] == 1 && pdims[3] == 1); + + for(dim_t w = 0; w < odims[3]; w++) { + for(dim_t z = 0; z < odims[2]; z++) { + for(dim_t y = 0; y < odims[1]; y++) { + for(dim_t x = 0; x < odims[0]; x++) { + op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims, + ostrides, istrides, pstrides, qstrides, offGrid, pBatch, x, y, z, 
w); } } } } +} - template - Array approx2(const Array &in, const Array &pos0, const Array &pos1, - const af_interp_type method, const float offGrid) - { - in.eval(); - pos0.eval(); - pos1.eval(); - - af::dim4 odims = in.dims(); - odims[0] = pos0.dims()[0]; - odims[1] = pos0.dims()[1]; - - // Create output placeholder - Array out = createEmptyArray(odims); - - switch(method) { - case AF_INTERP_NEAREST: - getQueue().enqueue(approx2_, - out.get(), out.dims(), out.elements(), - in.get(), in.dims(), in.elements(), - pos0.get(), pos0.dims(), pos1.get(), pos1.dims(), - out.strides(), in.strides(), pos0.strides(), pos1.strides(), - offGrid); - break; - case AF_INTERP_LINEAR: - getQueue().enqueue(approx2_, - out.get(), out.dims(), out.elements(), - in.get(), in.dims(), in.elements(), - pos0.get(), pos0.dims(), pos1.get(), pos1.dims(), - out.strides(), in.strides(), pos0.strides(), pos1.strides(), - offGrid); - break; - default: - break; - } - return out; +template +Array approx2(const Array &in, const Array &pos0, const Array &pos1, + const af_interp_type method, const float offGrid) +{ + in.eval(); + pos0.eval(); + pos1.eval(); + + af::dim4 odims = in.dims(); + odims[0] = pos0.dims()[0]; + odims[1] = pos0.dims()[1]; + + Array out = createEmptyArray(odims); + + switch(method) { + case AF_INTERP_NEAREST: + getQueue().enqueue(approx2_, + out, in, pos0, pos1, offGrid); + break; + case AF_INTERP_LINEAR: + getQueue().enqueue(approx2_, + out, in, pos0, pos1, offGrid); + break; + default: + break; } + return out; +} #define INSTANTIATE(Ty, Tp) \ template Array approx1(const Array &in, const Array &pos, \ @@ -349,8 +358,9 @@ namespace cpu const Array &pos1, const af_interp_type method, \ const float offGrid); \ - INSTANTIATE(float , float ) - INSTANTIATE(double , double) - INSTANTIATE(cfloat , float ) - INSTANTIATE(cdouble, double) +INSTANTIATE(float , float ) +INSTANTIATE(double , double) +INSTANTIATE(cfloat , float ) +INSTANTIATE(cdouble, double) + } diff --git 
a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/bilateral.cpp index 10856f7166..ea38ea7dd7 100644 --- a/src/backend/cpu/bilateral.cpp +++ b/src/backend/cpu/bilateral.cpp @@ -99,6 +99,7 @@ void bilateral_(Array out, const Array in, float s_sigma, float template Array bilateral(const Array &in, const float &s_sigma, const float &c_sigma) { + in.eval(); const dim4 dims = in.dims(); Array out = createEmptyArray(dims); getQueue().enqueue(bilateral_, out, in, s_sigma, c_sigma); diff --git a/src/backend/cpu/cholesky.cpp b/src/backend/cpu/cholesky.cpp index d0bd3c8787..ce11867186 100644 --- a/src/backend/cpu/cholesky.cpp +++ b/src/backend/cpu/cholesky.cpp @@ -47,6 +47,8 @@ CH_FUNC(potrf , cdouble, z) template Array cholesky(int *info, const Array &in, const bool is_upper) { + in.eval(); + Array out = copyArray(in); *info = cholesky_inplace(out, is_upper); @@ -59,6 +61,8 @@ Array cholesky(int *info, const Array &in, const bool is_upper) template int cholesky_inplace(Array &in, const bool is_upper) { + in.eval(); + dim4 iDims = in.dims(); int N = iDims[0]; diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index 52403605ca..eef5e0e302 100644 --- a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp @@ -23,144 +23,144 @@ namespace cpu { - template - static void stridedCopy(T* dst, const dim4& ostrides, const T* src, const dim4 &dims, const dim4 &strides, unsigned dim) - { - if(dim == 0) { - if(strides[dim] == 1) { - //FIXME: Check for errors / exceptions - memcpy(dst, src, dims[dim] * sizeof(T)); - } else { - for(dim_t i = 0; i < dims[dim]; i++) { - dst[i] = src[strides[dim]*i]; - } - } + +template +static void stridedCopy(T* dst, const dim4& ostrides, const T* src, const dim4 &dims, const dim4 &strides, unsigned dim) +{ + if(dim == 0) { + if(strides[dim] == 1) { + //FIXME: Check for errors / exceptions + memcpy(dst, src, dims[dim] * sizeof(T)); } else { - for(dim_t i = dims[dim]; i > 0; i--) { - stridedCopy(dst, ostrides, src, dims, strides, dim - 1); - 
src += strides[dim]; - dst += ostrides[dim]; + for(dim_t i = 0; i < dims[dim]; i++) { + dst[i] = src[strides[dim]*i]; } } - } - - // Assigns to single elements - template - void copyData(T *to, const Array &from) - { - from.eval(); - getQueue().sync(); - if(from.isOwner()) { - // FIXME: Check for errors / exceptions - memcpy(to, from.get(), from.elements()*sizeof(T)); - } else { - dim4 ostrides = calcStrides(from.dims()); - stridedCopy(to, ostrides, from.get(), from.dims(), from.strides(), from.ndims() - 1); + } else { + for(dim_t i = dims[dim]; i > 0; i--) { + stridedCopy(dst, ostrides, src, dims, strides, dim - 1); + src += strides[dim]; + dst += ostrides[dim]; } } +} - template - Array copyArray(const Array &A) - { - Array out = createEmptyArray(A.dims()); - copyData(out.get(), A); - return out; +// Assigns to single elements +template +void copyData(T *to, const Array &from) +{ + from.eval(); + getQueue().sync(); + if(from.isOwner()) { + // FIXME: Check for errors / exceptions + memcpy(to, from.get(), from.elements()*sizeof(T)); + } else { + dim4 ostrides = calcStrides(from.dims()); + stridedCopy(to, ostrides, from.get(), from.dims(), from.strides(), from.ndims() - 1); } +} - template - static void copy(Array dst, const Array src, outType default_value, double factor) - { - dim4 src_dims = src.dims(); - dim4 dst_dims = dst.dims(); - dim4 src_strides = src.strides(); - dim4 dst_strides = dst.strides(); +template +Array copyArray(const Array &A) +{ + Array out = createEmptyArray(A.dims()); + copyData(out.get(), A); + return out; +} - const inType * src_ptr = src.get(); - outType * dst_ptr = dst.get(); +template +static void copy(Array dst, const Array src, outType default_value, double factor) +{ + dim4 src_dims = src.dims(); + dim4 dst_dims = dst.dims(); + dim4 src_strides = src.strides(); + dim4 dst_strides = dst.strides(); - dim_t trgt_l = std::min(dst_dims[3], src_dims[3]); - dim_t trgt_k = std::min(dst_dims[2], src_dims[2]); - dim_t trgt_j = 
std::min(dst_dims[1], src_dims[1]); - dim_t trgt_i = std::min(dst_dims[0], src_dims[0]); + const inType * src_ptr = src.get(); + outType * dst_ptr = dst.get(); - for(dim_t l=0; l - void multiply_inplace(Array &in, double val) - { - in.eval(); - getQueue().enqueue(copy, in, in, 0, val); - } - - template - Array padArray(Array const &in, dim4 const &dims, - outType default_value, double factor) - { - Array ret = createValueArray(dims, default_value); - ret.eval(); - in.eval(); - // FIXME: - getQueue().sync(); - getQueue().enqueue(copy, ret, in, outType(default_value), factor); - return ret; - } +template +void multiply_inplace(Array &in, double val) +{ + in.eval(); + getQueue().enqueue(copy, in, in, 0, val); +} - template - void copyArray(Array &out, Array const &in) - { - out.eval(); - in.eval(); - getQueue().enqueue(copy, out, in, scalar(0), 1.0); - } +template +Array padArray(Array const &in, dim4 const &dims, + outType default_value, double factor) +{ + Array ret = createValueArray(dims, default_value); + ret.eval(); + in.eval(); + // FIXME: + getQueue().sync(); + getQueue().enqueue(copy, ret, in, outType(default_value), factor); + return ret; +} +template +void copyArray(Array &out, Array const &in) +{ + out.eval(); + in.eval(); + getQueue().enqueue(copy, out, in, scalar(0), 1.0); +} #define INSTANTIATE(T) \ template void copyData (T *data, const Array &from); \ template Array copyArray(const Array &A); \ template void multiply_inplace (Array &in, double norm); \ - INSTANTIATE(float ) - INSTANTIATE(double ) - INSTANTIATE(cfloat ) - INSTANTIATE(cdouble) - INSTANTIATE(int ) - INSTANTIATE(uint ) - INSTANTIATE(uchar ) - INSTANTIATE(char ) - INSTANTIATE(intl ) - INSTANTIATE(uintl ) - INSTANTIATE(short ) - INSTANTIATE(ushort ) +INSTANTIATE(float ) +INSTANTIATE(double ) +INSTANTIATE(cfloat ) +INSTANTIATE(cdouble) +INSTANTIATE(int ) +INSTANTIATE(uint ) +INSTANTIATE(uchar ) +INSTANTIATE(char ) +INSTANTIATE(intl ) +INSTANTIATE(uintl ) +INSTANTIATE(short ) 
+INSTANTIATE(ushort ) #define INSTANTIATE_PAD_ARRAY(SRC_T) \ @@ -189,16 +189,16 @@ namespace cpu template void copyArray(Array &dst, Array const &src); \ template void copyArray(Array &dst, Array const &src); - INSTANTIATE_PAD_ARRAY(float ) - INSTANTIATE_PAD_ARRAY(double) - INSTANTIATE_PAD_ARRAY(int ) - INSTANTIATE_PAD_ARRAY(uint ) - INSTANTIATE_PAD_ARRAY(intl ) - INSTANTIATE_PAD_ARRAY(uintl ) - INSTANTIATE_PAD_ARRAY(uchar ) - INSTANTIATE_PAD_ARRAY(char ) - INSTANTIATE_PAD_ARRAY(ushort) - INSTANTIATE_PAD_ARRAY(short ) +INSTANTIATE_PAD_ARRAY(float ) +INSTANTIATE_PAD_ARRAY(double) +INSTANTIATE_PAD_ARRAY(int ) +INSTANTIATE_PAD_ARRAY(uint ) +INSTANTIATE_PAD_ARRAY(intl ) +INSTANTIATE_PAD_ARRAY(uintl ) +INSTANTIATE_PAD_ARRAY(uchar ) +INSTANTIATE_PAD_ARRAY(char ) +INSTANTIATE_PAD_ARRAY(ushort) +INSTANTIATE_PAD_ARRAY(short ) #define INSTANTIATE_PAD_ARRAY_COMPLEX(SRC_T) \ template Array padArray(Array const &src, dim4 const &dims, cfloat default_value, double factor); \ @@ -206,8 +206,8 @@ namespace cpu template void copyArray(Array &dst, Array const &src); \ template void copyArray(Array &dst, Array const &src); - INSTANTIATE_PAD_ARRAY_COMPLEX(cfloat ) - INSTANTIATE_PAD_ARRAY_COMPLEX(cdouble) +INSTANTIATE_PAD_ARRAY_COMPLEX(cfloat ) +INSTANTIATE_PAD_ARRAY_COMPLEX(cdouble) #define SPECILIAZE_UNUSED_COPYARRAY(SRC_T, DST_T) \ template<> void copyArray(Array &out, Array const &in) \ @@ -215,25 +215,25 @@ namespace cpu CPU_NOT_SUPPORTED();\ } - SPECILIAZE_UNUSED_COPYARRAY(cfloat , double) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , float) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , uchar) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , char) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , uint) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , int) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , intl) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , uintl) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , short) - SPECILIAZE_UNUSED_COPYARRAY(cfloat , ushort) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, double) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, 
float) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, uchar) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, char) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, uint) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, int) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, intl) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, uintl) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, short) - SPECILIAZE_UNUSED_COPYARRAY(cdouble, ushort) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , double) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , float) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , uchar) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , char) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , uint) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , int) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , intl) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , uintl) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , short) +SPECILIAZE_UNUSED_COPYARRAY(cfloat , ushort) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, double) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, float) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, uchar) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, char) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, uint) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, int) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, intl) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, uintl) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, short) +SPECILIAZE_UNUSED_COPYARRAY(cdouble, ushort) } diff --git a/src/backend/cpu/diagonal.cpp b/src/backend/cpu/diagonal.cpp index 856ed6ed44..9af78459c1 100644 --- a/src/backend/cpu/diagonal.cpp +++ b/src/backend/cpu/diagonal.cpp @@ -20,87 +20,88 @@ namespace cpu { - template - Array diagCreate(const Array &in, const int num) - { - in.eval(); - - int size = in.dims()[0] + std::abs(num); - int batch = in.dims()[1]; - Array out = createEmptyArray(dim4(size, size, batch)); - - auto func = [=] (Array out, const Array in) { - const T *iptr = in.get(); - T *optr = out.get(); - - for (int k = 0; k < batch; k++) { - for (int j = 0; j < size; j++) { - for (int i = 0; i < size; i++) { - T val = scalar(0); - if (i == j - num) { - val = (num > 0) ? 
iptr[i] : iptr[j]; - } - optr[i + j * out.strides()[1]] = val; + +template +Array diagCreate(const Array &in, const int num) +{ + in.eval(); + + int size = in.dims()[0] + std::abs(num); + int batch = in.dims()[1]; + Array out = createEmptyArray(dim4(size, size, batch)); + + auto func = [=] (Array out, const Array in) { + const T *iptr = in.get(); + T *optr = out.get(); + + for (int k = 0; k < batch; k++) { + for (int j = 0; j < size; j++) { + for (int i = 0; i < size; i++) { + T val = scalar(0); + if (i == j - num) { + val = (num > 0) ? iptr[i] : iptr[j]; } + optr[i + j * out.strides()[1]] = val; } - optr += out.strides()[2]; - iptr += in.strides()[1]; } - }; - getQueue().enqueue(func, out, in); + optr += out.strides()[2]; + iptr += in.strides()[1]; + } + }; + getQueue().enqueue(func, out, in); - return out; - } + return out; +} - template - Array diagExtract(const Array &in, const int num) - { - in.eval(); +template +Array diagExtract(const Array &in, const int num) +{ + in.eval(); - const dim4 idims = in.dims(); - dim_t size = std::max(idims[0], idims[1]) - std::abs(num); - Array out = createEmptyArray(dim4(size, 1, idims[2], idims[3])); + const dim4 idims = in.dims(); + dim_t size = std::max(idims[0], idims[1]) - std::abs(num); + Array out = createEmptyArray(dim4(size, 1, idims[2], idims[3])); - auto func = [=] (Array out, const Array in) { - const dim4 odims = out.dims(); + auto func = [=] (Array out, const Array in) { + const dim4 odims = out.dims(); - const int i_off = (num > 0) ? (num * in.strides()[1]) : (-num); + const int i_off = (num > 0) ? 
(num * in.strides()[1]) : (-num); - for (int l = 0; l < (int)odims[3]; l++) { + for (int l = 0; l < (int)odims[3]; l++) { - for (int k = 0; k < (int)odims[2]; k++) { - const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off; - T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2]; + for (int k = 0; k < (int)odims[2]; k++) { + const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off; + T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2]; - for (int i = 0; i < (int)odims[0]; i++) { - T val = scalar(0); - if (i < idims[0] && i < idims[1]) val = iptr[i * in.strides()[1] + i]; - optr[i] = val; - } + for (int i = 0; i < (int)odims[0]; i++) { + T val = scalar(0); + if (i < idims[0] && i < idims[1]) val = iptr[i * in.strides()[1] + i]; + optr[i] = val; } } - }; + } + }; - getQueue().enqueue(func, out, in); + getQueue().enqueue(func, out, in); - return out; - } + return out; +} #define INSTANTIATE_DIAGONAL(T) \ template Array diagExtract (const Array &in, const int num); \ template Array diagCreate (const Array &in, const int num); - INSTANTIATE_DIAGONAL(float) - INSTANTIATE_DIAGONAL(double) - INSTANTIATE_DIAGONAL(cfloat) - INSTANTIATE_DIAGONAL(cdouble) - INSTANTIATE_DIAGONAL(int) - INSTANTIATE_DIAGONAL(uint) - INSTANTIATE_DIAGONAL(intl) - INSTANTIATE_DIAGONAL(uintl) - INSTANTIATE_DIAGONAL(char) - INSTANTIATE_DIAGONAL(uchar) - INSTANTIATE_DIAGONAL(short) - INSTANTIATE_DIAGONAL(ushort) +INSTANTIATE_DIAGONAL(float) +INSTANTIATE_DIAGONAL(double) +INSTANTIATE_DIAGONAL(cfloat) +INSTANTIATE_DIAGONAL(cdouble) +INSTANTIATE_DIAGONAL(int) +INSTANTIATE_DIAGONAL(uint) +INSTANTIATE_DIAGONAL(intl) +INSTANTIATE_DIAGONAL(uintl) +INSTANTIATE_DIAGONAL(char) +INSTANTIATE_DIAGONAL(uchar) +INSTANTIATE_DIAGONAL(short) +INSTANTIATE_DIAGONAL(ushort) } diff --git a/src/backend/cpu/diff.cpp b/src/backend/cpu/diff.cpp index 321dae7b85..8f9c0f13be 100644 --- a/src/backend/cpu/diff.cpp +++ b/src/backend/cpu/diff.cpp @@ -16,119 +16,122 
@@ namespace cpu { - unsigned getIdx(af::dim4 strides, af::dim4 offs, int i, int j = 0, int k = 0, int l = 0) - { - return (l * strides[3] + - k * strides[2] + - j * strides[1] + - i); - } - - template - Array diff1(const Array &in, const int dim) - { - // Bool for dimension - bool is_dim0 = dim == 0; - bool is_dim1 = dim == 1; - bool is_dim2 = dim == 2; - bool is_dim3 = dim == 3; - - // Decrement dimension of select dimension - af::dim4 dims = in.dims(); - dims[dim]--; - - // Create output placeholder - Array outArray = createEmptyArray(dims); - - auto func = [=] (Array outArray, Array in) { - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = outArray.get(); - - // TODO: Improve this - for(dim_t l = 0; l < dims[3]; l++) { - for(dim_t k = 0; k < dims[2]; k++) { - for(dim_t j = 0; j < dims[1]; j++) { - for(dim_t i = 0; i < dims[0]; i++) { - // Operation: out[index] = in[index + 1 * dim_size] - in[index] - int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); - int jdx = getIdx(in.strides(), in.offsets(), - i + is_dim0, j + is_dim1, - k + is_dim2, l + is_dim3); - int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l); - outPtr[odx] = inPtr[jdx] - inPtr[idx]; - } + +unsigned getIdx(af::dim4 strides, af::dim4 offs, int i, int j = 0, int k = 0, int l = 0) +{ + return (l * strides[3] + + k * strides[2] + + j * strides[1] + + i); +} + +template +Array diff1(const Array &in, const int dim) +{ + in.eval(); + // Bool for dimension + bool is_dim0 = dim == 0; + bool is_dim1 = dim == 1; + bool is_dim2 = dim == 2; + bool is_dim3 = dim == 3; + + // Decrement dimension of select dimension + af::dim4 dims = in.dims(); + dims[dim]--; + + // Create output placeholder + Array outArray = createEmptyArray(dims); + + auto func = [=] (Array outArray, Array in) { + // Get pointers to raw data + const T *inPtr = in.get(); + T *outPtr = outArray.get(); + + // TODO: Improve this + for(dim_t l = 0; l < dims[3]; l++) { + for(dim_t k = 0; k < dims[2]; k++) 
{ + for(dim_t j = 0; j < dims[1]; j++) { + for(dim_t i = 0; i < dims[0]; i++) { + // Operation: out[index] = in[index + 1 * dim_size] - in[index] + int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); + int jdx = getIdx(in.strides(), in.offsets(), + i + is_dim0, j + is_dim1, + k + is_dim2, l + is_dim3); + int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l); + outPtr[odx] = inPtr[jdx] - inPtr[idx]; } } } - }; - getQueue().enqueue(func, outArray, in); - - return outArray; - } - - template - Array diff2(const Array &in, const int dim) - { - // Bool for dimension - bool is_dim0 = dim == 0; - bool is_dim1 = dim == 1; - bool is_dim2 = dim == 2; - bool is_dim3 = dim == 3; - - // Decrement dimension of select dimension - af::dim4 dims = in.dims(); - dims[dim] -= 2; - - // Create output placeholder - Array outArray = createEmptyArray(dims); - - auto func = [=] (Array outArray, Array in) { - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = outArray.get(); - - // TODO: Improve this - for(dim_t l = 0; l < dims[3]; l++) { - for(dim_t k = 0; k < dims[2]; k++) { - for(dim_t j = 0; j < dims[1]; j++) { - for(dim_t i = 0; i < dims[0]; i++) { - // Operation: out[index] = in[index + 1 * dim_size] - in[index] - int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); - int jdx = getIdx(in.strides(), in.offsets(), - i + is_dim0, j + is_dim1, - k + is_dim2, l + is_dim3); - int kdx = getIdx(in.strides(), in.offsets(), - i + 2 * is_dim0, j + 2 * is_dim1, - k + 2 * is_dim2, l + 2 * is_dim3); - int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l); - outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx]; - } + } + }; + getQueue().enqueue(func, outArray, in); + + return outArray; +} + +template +Array diff2(const Array &in, const int dim) +{ + in.eval(); + // Bool for dimension + bool is_dim0 = dim == 0; + bool is_dim1 = dim == 1; + bool is_dim2 = dim == 2; + bool is_dim3 = dim == 3; + + // Decrement dimension of select 
dimension + af::dim4 dims = in.dims(); + dims[dim] -= 2; + + // Create output placeholder + Array outArray = createEmptyArray(dims); + + auto func = [=] (Array outArray, Array in) { + // Get pointers to raw data + const T *inPtr = in.get(); + T *outPtr = outArray.get(); + + // TODO: Improve this + for(dim_t l = 0; l < dims[3]; l++) { + for(dim_t k = 0; k < dims[2]; k++) { + for(dim_t j = 0; j < dims[1]; j++) { + for(dim_t i = 0; i < dims[0]; i++) { + // Operation: out[index] = in[index + 1 * dim_size] - in[index] + int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); + int jdx = getIdx(in.strides(), in.offsets(), + i + is_dim0, j + is_dim1, + k + is_dim2, l + is_dim3); + int kdx = getIdx(in.strides(), in.offsets(), + i + 2 * is_dim0, j + 2 * is_dim1, + k + 2 * is_dim2, l + 2 * is_dim3); + int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l); + outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx]; } } } - }; + } + }; - getQueue().enqueue(func, outArray, in); + getQueue().enqueue(func, outArray, in); - return outArray; - } + return outArray; +} #define INSTANTIATE(T) \ template Array diff1 (const Array &in, const int dim); \ template Array diff2 (const Array &in, const int dim); \ +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(ushort) +INSTANTIATE(short) - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(ushort) - INSTANTIATE(short) } diff --git a/src/backend/cpu/exampleFunction.cpp b/src/backend/cpu/exampleFunction.cpp index a9e7bca9eb..d45b8a28ec 100644 --- a/src/backend/cpu/exampleFunction.cpp +++ b/src/backend/cpu/exampleFunction.cpp @@ -24,6 +24,13 @@ namespace cpu template Array 
exampleFunction(const Array &in, const af_someenum_t method) { + in.eval(); // All input Arrays should call eval mandatorily + // in CPU backend function implementations. Since + // the cpu fns are asynchronous launches, any Arrays + // that are either views/JIT nodes needs to evaluated + // before they are passed onto functions that are + // enqueued onto the queues. + dim4 outputDims; // this should be '= in.dims();' in most cases // but would definitely depend on the type of // algorithm you are implementing. diff --git a/src/backend/cpu/fft.cpp b/src/backend/cpu/fft.cpp index 7262e6dd78..e522954cfe 100644 --- a/src/backend/cpu/fft.cpp +++ b/src/backend/cpu/fft.cpp @@ -95,6 +95,7 @@ void fft_inplace_(Array in) template void fft_inplace(Array &in) { + in.eval(); getQueue().enqueue(fft_inplace_, in); } @@ -165,6 +166,8 @@ void fft_r2c_(Array out, const Array in) template Array fft_r2c(const Array &in) { + in.eval(); + dim4 odims = in.dims(); odims[0] = odims[0] / 2 + 1; Array out = createEmptyArray(odims); @@ -216,6 +219,8 @@ void fft_c2r_(Array out, const Array in, const dim4 odims) template Array fft_c2r(const Array &in, const dim4 &odims) { + in.eval(); + Array out = createEmptyArray(odims); getQueue().enqueue(fft_c2r_, out, in, odims); @@ -243,4 +248,5 @@ Array fft_c2r(const Array &in, const dim4 &odims) INSTANTIATE_REAL(float , cfloat ) INSTANTIATE_REAL(double, cdouble) + } diff --git a/src/backend/cpu/gradient.cpp b/src/backend/cpu/gradient.cpp index 504c02a29c..06c15cff4e 100644 --- a/src/backend/cpu/gradient.cpp +++ b/src/backend/cpu/gradient.cpp @@ -101,4 +101,5 @@ INSTANTIATE(float) INSTANTIATE(double) INSTANTIATE(cfloat) INSTANTIATE(cdouble) + } diff --git a/src/backend/cpu/identity.cpp b/src/backend/cpu/identity.cpp index f7236bd68d..55c441755c 100644 --- a/src/backend/cpu/identity.cpp +++ b/src/backend/cpu/identity.cpp @@ -18,6 +18,7 @@ namespace cpu { + template Array identity(const dim4& dims) { @@ -56,4 +57,5 @@ INSTANTIATE_IDENTITY(char) 
INSTANTIATE_IDENTITY(uchar) INSTANTIATE_IDENTITY(short) INSTANTIATE_IDENTITY(ushort) + } diff --git a/src/backend/cpu/iota.cpp b/src/backend/cpu/iota.cpp index 170b6a1570..dcb85fa787 100644 --- a/src/backend/cpu/iota.cpp +++ b/src/backend/cpu/iota.cpp @@ -21,6 +21,7 @@ using namespace std; namespace cpu { + /////////////////////////////////////////////////////////////////////////// // Kernel Functions /////////////////////////////////////////////////////////////////////////// @@ -76,4 +77,5 @@ INSTANTIATE(uintl) INSTANTIATE(uchar) INSTANTIATE(short) INSTANTIATE(ushort) + } diff --git a/src/backend/cpu/ireduce.cpp b/src/backend/cpu/ireduce.cpp index e562bae068..9858cba665 100644 --- a/src/backend/cpu/ireduce.cpp +++ b/src/backend/cpu/ireduce.cpp @@ -21,178 +21,180 @@ using af::dim4; namespace cpu { - template double cabs(const T in) { return (double)in; } - static double cabs(const char in) { return (double)(in > 0); } - static double cabs(const cfloat &in) { return (double)abs(in); } - static double cabs(const cdouble &in) { return (double)abs(in); } - template - struct MinMaxOp +template double cabs(const T in) { return (double)in; } +static double cabs(const char in) { return (double)(in > 0); } +static double cabs(const cfloat &in) { return (double)abs(in); } +static double cabs(const cdouble &in) { return (double)abs(in); } + +template +struct MinMaxOp +{ + T m_val; + uint m_idx; + MinMaxOp(T val, uint idx) : + m_val(val), m_idx(idx) { - T m_val; - uint m_idx; - MinMaxOp(T val, uint idx) : - m_val(val), m_idx(idx) - { - } + } - void operator()(T val, uint idx) - { - if (cabs(val) < cabs(m_val) || - (cabs(val) == cabs(m_val) && - idx > m_idx)) { - m_val = val; - m_idx = idx; - } + void operator()(T val, uint idx) + { + if (cabs(val) < cabs(m_val) || + (cabs(val) == cabs(m_val) && + idx > m_idx)) { + m_val = val; + m_idx = idx; } - }; + } +}; - template - struct MinMaxOp +template +struct MinMaxOp +{ + T m_val; + uint m_idx; + MinMaxOp(T val, uint idx) : + 
m_val(val), m_idx(idx) { - T m_val; - uint m_idx; - MinMaxOp(T val, uint idx) : - m_val(val), m_idx(idx) - { - } + } - void operator()(T val, uint idx) - { - if (cabs(val) > cabs(m_val) || - (cabs(val) == cabs(m_val) && - idx <= m_idx)) { - m_val = val; - m_idx = idx; - } + void operator()(T val, uint idx) + { + if (cabs(val) > cabs(m_val) || + (cabs(val) == cabs(m_val) && + idx <= m_idx)) { + m_val = val; + m_idx = idx; } - }; + } +}; - template - struct ireduce_dim +template +struct ireduce_dim +{ + void operator()(Array output, Array locArray, const dim_t outOffset, + const Array input, const dim_t inOffset, const int dim) { - void operator()(Array output, Array locArray, const dim_t outOffset, - const Array input, const dim_t inOffset, const int dim) - { - const dim4 odims = output.dims(); - const dim4 ostrides = output.strides(); - const dim4 istrides = input.strides(); - const int D1 = D - 1; - for (dim_t i = 0; i < odims[D1]; i++) { - ireduce_dim()(output, locArray, outOffset + i * ostrides[D1], - input, inOffset + i * istrides[D1], dim); - } + const dim4 odims = output.dims(); + const dim4 ostrides = output.strides(); + const dim4 istrides = input.strides(); + const int D1 = D - 1; + for (dim_t i = 0; i < odims[D1]; i++) { + ireduce_dim()(output, locArray, outOffset + i * ostrides[D1], + input, inOffset + i * istrides[D1], dim); } - }; + } +}; - template - struct ireduce_dim +template +struct ireduce_dim +{ + void operator()(Array output, Array locArray, const dim_t outOffset, + const Array input, const dim_t inOffset, const int dim) { - void operator()(Array output, Array locArray, const dim_t outOffset, - const Array input, const dim_t inOffset, const int dim) - { - const dim4 idims = input.dims(); - const dim4 istrides = input.strides(); - - T const * const in = input.get(); - T * out = output.get(); - uint * loc = locArray.get(); - - dim_t stride = istrides[dim]; - MinMaxOp Op(in[0], 0); - for (dim_t i = 0; i < idims[dim]; i++) { - Op(in[inOffset + i * 
stride], i); - } + const dim4 idims = input.dims(); + const dim4 istrides = input.strides(); - *(out+outOffset) = Op.m_val; - *(loc+outOffset) = Op.m_idx; - } - }; + T const * const in = input.get(); + T * out = output.get(); + uint * loc = locArray.get(); - template - using ireduce_dim_func = std::function, Array, const dim_t, - const Array, const dim_t, const int)>; + dim_t stride = istrides[dim]; + MinMaxOp Op(in[0], 0); + for (dim_t i = 0; i < idims[dim]; i++) { + Op(in[inOffset + i * stride], i); + } - template - void ireduce(Array &out, Array &loc, const Array &in, const int dim) - { - out.eval(); - loc.eval(); - in.eval(); - - dim4 odims = in.dims(); - odims[dim] = 1; - static const ireduce_dim_func ireduce_funcs[] = { ireduce_dim() - , ireduce_dim() - , ireduce_dim() - , ireduce_dim()}; - - getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim); + *(out+outOffset) = Op.m_val; + *(loc+outOffset) = Op.m_idx; } +}; - template - T ireduce_all(unsigned *loc, const Array &in) - { - in.eval(); - getQueue().sync(); +template +using ireduce_dim_func = std::function, Array, const dim_t, + const Array, const dim_t, const int)>; - af::dim4 dims = in.dims(); - af::dim4 strides = in.strides(); - const T *inPtr = in.get(); +template +void ireduce(Array &out, Array &loc, const Array &in, const int dim) +{ + out.eval(); + loc.eval(); + in.eval(); + + dim4 odims = in.dims(); + odims[dim] = 1; + static const ireduce_dim_func ireduce_funcs[] = { ireduce_dim() + , ireduce_dim() + , ireduce_dim() + , ireduce_dim()}; + + getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim); +} - MinMaxOp Op(inPtr[0], 0); +template +T ireduce_all(unsigned *loc, const Array &in) +{ + in.eval(); + getQueue().sync(); + + af::dim4 dims = in.dims(); + af::dim4 strides = in.strides(); + const T *inPtr = in.get(); - for(dim_t l = 0; l < dims[3]; l++) { - dim_t off3 = l * strides[3]; + MinMaxOp Op(inPtr[0], 0); - for(dim_t k = 0; k < dims[2]; k++) { - dim_t off2 = 
k * strides[2]; + for(dim_t l = 0; l < dims[3]; l++) { + dim_t off3 = l * strides[3]; - for(dim_t j = 0; j < dims[1]; j++) { - dim_t off1 = j * strides[1]; + for(dim_t k = 0; k < dims[2]; k++) { + dim_t off2 = k * strides[2]; - for(dim_t i = 0; i < dims[0]; i++) { - dim_t idx = i + off1 + off2 + off3; - Op(inPtr[idx], idx); - } + for(dim_t j = 0; j < dims[1]; j++) { + dim_t off1 = j * strides[1]; + + for(dim_t i = 0; i < dims[0]; i++) { + dim_t idx = i + off1 + off2 + off3; + Op(inPtr[idx], idx); } } } - - *loc = Op.m_idx; - return Op.m_val; } + *loc = Op.m_idx; + return Op.m_val; +} + #define INSTANTIATE(ROp, T) \ template void ireduce(Array &out, Array &loc, \ const Array &in, const int dim); \ template T ireduce_all(unsigned *loc, const Array &in); \ - //min - INSTANTIATE(af_min_t, float ) - INSTANTIATE(af_min_t, double ) - INSTANTIATE(af_min_t, cfloat ) - INSTANTIATE(af_min_t, cdouble) - INSTANTIATE(af_min_t, int ) - INSTANTIATE(af_min_t, uint ) - INSTANTIATE(af_min_t, intl ) - INSTANTIATE(af_min_t, uintl ) - INSTANTIATE(af_min_t, char ) - INSTANTIATE(af_min_t, uchar ) - INSTANTIATE(af_min_t, short ) - INSTANTIATE(af_min_t, ushort ) - - //max - INSTANTIATE(af_max_t, float ) - INSTANTIATE(af_max_t, double ) - INSTANTIATE(af_max_t, cfloat ) - INSTANTIATE(af_max_t, cdouble) - INSTANTIATE(af_max_t, int ) - INSTANTIATE(af_max_t, uint ) - INSTANTIATE(af_max_t, intl ) - INSTANTIATE(af_max_t, uintl ) - INSTANTIATE(af_max_t, char ) - INSTANTIATE(af_max_t, uchar ) - INSTANTIATE(af_max_t, short ) - INSTANTIATE(af_max_t, ushort ) +//min +INSTANTIATE(af_min_t, float ) +INSTANTIATE(af_min_t, double ) +INSTANTIATE(af_min_t, cfloat ) +INSTANTIATE(af_min_t, cdouble) +INSTANTIATE(af_min_t, int ) +INSTANTIATE(af_min_t, uint ) +INSTANTIATE(af_min_t, intl ) +INSTANTIATE(af_min_t, uintl ) +INSTANTIATE(af_min_t, char ) +INSTANTIATE(af_min_t, uchar ) +INSTANTIATE(af_min_t, short ) +INSTANTIATE(af_min_t, ushort ) + +//max +INSTANTIATE(af_max_t, float ) +INSTANTIATE(af_max_t, double ) 
+INSTANTIATE(af_max_t, cfloat ) +INSTANTIATE(af_max_t, cdouble) +INSTANTIATE(af_max_t, int ) +INSTANTIATE(af_max_t, uint ) +INSTANTIATE(af_max_t, intl ) +INSTANTIATE(af_max_t, uintl ) +INSTANTIATE(af_max_t, char ) +INSTANTIATE(af_max_t, uchar ) +INSTANTIATE(af_max_t, short ) +INSTANTIATE(af_max_t, ushort ) + } diff --git a/src/backend/cpu/lu.cpp b/src/backend/cpu/lu.cpp index ff0be438ee..9a046139d4 100644 --- a/src/backend/cpu/lu.cpp +++ b/src/backend/cpu/lu.cpp @@ -104,6 +104,8 @@ void convertPivot(Array p, Array pivot) template void lu(Array &lower, Array &upper, Array &pivot, const Array &in) { + in.eval(); + dim4 iDims = in.dims(); int M = iDims[0]; int N = iDims[1]; @@ -123,6 +125,8 @@ void lu(Array &lower, Array &upper, Array &pivot, const Array &in) template Array lu_inplace(Array &in, const bool convert_pivot) { + in.eval(); + dim4 iDims = in.dims(); Array pivot = createEmptyArray(af::dim4(min(iDims[0], iDims[1]), 1, 1, 1)); @@ -166,6 +170,7 @@ Array lu_inplace(Array &in, const bool convert_pivot) namespace cpu { + #define INSTANTIATE_LU(T) \ template Array lu_inplace(Array &in, const bool convert_pivot); \ template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); @@ -174,4 +179,5 @@ INSTANTIATE_LU(float) INSTANTIATE_LU(cfloat) INSTANTIATE_LU(double) INSTANTIATE_LU(cdouble) + } diff --git a/src/backend/cpu/match_template.cpp b/src/backend/cpu/match_template.cpp index 02a4888864..d4ce95a691 100644 --- a/src/backend/cpu/match_template.cpp +++ b/src/backend/cpu/match_template.cpp @@ -24,6 +24,9 @@ namespace cpu template Array match_template(const Array &sImg, const Array &tImg) { + sImg.eval(); + tImg.eval(); + Array out = createEmptyArray(sImg.dims()); auto func = [=](Array out, const Array sImg, const Array tImg) { diff --git a/src/backend/cpu/math.cpp b/src/backend/cpu/math.cpp index 5a6bcbc67e..e00fd78fcd 100644 --- a/src/backend/cpu/math.cpp +++ b/src/backend/cpu/math.cpp @@ -11,39 +11,41 @@ namespace cpu { - uint abs(uint val) { return 
val; } - uchar abs(uchar val) { return val; } - uintl abs(uintl val) { return val; } - - cfloat scalar(float val) - { - cfloat cval = {(float)val, 0}; - return cval; - } - - cdouble scalar(double val) - { - cdouble cval = {val, 0}; - return cval; - } - - cfloat min(cfloat lhs, cfloat rhs) - { - return abs(lhs) < abs(rhs) ? lhs : rhs; - } - - cdouble min(cdouble lhs, cdouble rhs) - { - return abs(lhs) < abs(rhs) ? lhs : rhs; - } - - cfloat max(cfloat lhs, cfloat rhs) - { - return abs(lhs) > abs(rhs) ? lhs : rhs; - } - - cdouble max(cdouble lhs, cdouble rhs) - { - return abs(lhs) > abs(rhs) ? lhs : rhs; - } + +uint abs(uint val) { return val; } +uchar abs(uchar val) { return val; } +uintl abs(uintl val) { return val; } + +cfloat scalar(float val) +{ + cfloat cval = {(float)val, 0}; + return cval; +} + +cdouble scalar(double val) +{ + cdouble cval = {val, 0}; + return cval; +} + +cfloat min(cfloat lhs, cfloat rhs) +{ + return abs(lhs) < abs(rhs) ? lhs : rhs; +} + +cdouble min(cdouble lhs, cdouble rhs) +{ + return abs(lhs) < abs(rhs) ? lhs : rhs; +} + +cfloat max(cfloat lhs, cfloat rhs) +{ + return abs(lhs) > abs(rhs) ? lhs : rhs; +} + +cdouble max(cdouble lhs, cdouble rhs) +{ + return abs(lhs) > abs(rhs) ? 
lhs : rhs; +} + } diff --git a/src/backend/cpu/meanshift.cpp b/src/backend/cpu/meanshift.cpp index 3f99d15b0c..62b80e010e 100644 --- a/src/backend/cpu/meanshift.cpp +++ b/src/backend/cpu/meanshift.cpp @@ -33,6 +33,8 @@ inline dim_t clamp(dim_t a, dim_t mn, dim_t mx) template Array meanshift(const Array &in, const float &s_sigma, const float &c_sigma, const unsigned iter) { + in.eval(); + Array out = createEmptyArray(in.dims()); auto func = [=] (Array out, const Array in, const float s_sigma, diff --git a/src/backend/cpu/medfilt.cpp b/src/backend/cpu/medfilt.cpp index ce921fc3b5..4e74a55fd2 100644 --- a/src/backend/cpu/medfilt.cpp +++ b/src/backend/cpu/medfilt.cpp @@ -25,6 +25,8 @@ namespace cpu template Array medfilt(const Array &in, dim_t w_len, dim_t w_wid) { + in.eval(); + Array out = createEmptyArray(in.dims()); auto func = [=] (Array out, const Array in, diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 73120b9171..e11f994eef 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -20,211 +20,211 @@ namespace cpu { - static size_t memory_resolution = 1024; //1KB +static size_t memory_resolution = 1024; //1KB - void setMemStepSize(size_t step_bytes) - { - memory_resolution = step_bytes; - } +void setMemStepSize(size_t step_bytes) +{ + memory_resolution = step_bytes; +} + +size_t getMemStepSize(void) +{ + return memory_resolution; +} - size_t getMemStepSize(void) +class Manager +{ + public: + static bool initialized; + Manager() { - return memory_resolution; + initialized = true; } - class Manager + ~Manager() { - public: - static bool initialized; - Manager() - { - initialized = true; - } - - ~Manager() - { - garbageCollect(); - } - }; + garbageCollect(); + } +}; - bool Manager::initialized = false; +bool Manager::initialized = false; - static void managerInit() - { - if(Manager::initialized == false) - static Manager pm = Manager(); - } +static void managerInit() +{ + if(Manager::initialized == false) + static 
Manager pm = Manager(); +} - typedef struct - { - bool is_free; - bool is_unlinked; - size_t bytes; - } mem_info; - - static size_t used_bytes = 0; - static size_t used_buffers = 0; - static size_t total_bytes = 0; - typedef std::map mem_t; - typedef mem_t::iterator mem_iter; - - mem_t memory_map; - std::mutex memory_map_mutex; - - template - void freeWrapper(T *ptr) - { - free((void *)ptr); - } +typedef struct +{ + bool is_free; + bool is_unlinked; + size_t bytes; +} mem_info; + +static size_t used_bytes = 0; +static size_t used_buffers = 0; +static size_t total_bytes = 0; +typedef std::map mem_t; +typedef mem_t::iterator mem_iter; + +mem_t memory_map; +std::mutex memory_map_mutex; + +template +void freeWrapper(T *ptr) +{ + free((void *)ptr); +} - void garbageCollect() - { - for(mem_iter iter = memory_map.begin(); - iter != memory_map.end(); ++iter) { +void garbageCollect() +{ + for(mem_iter iter = memory_map.begin(); + iter != memory_map.end(); ++iter) { - if ((iter->second).is_free) { + if ((iter->second).is_free) { - if (!(iter->second).is_unlinked) { - freeWrapper(iter->first); - total_bytes -= iter->second.bytes; - } + if (!(iter->second).is_unlinked) { + freeWrapper(iter->first); + total_bytes -= iter->second.bytes; } } + } - mem_iter memory_curr = memory_map.begin(); - mem_iter memory_end = memory_map.end(); + mem_iter memory_curr = memory_map.begin(); + mem_iter memory_end = memory_map.end(); - while(memory_curr != memory_end) { - if (memory_curr->second.is_free && !memory_curr->second.is_unlinked) { - memory_map.erase(memory_curr++); - } else { - ++memory_curr; - } + while(memory_curr != memory_end) { + if (memory_curr->second.is_free && !memory_curr->second.is_unlinked) { + memory_map.erase(memory_curr++); + } else { + ++memory_curr; } } +} - template - T* memAlloc(const size_t &elements) - { - managerInit(); +template +T* memAlloc(const size_t &elements) +{ + managerInit(); - T* ptr = NULL; - size_t alloc_bytes = divup(sizeof(T) * elements, 
memory_resolution) * memory_resolution; + T* ptr = NULL; + size_t alloc_bytes = divup(sizeof(T) * elements, memory_resolution) * memory_resolution; - if (elements > 0) { - std::lock_guard lock(memory_map_mutex); + if (elements > 0) { + std::lock_guard lock(memory_map_mutex); - // FIXME: Add better checks for garbage collection - // Perhaps look at total memory available as a metric - if (memory_map.size() > MAX_BUFFERS || - used_bytes >= MAX_BYTES) { + // FIXME: Add better checks for garbage collection + // Perhaps look at total memory available as a metric + if (memory_map.size() > MAX_BUFFERS || + used_bytes >= MAX_BYTES) { - garbageCollect(); - } + garbageCollect(); + } - for(mem_iter iter = memory_map.begin(); - iter != memory_map.end(); ++iter) { + for(mem_iter iter = memory_map.begin(); + iter != memory_map.end(); ++iter) { - mem_info info = iter->second; + mem_info info = iter->second; - if ( info.is_free && - !info.is_unlinked && - info.bytes == alloc_bytes) { + if ( info.is_free && + !info.is_unlinked && + info.bytes == alloc_bytes) { - iter->second.is_free = false; - used_bytes += alloc_bytes; - used_buffers++; - return (T *)iter->first; - } + iter->second.is_free = false; + used_bytes += alloc_bytes; + used_buffers++; + return (T *)iter->first; } + } - // Perform garbage collection if memory can not be allocated - ptr = (T *)malloc(alloc_bytes); + // Perform garbage collection if memory can not be allocated + ptr = (T *)malloc(alloc_bytes); - if (ptr == NULL) { - AF_ERROR("Can not allocate memory", AF_ERR_NO_MEM); - } + if (ptr == NULL) { + AF_ERROR("Can not allocate memory", AF_ERR_NO_MEM); + } - mem_info info = {false, false, alloc_bytes}; - memory_map[ptr] = info; + mem_info info = {false, false, alloc_bytes}; + memory_map[ptr] = info; - used_bytes += alloc_bytes; - used_buffers++; - total_bytes += alloc_bytes; - } - return ptr; + used_bytes += alloc_bytes; + used_buffers++; + total_bytes += alloc_bytes; } + return ptr; +} - template - void memFree(T 
*ptr) - { - std::lock_guard lock(memory_map_mutex); +template +void memFree(T *ptr) +{ + std::lock_guard lock(memory_map_mutex); - mem_iter iter = memory_map.find((void *)ptr); + mem_iter iter = memory_map.find((void *)ptr); - if (iter != memory_map.end()) { + if (iter != memory_map.end()) { - iter->second.is_free = true; - if ((iter->second).is_unlinked) return; + iter->second.is_free = true; + if ((iter->second).is_unlinked) return; - used_bytes -= iter->second.bytes; - used_buffers--; + used_bytes -= iter->second.bytes; + used_buffers--; - } else { - freeWrapper(ptr); // Free it because we are not sure what the size is - } + } else { + freeWrapper(ptr); // Free it because we are not sure what the size is } +} - template - void memPop(const T *ptr) - { - std::lock_guard lock(memory_map_mutex); +template +void memPop(const T *ptr) +{ + std::lock_guard lock(memory_map_mutex); - mem_iter iter = memory_map.find((void *)ptr); + mem_iter iter = memory_map.find((void *)ptr); - if (iter != memory_map.end()) { - iter->second.is_unlinked = true; - } else { - mem_info info = { false, - true, - 100 }; //This number is not relevant + if (iter != memory_map.end()) { + iter->second.is_unlinked = true; + } else { + mem_info info = { false, + true, + 100 }; //This number is not relevant - memory_map[(void *)ptr] = info; - } + memory_map[(void *)ptr] = info; } +} - template - void memPush(const T *ptr) - { - std::lock_guard lock(memory_map_mutex); - mem_iter iter = memory_map.find((void *)ptr); - if (iter != memory_map.end()) { - iter->second.is_unlinked = false; - } +template +void memPush(const T *ptr) +{ + std::lock_guard lock(memory_map_mutex); + mem_iter iter = memory_map.find((void *)ptr); + if (iter != memory_map.end()) { + iter->second.is_unlinked = false; } +} - void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, - size_t *lock_bytes, size_t *lock_buffers) - { - getQueue().sync(); - if (alloc_bytes ) *alloc_bytes = total_bytes; - if (alloc_buffers ) 
*alloc_buffers = memory_map.size(); - if (lock_bytes ) *lock_bytes = used_bytes; - if (lock_buffers ) *lock_buffers = used_buffers; - } +void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, + size_t *lock_bytes, size_t *lock_buffers) +{ + getQueue().sync(); + if (alloc_bytes ) *alloc_bytes = total_bytes; + if (alloc_buffers ) *alloc_buffers = memory_map.size(); + if (lock_bytes ) *lock_bytes = used_bytes; + if (lock_buffers ) *lock_buffers = used_buffers; +} - template - T* pinnedAlloc(const size_t &elements) - { - return memAlloc(elements); - } +template +T* pinnedAlloc(const size_t &elements) +{ + return memAlloc(elements); +} - template - void pinnedFree(T* ptr) - { - memFree(ptr); - } +template +void pinnedFree(T* ptr) +{ + memFree(ptr); +} #define INSTANTIATE(T) \ template T* memAlloc(const size_t &elements); \ @@ -234,16 +234,17 @@ namespace cpu template T* pinnedAlloc(const size_t &elements); \ template void pinnedFree(T* ptr); \ - INSTANTIATE(float) - INSTANTIATE(cfloat) - INSTANTIATE(double) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(char) - INSTANTIATE(uchar) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(ushort) - INSTANTIATE(short ) +INSTANTIATE(float) +INSTANTIATE(cfloat) +INSTANTIATE(double) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(char) +INSTANTIATE(uchar) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(ushort) +INSTANTIATE(short ) + } diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp index 97f0e0a8f0..b6f50c2e32 100644 --- a/src/backend/cpu/nearest_neighbour.cpp +++ b/src/backend/cpu/nearest_neighbour.cpp @@ -163,8 +163,6 @@ void nearest_neighbour(Array& idx, Array& dist, idx = createEmptyArray(outDims); dist = createEmptyArray(outDims); - idx.eval(); - dist.eval(); switch(dist_type) { case AF_SAD: diff --git a/src/backend/cpu/qr.cpp b/src/backend/cpu/qr.cpp index b5f18064f5..78631fccfa 100644 --- a/src/backend/cpu/qr.cpp 
+++ b/src/backend/cpu/qr.cpp @@ -59,6 +59,8 @@ GQR_FUNC(gqr , cdouble, zungqr) template void qr(Array &q, Array &r, Array &t, const Array &in) { + in.eval(); + dim4 iDims = in.dims(); int M = iDims[0]; int N = iDims[1]; @@ -83,6 +85,8 @@ void qr(Array &q, Array &r, Array &t, const Array &in) template Array qr_inplace(Array &in) { + in.eval(); + dim4 iDims = in.dims(); int M = iDims[0]; int N = iDims[1]; @@ -121,6 +125,7 @@ Array qr_inplace(Array &in) namespace cpu { + #define INSTANTIATE_QR(T) \ template Array qr_inplace(Array &in); \ template void qr(Array &q, Array &r, Array &t, const Array &in); @@ -129,4 +134,5 @@ INSTANTIATE_QR(float) INSTANTIATE_QR(cfloat) INSTANTIATE_QR(double) INSTANTIATE_QR(cdouble) + } diff --git a/src/backend/cpu/range.cpp b/src/backend/cpu/range.cpp index 1fa46b2e89..7837db51ff 100644 --- a/src/backend/cpu/range.cpp +++ b/src/backend/cpu/range.cpp @@ -19,6 +19,7 @@ namespace cpu { + /////////////////////////////////////////////////////////////////////////// // Kernel Functions /////////////////////////////////////////////////////////////////////////// @@ -90,4 +91,5 @@ INSTANTIATE(uintl) INSTANTIATE(uchar) INSTANTIATE(ushort) INSTANTIATE(short) + } diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp index afe562001d..1ad7dad6dc 100644 --- a/src/backend/cpu/reorder.cpp +++ b/src/backend/cpu/reorder.cpp @@ -16,70 +16,70 @@ namespace cpu { - template - void reorder_(Array out, const Array in, const af::dim4 oDims, const af::dim4 rdims) - { - T* outPtr = out.get(); - const T* inPtr = in.get(); - const af::dim4 ist = in.strides(); - const af::dim4 ost = out.strides(); +template +void reorder_(Array out, const Array in, const af::dim4 oDims, const af::dim4 rdims) +{ + T* outPtr = out.get(); + const T* inPtr = in.get(); + + const af::dim4 ist = in.strides(); + const af::dim4 ost = out.strides(); - dim_t ids[4] = {0}; - for(dim_t ow = 0; ow < oDims[3]; ow++) { - const dim_t oW = ow * ost[3]; - ids[rdims[3]] = ow; - for(dim_t 
oz = 0; oz < oDims[2]; oz++) { - const dim_t oZW = oW + oz * ost[2]; - ids[rdims[2]] = oz; - for(dim_t oy = 0; oy < oDims[1]; oy++) { - const dim_t oYZW = oZW + oy * ost[1]; - ids[rdims[1]] = oy; - for(dim_t ox = 0; ox < oDims[0]; ox++) { - const dim_t oIdx = oYZW + ox; + dim_t ids[4] = {0}; + for(dim_t ow = 0; ow < oDims[3]; ow++) { + const dim_t oW = ow * ost[3]; + ids[rdims[3]] = ow; + for(dim_t oz = 0; oz < oDims[2]; oz++) { + const dim_t oZW = oW + oz * ost[2]; + ids[rdims[2]] = oz; + for(dim_t oy = 0; oy < oDims[1]; oy++) { + const dim_t oYZW = oZW + oy * ost[1]; + ids[rdims[1]] = oy; + for(dim_t ox = 0; ox < oDims[0]; ox++) { + const dim_t oIdx = oYZW + ox; - ids[rdims[0]] = ox; - const dim_t iIdx = ids[3] * ist[3] + ids[2] * ist[2] + - ids[1] * ist[1] + ids[0]; + ids[rdims[0]] = ox; + const dim_t iIdx = ids[3] * ist[3] + ids[2] * ist[2] + + ids[1] * ist[1] + ids[0]; - outPtr[oIdx] = inPtr[iIdx]; - } + outPtr[oIdx] = inPtr[iIdx]; } } } } +} - template - Array reorder(const Array &in, const af::dim4 &rdims) - { - in.eval(); +template +Array reorder(const Array &in, const af::dim4 &rdims) +{ + in.eval(); - const af::dim4 iDims = in.dims(); - af::dim4 oDims(0); - for(int i = 0; i < 4; i++) - oDims[i] = iDims[rdims[i]]; + const af::dim4 iDims = in.dims(); + af::dim4 oDims(0); + for(int i = 0; i < 4; i++) + oDims[i] = iDims[rdims[i]]; - Array out = createEmptyArray(oDims); - getQueue().enqueue(reorder_, out, in, oDims, rdims); - return out; - } + Array out = createEmptyArray(oDims); + getQueue().enqueue(reorder_, out, in, oDims, rdims); + return out; +} #define INSTANTIATE(T) \ template Array reorder(const Array &in, const af::dim4 &rdims); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(short) - INSTANTIATE(ushort) - +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) 
+INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(short) +INSTANTIATE(ushort) } diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/resize.cpp index 160ed46c0d..8fb2edcda6 100644 --- a/src/backend/cpu/resize.cpp +++ b/src/backend/cpu/resize.cpp @@ -19,6 +19,7 @@ namespace cpu { + /** * noop function for round to avoid compilation * issues due to lack of this function in C90 based @@ -215,4 +216,5 @@ INSTANTIATE(uchar) INSTANTIATE(char) INSTANTIATE(short) INSTANTIATE(ushort) + } diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/rotate.cpp index 01ec96228c..5687d69c08 100644 --- a/src/backend/cpu/rotate.cpp +++ b/src/backend/cpu/rotate.cpp @@ -18,106 +18,110 @@ namespace cpu { - template - void rotate_(Array output, const Array input, const float theta) + +template +void rotate_(Array output, const Array input, const float theta) +{ + const af::dim4 odims = output.dims(); + const af::dim4 idims = input.dims(); + const af::dim4 ostrides = output.strides(); + const af::dim4 istrides = input.strides(); + + const T* in = input.get(); + T* out = output.get(); + dim_t nimages = idims[2]; + + void (*t_fn)(T *, const T *, const float *, const af::dim4 &, + const af::dim4 &, const af::dim4 &, + const dim_t, const dim_t, const dim_t, const dim_t); + + const float c = cos(-theta), s = sin(-theta); + float tx, ty; { - const af::dim4 odims = output.dims(); - const af::dim4 idims = input.dims(); - const af::dim4 ostrides = output.strides(); - const af::dim4 istrides = input.strides(); - - const T* in = input.get(); - T* out = output.get(); - dim_t nimages = idims[2]; - - void (*t_fn)(T *, const T *, const float *, const af::dim4 &, - const af::dim4 &, const af::dim4 &, - const dim_t, const dim_t, const dim_t, const dim_t); - - const float c = cos(-theta), s = sin(-theta); - float tx, ty; - { - const float nx = 0.5 * (idims[0] - 1); - const float ny = 0.5 * (idims[1] - 
1); - const float mx = 0.5 * (odims[0] - 1); - const float my = 0.5 * (odims[1] - 1); - const float sx = (mx * c + my *-s); - const float sy = (mx * s + my * c); - tx = -(sx - nx); - ty = -(sy - ny); - } + const float nx = 0.5 * (idims[0] - 1); + const float ny = 0.5 * (idims[1] - 1); + const float mx = 0.5 * (odims[0] - 1); + const float my = 0.5 * (odims[1] - 1); + const float sx = (mx * c + my *-s); + const float sy = (mx * s + my * c); + tx = -(sx - nx); + ty = -(sy - ny); + } - const float tmat[6] = {std::round( c * 1000) / 1000.0f, - std::round(-s * 1000) / 1000.0f, - std::round(tx * 1000) / 1000.0f, - std::round( s * 1000) / 1000.0f, - std::round( c * 1000) / 1000.0f, - std::round(ty * 1000) / 1000.0f, - }; - - switch(method) { - case AF_INTERP_NEAREST: - t_fn = &transform_n; - break; - case AF_INTERP_BILINEAR: - t_fn = &transform_b; - break; - case AF_INTERP_LOWER: - t_fn = &transform_l; - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; - } + const float tmat[6] = {std::round( c * 1000) / 1000.0f, + std::round(-s * 1000) / 1000.0f, + std::round(tx * 1000) / 1000.0f, + std::round( s * 1000) / 1000.0f, + std::round( c * 1000) / 1000.0f, + std::round(ty * 1000) / 1000.0f, + }; + + switch(method) { + case AF_INTERP_NEAREST: + t_fn = &transform_n; + break; + case AF_INTERP_BILINEAR: + t_fn = &transform_b; + break; + case AF_INTERP_LOWER: + t_fn = &transform_l; + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; + } - // Do transform for image - for(int yy = 0; yy < (int)odims[1]; yy++) { - for(int xx = 0; xx < (int)odims[0]; xx++) { - t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy); - } + // Do transform for image + for(int yy = 0; yy < (int)odims[1]; yy++) { + for(int xx = 0; xx < (int)odims[0]; xx++) { + t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy); } } +} - template - Array rotate(const Array &in, const float theta, const af::dim4 &odims, - const 
af_interp_type method) - { - Array out = createEmptyArray(odims); - - switch(method) { - case AF_INTERP_NEAREST: - getQueue().enqueue(rotate_, out, in, theta); - break; - case AF_INTERP_BILINEAR: - getQueue().enqueue(rotate_, out, in, theta); - break; - case AF_INTERP_LOWER: - getQueue().enqueue(rotate_, out, in, theta); - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; - } +template +Array rotate(const Array &in, const float theta, const af::dim4 &odims, + const af_interp_type method) +{ + in.eval(); - return out; + Array out = createEmptyArray(odims); + + switch(method) { + case AF_INTERP_NEAREST: + getQueue().enqueue(rotate_, out, in, theta); + break; + case AF_INTERP_BILINEAR: + getQueue().enqueue(rotate_, out, in, theta); + break; + case AF_INTERP_LOWER: + getQueue().enqueue(rotate_, out, in, theta); + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; } + return out; +} + #define INSTANTIATE(T) \ template Array rotate(const Array &in, const float theta, \ const af::dim4 &odims, const af_interp_type method); - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - INSTANTIATE(uchar) - INSTANTIATE(char) - INSTANTIATE(short) - INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(uchar) +INSTANTIATE(char) +INSTANTIATE(short) +INSTANTIATE(ushort) + } diff --git a/src/backend/cpu/set.cpp b/src/backend/cpu/set.cpp index 67aa5863ea..d6321bba55 100644 --- a/src/backend/cpu/set.cpp +++ b/src/backend/cpu/set.cpp @@ -23,116 +23,118 @@ namespace cpu { - using namespace std; - using af::dim4; - - template - Array setUnique(const Array &in, - const bool is_sorted) - { - in.eval(); - - Array out = createEmptyArray(af::dim4()); - if (is_sorted) out = 
copyArray(in); - else out = sort(in, 0); - - // Need to sync old jobs since we need to - // operator on pointers directly in std::unique - getQueue().sync(); - - T *ptr = out.get(); - T *last = std::unique(ptr, ptr + in.elements()); - dim_t dist = (dim_t)std::distance(ptr, last); - - dim4 dims(dist, 1, 1, 1); - out.resetDims(dims); - return out; - } - template - Array setUnion(const Array &first, - const Array &second, - const bool is_unique) - { - first.eval(); - second.eval(); - getQueue().sync(); +using namespace std; +using af::dim4; - Array uFirst = first; - Array uSecond = second; +template +Array setUnique(const Array &in, + const bool is_sorted) +{ + in.eval(); - if (!is_unique) { - // FIXME: Perhaps copy + unique would do ? - uFirst = setUnique(first, false); - uSecond = setUnique(second, false); - } + Array out = createEmptyArray(af::dim4()); + if (is_sorted) out = copyArray(in); + else out = sort(in, 0); - dim_t first_elements = uFirst.elements(); - dim_t second_elements = uSecond.elements(); - dim_t elements = first_elements + second_elements; + // Need to sync old jobs since we need to + // operator on pointers directly in std::unique + getQueue().sync(); - Array out = createEmptyArray(af::dim4(elements)); + T *ptr = out.get(); + T *last = std::unique(ptr, ptr + in.elements()); + dim_t dist = (dim_t)std::distance(ptr, last); - T *ptr = out.get(); - T *last = std::set_union(uFirst.get() , uFirst.get() + first_elements, - uSecond.get(), uSecond.get() + second_elements, - ptr); + dim4 dims(dist, 1, 1, 1); + out.resetDims(dims); + return out; +} - dim_t dist = (dim_t)std::distance(ptr, last); - dim4 dims(dist, 1, 1, 1); - out.resetDims(dims); +template +Array setUnion(const Array &first, + const Array &second, + const bool is_unique) +{ + first.eval(); + second.eval(); + getQueue().sync(); - return out; + Array uFirst = first; + Array uSecond = second; + + if (!is_unique) { + // FIXME: Perhaps copy + unique would do ? 
+ uFirst = setUnique(first, false); + uSecond = setUnique(second, false); } - template - Array setIntersect(const Array &first, - const Array &second, - const bool is_unique) - { - first.eval(); - second.eval(); - getQueue().sync(); + dim_t first_elements = uFirst.elements(); + dim_t second_elements = uSecond.elements(); + dim_t elements = first_elements + second_elements; - Array uFirst = first; - Array uSecond = second; + Array out = createEmptyArray(af::dim4(elements)); - if (!is_unique) { - uFirst = setUnique(first, false); - uSecond = setUnique(second, false); - } + T *ptr = out.get(); + T *last = std::set_union(uFirst.get() , uFirst.get() + first_elements, + uSecond.get(), uSecond.get() + second_elements, + ptr); - dim_t first_elements = uFirst.elements(); - dim_t second_elements = uSecond.elements(); - dim_t elements = std::max(first_elements, second_elements); + dim_t dist = (dim_t)std::distance(ptr, last); + dim4 dims(dist, 1, 1, 1); + out.resetDims(dims); - Array out = createEmptyArray(af::dim4(elements)); + return out; +} - T *ptr = out.get(); - T *last = std::set_intersection(uFirst.get() , uFirst.get() + first_elements, - uSecond.get(), uSecond.get() + second_elements, - ptr); +template +Array setIntersect(const Array &first, + const Array &second, + const bool is_unique) +{ + first.eval(); + second.eval(); + getQueue().sync(); - dim_t dist = (dim_t)std::distance(ptr, last); - dim4 dims(dist, 1, 1, 1); - out.resetDims(dims); + Array uFirst = first; + Array uSecond = second; - return out; + if (!is_unique) { + uFirst = setUnique(first, false); + uSecond = setUnique(second, false); } + dim_t first_elements = uFirst.elements(); + dim_t second_elements = uSecond.elements(); + dim_t elements = std::max(first_elements, second_elements); + + Array out = createEmptyArray(af::dim4(elements)); + + T *ptr = out.get(); + T *last = std::set_intersection(uFirst.get() , uFirst.get() + first_elements, + uSecond.get(), uSecond.get() + second_elements, + ptr); + + dim_t 
dist = (dim_t)std::distance(ptr, last); + dim4 dims(dist, 1, 1, 1); + out.resetDims(dims); + + return out; +} + #define INSTANTIATE(T) \ template Array setUnique(const Array &in, const bool is_sorted); \ template Array setUnion(const Array &first, const Array &second, const bool is_unique); \ template Array setIntersect(const Array &first, const Array &second, const bool is_unique); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(char) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) - INSTANTIATE(intl) - INSTANTIATE(uintl) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(char) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) +INSTANTIATE(intl) +INSTANTIATE(uintl) + } diff --git a/src/backend/cpu/shift.cpp b/src/backend/cpu/shift.cpp index 6a2b939cca..766427bff5 100644 --- a/src/backend/cpu/shift.cpp +++ b/src/backend/cpu/shift.cpp @@ -17,6 +17,7 @@ namespace cpu { + static inline dim_t simple_mod(const dim_t i, const dim_t dim) { return (i < dim) ? 
i : (i - dim); @@ -25,9 +26,9 @@ static inline dim_t simple_mod(const dim_t i, const dim_t dim) template Array shift(const Array &in, const int sdims[4]) { - Array out = createEmptyArray(in.dims()); - out.eval(); in.eval(); + + Array out = createEmptyArray(in.dims()); const af::dim4 temp(sdims[0], sdims[1], sdims[2], sdims[3]); auto func = [=] (Array out, const Array in, const af::dim4 sdims) { @@ -91,4 +92,5 @@ INSTANTIATE(uchar) INSTANTIATE(char) INSTANTIATE(short) INSTANTIATE(ushort) + } diff --git a/src/backend/cpu/sift_nonfree.hpp b/src/backend/cpu/sift_nonfree.hpp index 853f407f7f..c1c92a97e6 100644 --- a/src/backend/cpu/sift_nonfree.hpp +++ b/src/backend/cpu/sift_nonfree.hpp @@ -76,161 +76,161 @@ using af::dim4; namespace cpu { - static const float PI_VAL = 3.14159265358979323846f; +static const float PI_VAL = 3.14159265358979323846f; // default width of descriptor histogram array - static const int DescrWidth = 4; +static const int DescrWidth = 4; // default number of bins per histogram in descriptor array - static const int DescrHistBins = 8; +static const int DescrHistBins = 8; // assumed gaussian blur for input image - static const float InitSigma = 0.5f; +static const float InitSigma = 0.5f; // width of border in which to ignore keypoints - static const int ImgBorder = 5; +static const int ImgBorder = 5; // maximum steps of keypoint interpolation before failure - static const int MaxInterpSteps = 5; +static const int MaxInterpSteps = 5; // default number of bins in histogram for orientation assignment - static const int OriHistBins = 36; +static const int OriHistBins = 36; // determines gaussian sigma for orientation assignment - static const float OriSigFctr = 1.5f; +static const float OriSigFctr = 1.5f; // determines the radius of the region used in orientation assignment */ - static const float OriRadius = 3.0f * OriSigFctr; +static const float OriRadius = 3.0f * OriSigFctr; // number of passes of orientation histogram smoothing - static const int 
SmoothOriPasses = 2; +static const int SmoothOriPasses = 2; // orientation magnitude relative to max that results in new feature - static const float OriPeakRatio = 0.8f; +static const float OriPeakRatio = 0.8f; // determines the size of a single descriptor orientation histogram - static const float DescrSclFctr = 3.f; +static const float DescrSclFctr = 3.f; // threshold on magnitude of elements of descriptor vector - static const float DescrMagThr = 0.2f; +static const float DescrMagThr = 0.2f; // factor used to convert floating-point descriptor to unsigned char - static const float IntDescrFctr = 512.f; +static const float IntDescrFctr = 512.f; // Number of GLOH bins in radial direction - static const unsigned GLOHRadialBins = 3; +static const unsigned GLOHRadialBins = 3; // Radiuses of GLOH descriptors - static const float GLOHRadii[GLOHRadialBins] = {6.f, 11.f, 15.f}; +static const float GLOHRadii[GLOHRadialBins] = {6.f, 11.f, 15.f}; // Number of GLOH angular bins (excluding the inner-most radial section) - static const unsigned GLOHAngularBins = 8; +static const unsigned GLOHAngularBins = 8; // Number of GLOH bins per histogram in descriptor - static const unsigned GLOHHistBins = 16; +static const unsigned GLOHHistBins = 16; - typedef struct - { - float f[4]; - unsigned l; - } feat_t; +typedef struct +{ + float f[4]; + unsigned l; +} feat_t; - bool feat_cmp(feat_t i, feat_t j) - { - for (int k = 0; k < 4; k++) - if (i.f[k] != j.f[k]) - return (i.f[k] < j.f[k]); - if (i.l != j.l) - return (i.l < j.l); +bool feat_cmp(feat_t i, feat_t j) +{ + for (int k = 0; k < 4; k++) + if (i.f[k] != j.f[k]) + return (i.f[k] < j.f[k]); + if (i.l != j.l) + return (i.l < j.l); - return true; - } + return true; +} - void array_to_feat(std::vector& feat, float *x, float *y, unsigned *layer, float *resp, float *size, unsigned nfeat) - { - feat.resize(nfeat); - for (unsigned i = 0; i < feat.size(); i++) { - feat[i].f[0] = x[i]; - feat[i].f[1] = y[i]; - feat[i].f[2] = resp[i]; - 
feat[i].f[3] = size[i]; - feat[i].l = layer[i]; - } +void array_to_feat(std::vector& feat, float *x, float *y, unsigned *layer, float *resp, float *size, unsigned nfeat) +{ + feat.resize(nfeat); + for (unsigned i = 0; i < feat.size(); i++) { + feat[i].f[0] = x[i]; + feat[i].f[1] = y[i]; + feat[i].f[2] = resp[i]; + feat[i].f[3] = size[i]; + feat[i].l = layer[i]; } +} - template - void gaussian1D(T* out, const int dim, double sigma=0.0) - { - if(!(sigma>0)) sigma = 0.25*dim; - - T sum = (T)0; - for(int i=0;i +void gaussian1D(T* out, const int dim, double sigma=0.0) +{ + if(!(sigma>0)) sigma = 0.25*dim; - for(int k=0;k - Array gauss_filter(float sigma) - { - // Using 6-sigma rule - unsigned gauss_len = std::min((unsigned)round(sigma * 6 + 1) | 1, 31u); + for(int k=0;k filter = createEmptyArray(gauss_len); - gaussian1D((T*)getDevicePtr(filter), gauss_len, sigma); +template +Array gauss_filter(float sigma) +{ + // Using 6-sigma rule + unsigned gauss_len = std::min((unsigned)round(sigma * 6 + 1) | 1, 31u); - return filter; - } + Array filter = createEmptyArray(gauss_len); + gaussian1D((T*)getDevicePtr(filter), gauss_len, sigma); - template - void gaussianElimination(float* A, float* b, float* x) - { - // forward elimination - for (int i = 0; i < N-1; i++) { - for (int j = i+1; j < N; j++) { - float s = A[j*N+i] / A[i*N+i]; + return filter; +} - for (int k = i; k < N; k++) - A[j*N+k] -= s * A[i*N+k]; +template +void gaussianElimination(float* A, float* b, float* x) +{ + // forward elimination + for (int i = 0; i < N-1; i++) { + for (int j = i+1; j < N; j++) { + float s = A[j*N+i] / A[i*N+i]; - b[j] -= s * b[i]; - } + for (int k = i; k < N; k++) + A[j*N+k] -= s * A[i*N+k]; + + b[j] -= s * b[i]; } + } - for (int i = 0; i < N; i++) - x[i] = 0; + for (int i = 0; i < N; i++) + x[i] = 0; - // backward substitution - float sum = 0; - for (int i = 0; i <= N-2; i++) { - sum = b[i]; - for (int j = i+1; j < N; j++) - sum -= A[i*N+j] * x[j]; - x[i] = sum / A[i*N+i]; - } + // backward 
substitution + float sum = 0; + for (int i = 0; i <= N-2; i++) { + sum = b[i]; + for (int j = i+1; j < N; j++) + sum -= A[i*N+j] * x[j]; + x[i] = sum / A[i*N+i]; } +} - template - void sub( - Array& out, - const Array& in1, - const Array& in2) - { - size_t nel = in1.elements(); - T* out_ptr = out.get(); - const T* in1_ptr = in1.get(); - const T* in2_ptr = in2.get(); +template +void sub( + Array& out, + const Array& in1, + const Array& in2) +{ + size_t nel = in1.elements(); + T* out_ptr = out.get(); + const T* in1_ptr = in1.get(); + const T* in2_ptr = in2.get(); - for (size_t i = 0; i < nel; i++) { - out_ptr[i] = in1_ptr[i] - in2_ptr[i]; - } + for (size_t i = 0; i < nel; i++) { + out_ptr[i] = in1_ptr[i] - in2_ptr[i]; } +} #define CPTR(Y, X) (center_ptr[(Y) * idims[0] + (X)]) #define PPTR(Y, X) (prev_ptr[(Y) * idims[0] + (X)]) @@ -238,957 +238,958 @@ namespace cpu // Determines whether a pixel is a scale-space extremum by comparing it to its // 3x3x3 pixel neighborhood. - template - void detectExtrema( - float* x_out, - float* y_out, - unsigned* layer_out, - unsigned* counter, - const Array& prev, - const Array& center, - const Array& next, - const unsigned layer, - const unsigned max_feat, - const float threshold) - { - const af::dim4 idims = center.dims(); - const T* prev_ptr = prev.get(); - const T* center_ptr = center.get(); - const T* next_ptr = next.get(); - - for (int y = ImgBorder; y < idims[1]-ImgBorder; y++) { - for (int x = ImgBorder; x < idims[0]-ImgBorder; x++) { - float p = center_ptr[y*idims[0] + x]; - - // Find extrema - if (abs((float)p) > threshold && - ((p > 0 && p > CPTR(y-1, x-1) && p > CPTR(y-1, x) && - p > CPTR(y-1, x+1) && p > CPTR(y, x-1) && p > CPTR(y, x+1) && - p > CPTR(y+1, x-1) && p > CPTR(y+1, x) && p > CPTR(y+1, x+1) && - p > PPTR(y-1, x-1) && p > PPTR(y-1, x) && p > PPTR(y-1, x+1) && - p > PPTR(y, x-1) && p > PPTR(y , x) && p > PPTR(y, x+1) && - p > PPTR(y+1, x-1) && p > PPTR(y+1, x) && p > PPTR(y+1, x+1) && - p > NPTR(y-1, x-1) && p > 
NPTR(y-1, x) && p > NPTR(y-1, x+1) && - p > NPTR(y, x-1) && p > NPTR(y , x) && p > NPTR(y, x+1) && - p > NPTR(y+1, x-1) && p > NPTR(y+1, x) && p > NPTR(y+1, x+1)) || - (p < 0 && p < CPTR(y-1, x-1) && p < CPTR(y-1, x) && - p < CPTR(y-1, x+1) && p < CPTR(y, x-1) && p < CPTR(y, x+1) && - p < CPTR(y+1, x-1) && p < CPTR(y+1, x) && p < CPTR(y+1, x+1) && - p < PPTR(y-1, x-1) && p < PPTR(y-1, x) && p < PPTR(y-1, x+1) && - p < PPTR(y, x-1) && p < PPTR(y , x) && p < PPTR(y, x+1) && - p < PPTR(y+1, x-1) && p < PPTR(y+1, x) && p < PPTR(y+1, x+1) && - p < NPTR(y-1, x-1) && p < NPTR(y-1, x) && p < NPTR(y-1, x+1) && - p < NPTR(y, x-1) && p < NPTR(y , x) && p < NPTR(y, x+1) && - p < NPTR(y+1, x-1) && p < NPTR(y+1, x) && p < NPTR(y+1, x+1)))) { - - if (*counter < max_feat) - { - x_out[*counter] = (float)y; - y_out[*counter] = (float)x; - layer_out[*counter] = layer; - (*counter)++; - } +template +void detectExtrema( + float* x_out, + float* y_out, + unsigned* layer_out, + unsigned* counter, + const Array& prev, + const Array& center, + const Array& next, + const unsigned layer, + const unsigned max_feat, + const float threshold) +{ + const af::dim4 idims = center.dims(); + const T* prev_ptr = prev.get(); + const T* center_ptr = center.get(); + const T* next_ptr = next.get(); + + for (int y = ImgBorder; y < idims[1]-ImgBorder; y++) { + for (int x = ImgBorder; x < idims[0]-ImgBorder; x++) { + float p = center_ptr[y*idims[0] + x]; + + // Find extrema + if (abs((float)p) > threshold && + ((p > 0 && p > CPTR(y-1, x-1) && p > CPTR(y-1, x) && + p > CPTR(y-1, x+1) && p > CPTR(y, x-1) && p > CPTR(y, x+1) && + p > CPTR(y+1, x-1) && p > CPTR(y+1, x) && p > CPTR(y+1, x+1) && + p > PPTR(y-1, x-1) && p > PPTR(y-1, x) && p > PPTR(y-1, x+1) && + p > PPTR(y, x-1) && p > PPTR(y , x) && p > PPTR(y, x+1) && + p > PPTR(y+1, x-1) && p > PPTR(y+1, x) && p > PPTR(y+1, x+1) && + p > NPTR(y-1, x-1) && p > NPTR(y-1, x) && p > NPTR(y-1, x+1) && + p > NPTR(y, x-1) && p > NPTR(y , x) && p > NPTR(y, x+1) && + p 
> NPTR(y+1, x-1) && p > NPTR(y+1, x) && p > NPTR(y+1, x+1)) || + (p < 0 && p < CPTR(y-1, x-1) && p < CPTR(y-1, x) && + p < CPTR(y-1, x+1) && p < CPTR(y, x-1) && p < CPTR(y, x+1) && + p < CPTR(y+1, x-1) && p < CPTR(y+1, x) && p < CPTR(y+1, x+1) && + p < PPTR(y-1, x-1) && p < PPTR(y-1, x) && p < PPTR(y-1, x+1) && + p < PPTR(y, x-1) && p < PPTR(y , x) && p < PPTR(y, x+1) && + p < PPTR(y+1, x-1) && p < PPTR(y+1, x) && p < PPTR(y+1, x+1) && + p < NPTR(y-1, x-1) && p < NPTR(y-1, x) && p < NPTR(y-1, x+1) && + p < NPTR(y, x-1) && p < NPTR(y , x) && p < NPTR(y, x+1) && + p < NPTR(y+1, x-1) && p < NPTR(y+1, x) && p < NPTR(y+1, x+1)))) { + + if (*counter < max_feat) + { + x_out[*counter] = (float)y; + y_out[*counter] = (float)x; + layer_out[*counter] = layer; + (*counter)++; } } } } +} // Interpolates a scale-space extremum's location and scale to subpixel // accuracy to form an image feature. Rejects features with low contrast. // Based on Section 4 of Lowe's paper. - template - void interpolateExtrema( - float* x_out, - float* y_out, - unsigned* layer_out, - float* response_out, - float* size_out, - unsigned* counter, - const float* x_in, - const float* y_in, - const unsigned* layer_in, - const unsigned extrema_feat, - std::vector< Array >& dog_pyr, - const unsigned max_feat, - const unsigned octave, - const unsigned n_layers, - const float contrast_thr, - const float edge_thr, - const float sigma, - const float img_scale) - { - for (int f = 0; f < (int)extrema_feat; f++) { - const float first_deriv_scale = img_scale*0.5f; - const float second_deriv_scale = img_scale; - const float cross_deriv_scale = img_scale*0.25f; - - float xl = 0, xy = 0, xx = 0, contr = 0; - int i = 0; - - unsigned x = x_in[f]; - unsigned y = y_in[f]; - unsigned layer = layer_in[f]; - - const T* prev_ptr = dog_pyr[octave*(n_layers+2) + layer-1].get(); - const T* center_ptr = dog_pyr[octave*(n_layers+2) + layer].get(); - const T* next_ptr = dog_pyr[octave*(n_layers+2) + layer+1].get(); - - af::dim4 
idims = dog_pyr[octave*(n_layers+2)].dims(); - - bool converges = true; - - for (i = 0; i < MaxInterpSteps; i++) { - float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale, - (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale, - (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale}; - - float d2 = CPTR(x, y) * 2.f; - float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale; - float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale; - float dss = (NPTR(x, y ) + PPTR(x, y ) - d2) * second_deriv_scale; - float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) - - CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale; - float dxs = (NPTR(x+1, y) - NPTR(x-1, y) - - PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale; - float dys = (NPTR(x, y+1) - NPTR(x-1, y-1) - - PPTR(x, y-1) + PPTR(x-1, y-1)) * cross_deriv_scale; - - float H[9] = {dxx, dxy, dxs, - dxy, dyy, dys, - dxs, dys, dss}; - - float X[3]; - gaussianElimination<3>(H, dD, X); - - xl = -X[2]; - xy = -X[1]; - xx = -X[0]; - - if (fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f) - break; - - x += round(xx); - y += round(xy); - layer += round(xl); - - if (layer < 1 || layer > n_layers || - x < ImgBorder || x >= idims[1] - ImgBorder || - y < ImgBorder || y >= idims[0] - ImgBorder) { - converges = false; - break; - } - } +template +void interpolateExtrema( + float* x_out, + float* y_out, + unsigned* layer_out, + float* response_out, + float* size_out, + unsigned* counter, + const float* x_in, + const float* y_in, + const unsigned* layer_in, + const unsigned extrema_feat, + std::vector< Array >& dog_pyr, + const unsigned max_feat, + const unsigned octave, + const unsigned n_layers, + const float contrast_thr, + const float edge_thr, + const float sigma, + const float img_scale) +{ + for (int f = 0; f < (int)extrema_feat; f++) { + const float first_deriv_scale = img_scale*0.5f; + const float second_deriv_scale = img_scale; + const float cross_deriv_scale = img_scale*0.25f; - // ensure 
convergence of interpolation - if (i >= MaxInterpSteps || !converges) - continue; + float xl = 0, xy = 0, xx = 0, contr = 0; + int i = 0; + + unsigned x = x_in[f]; + unsigned y = y_in[f]; + unsigned layer = layer_in[f]; + const T* prev_ptr = dog_pyr[octave*(n_layers+2) + layer-1].get(); + const T* center_ptr = dog_pyr[octave*(n_layers+2) + layer].get(); + const T* next_ptr = dog_pyr[octave*(n_layers+2) + layer+1].get(); + + af::dim4 idims = dog_pyr[octave*(n_layers+2)].dims(); + + bool converges = true; + + for (i = 0; i < MaxInterpSteps; i++) { float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale, (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale, (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale}; - float X[3] = {xx, xy, xl}; - float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2]; - - contr = center_ptr[x*idims[0]+y]*img_scale + P * 0.5f; - if(abs(contr) < (contrast_thr / n_layers)) - continue; - - // principal curvatures are computed using the trace and det of Hessian float d2 = CPTR(x, y) * 2.f; float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale; float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale; + float dss = (NPTR(x, y ) + PPTR(x, y ) - d2) * second_deriv_scale; float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) - CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale; + float dxs = (NPTR(x+1, y) - NPTR(x-1, y) - + PPTR(x+1, y) + PPTR(x-1, y)) * cross_deriv_scale; + float dys = (NPTR(x, y+1) - NPTR(x-1, y-1) - + PPTR(x, y-1) + PPTR(x-1, y-1)) * cross_deriv_scale; + + float H[9] = {dxx, dxy, dxs, + dxy, dyy, dys, + dxs, dys, dss}; + + float X[3]; + gaussianElimination<3>(H, dD, X); + + xl = -X[2]; + xy = -X[1]; + xx = -X[0]; + + if (fabs(xl) < 0.5f && fabs(xy) < 0.5f && fabs(xx) < 0.5f) + break; + + x += round(xx); + y += round(xy); + layer += round(xl); + + if (layer < 1 || layer > n_layers || + x < ImgBorder || x >= idims[1] - ImgBorder || + y < ImgBorder || y >= idims[0] - ImgBorder) { + converges = false; + 
break; + } + } - float tr = dxx + dyy; - float det = dxx * dyy - dxy * dxy; + // ensure convergence of interpolation + if (i >= MaxInterpSteps || !converges) + continue; - // add FLT_EPSILON for double-precision compatibility - if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON) - continue; + float dD[3] = {(float)(CPTR(x+1, y) - CPTR(x-1, y)) * first_deriv_scale, + (float)(CPTR(x, y+1) - CPTR(x, y-1)) * first_deriv_scale, + (float)(NPTR(x, y) - PPTR(x, y)) * first_deriv_scale}; + float X[3] = {xx, xy, xl}; - if (*counter < max_feat) - { - x_out[*counter] = (x + xx) * (1 << octave); - y_out[*counter] = (y + xy) * (1 << octave); - layer_out[*counter] = layer; - response_out[*counter] = abs(contr); - size_out[*counter] = sigma*pow(2.f, octave + (layer + xl) / n_layers) * 2.f; - (*counter)++; - } + float P = dD[0]*X[0] + dD[1]*X[1] + dD[2]*X[2]; + + contr = center_ptr[x*idims[0]+y]*img_scale + P * 0.5f; + if(abs(contr) < (contrast_thr / n_layers)) + continue; + + // principal curvatures are computed using the trace and det of Hessian + float d2 = CPTR(x, y) * 2.f; + float dxx = (CPTR(x+1, y) + CPTR(x-1, y) - d2) * second_deriv_scale; + float dyy = (CPTR(x, y+1) + CPTR(x, y-1) - d2) * second_deriv_scale; + float dxy = (CPTR(x+1, y+1) - CPTR(x-1, y+1) - + CPTR(x+1, y-1) + CPTR(x-1, y-1)) * cross_deriv_scale; + + float tr = dxx + dyy; + float det = dxx * dyy - dxy * dxy; + + // add FLT_EPSILON for double-precision compatibility + if (det <= 0 || tr*tr*edge_thr >= (edge_thr + 1)*(edge_thr + 1)*det+FLT_EPSILON) + continue; + + if (*counter < max_feat) + { + x_out[*counter] = (x + xx) * (1 << octave); + y_out[*counter] = (y + xy) * (1 << octave); + layer_out[*counter] = layer; + response_out[*counter] = abs(contr); + size_out[*counter] = sigma*pow(2.f, octave + (layer + xl) / n_layers) * 2.f; + (*counter)++; } } +} #undef CPTR #undef PPTR #undef NPTR // Remove duplicate keypoints - void removeDuplicates( - float* x_out, - float* y_out, - 
unsigned* layer_out, - float* response_out, - float* size_out, - unsigned* counter, - const std::vector& sorted_feat) - { - size_t nfeat = sorted_feat.size(); - - for (size_t f = 0; f < nfeat; f++) { - float prec_fctr = 1e4f; - - if (f < nfeat-1) { - if (round(sorted_feat[f].f[0]*prec_fctr) == round(sorted_feat[f+1].f[0]*prec_fctr) && - round(sorted_feat[f].f[1]*prec_fctr) == round(sorted_feat[f+1].f[1]*prec_fctr) && - round(sorted_feat[f].f[2]*prec_fctr) == round(sorted_feat[f+1].f[2]*prec_fctr) && - round(sorted_feat[f].f[3]*prec_fctr) == round(sorted_feat[f+1].f[3]*prec_fctr) && - sorted_feat[f].l == sorted_feat[f+1].l) - continue; - } +void removeDuplicates( + float* x_out, + float* y_out, + unsigned* layer_out, + float* response_out, + float* size_out, + unsigned* counter, + const std::vector& sorted_feat) +{ + size_t nfeat = sorted_feat.size(); - x_out[*counter] = sorted_feat[f].f[0]; - y_out[*counter] = sorted_feat[f].f[1]; - response_out[*counter] = sorted_feat[f].f[2]; - size_out[*counter] = sorted_feat[f].f[3]; - layer_out[*counter] = sorted_feat[f].l; - (*counter)++; + for (size_t f = 0; f < nfeat; f++) { + float prec_fctr = 1e4f; + + if (f < nfeat-1) { + if (round(sorted_feat[f].f[0]*prec_fctr) == round(sorted_feat[f+1].f[0]*prec_fctr) && + round(sorted_feat[f].f[1]*prec_fctr) == round(sorted_feat[f+1].f[1]*prec_fctr) && + round(sorted_feat[f].f[2]*prec_fctr) == round(sorted_feat[f+1].f[2]*prec_fctr) && + round(sorted_feat[f].f[3]*prec_fctr) == round(sorted_feat[f+1].f[3]*prec_fctr) && + sorted_feat[f].l == sorted_feat[f+1].l) + continue; } + + x_out[*counter] = sorted_feat[f].f[0]; + y_out[*counter] = sorted_feat[f].f[1]; + response_out[*counter] = sorted_feat[f].f[2]; + size_out[*counter] = sorted_feat[f].f[3]; + layer_out[*counter] = sorted_feat[f].l; + (*counter)++; } +} #define IPTR(Y, X) (img_ptr[(Y) * idims[0] + (X)]) // Computes a canonical orientation for each image feature in an array. Based // on Section 5 of Lowe's paper. 
This function adds features to the array when // there is more than one dominant orientation at a given feature location. - template - void calcOrientation( - float* x_out, - float* y_out, - unsigned* layer_out, - float* response_out, - float* size_out, - float* ori_out, - unsigned* counter, - const float* x_in, - const float* y_in, - const unsigned* layer_in, - const float* response_in, - const float* size_in, - const unsigned total_feat, - const std::vector< Array >& gauss_pyr, - const unsigned max_feat, - const unsigned octave, - const unsigned n_layers, - const bool double_input) - { - const int n = OriHistBins; +template +void calcOrientation( + float* x_out, + float* y_out, + unsigned* layer_out, + float* response_out, + float* size_out, + float* ori_out, + unsigned* counter, + const float* x_in, + const float* y_in, + const unsigned* layer_in, + const float* response_in, + const float* size_in, + const unsigned total_feat, + const std::vector< Array >& gauss_pyr, + const unsigned max_feat, + const unsigned octave, + const unsigned n_layers, + const bool double_input) +{ + const int n = OriHistBins; - float hist[OriHistBins]; - float temphist[OriHistBins]; + float hist[OriHistBins]; + float temphist[OriHistBins]; - for (unsigned f = 0; f < total_feat; f++) { - // Load keypoint information - const float real_x = x_in[f]; - const float real_y = y_in[f]; - const unsigned layer = layer_in[f]; - const float response = response_in[f]; - const float size = size_in[f]; + for (unsigned f = 0; f < total_feat; f++) { + // Load keypoint information + const float real_x = x_in[f]; + const float real_y = y_in[f]; + const unsigned layer = layer_in[f]; + const float response = response_in[f]; + const float size = size_in[f]; - const int pt_x = (int)round(real_x / (1 << octave)); - const int pt_y = (int)round(real_y / (1 << octave)); + const int pt_x = (int)round(real_x / (1 << octave)); + const int pt_y = (int)round(real_y / (1 << octave)); - // Calculate auxiliary 
parameters - const float scl_octv = size*0.5f / (1 << octave); - const int radius = (int)round(OriRadius * scl_octv); - const float sigma = OriSigFctr * scl_octv; - const int len = (radius*2+1); - const float exp_denom = 2.f * sigma * sigma; + // Calculate auxiliary parameters + const float scl_octv = size*0.5f / (1 << octave); + const int radius = (int)round(OriRadius * scl_octv); + const float sigma = OriSigFctr * scl_octv; + const int len = (radius*2+1); + const float exp_denom = 2.f * sigma * sigma; - // Points img to correct Gaussian pyramid layer - const Array img = gauss_pyr[octave*(n_layers+3) + layer]; - const T* img_ptr = img.get(); + // Points img to correct Gaussian pyramid layer + const Array img = gauss_pyr[octave*(n_layers+3) + layer]; + const T* img_ptr = img.get(); - for (int i = 0; i < OriHistBins; i++) - hist[i] = 0.f; + for (int i = 0; i < OriHistBins; i++) + hist[i] = 0.f; - af::dim4 idims = img.dims(); + af::dim4 idims = img.dims(); - // Calculate orientation histogram - for (int l = 0; l < len*len; l++) { - int i = l / len - radius; - int j = l % len - radius; + // Calculate orientation histogram + for (int l = 0; l < len*len; l++) { + int i = l / len - radius; + int j = l % len - radius; - int y = pt_y + i; - int x = pt_x + j; - if (y < 1 || y >= idims[0] - 1 || - x < 1 || x >= idims[1] - 1) - continue; + int y = pt_y + i; + int x = pt_x + j; + if (y < 1 || y >= idims[0] - 1 || + x < 1 || x >= idims[1] - 1) + continue; - float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y)); - float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1)); + float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y)); + float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1)); - float mag = sqrt(dx*dx+dy*dy); - float ori = atan2(dy,dx); - float w = exp(-(i*i + j*j)/exp_denom); + float mag = sqrt(dx*dx+dy*dy); + float ori = atan2(dy,dx); + float w = exp(-(i*i + j*j)/exp_denom); - int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL)); - bin = bin < n ? 
bin : 0; + int bin = round(n*(ori+PI_VAL)/(2.f*PI_VAL)); + bin = bin < n ? bin : 0; - hist[bin] += w*mag; - } + hist[bin] += w*mag; + } - for (int i = 0; i < SmoothOriPasses; i++) { - for (int j = 0; j < n; j++) { - temphist[j] = hist[j]; - } - for (int j = 0; j < n; j++) { - float prev = (j == 0) ? temphist[n-1] : temphist[j-1]; - float next = (j+1 == n) ? temphist[0] : temphist[j+1]; - hist[j] = 0.25f * prev + 0.5f * temphist[j] + 0.25f * next; - } + for (int i = 0; i < SmoothOriPasses; i++) { + for (int j = 0; j < n; j++) { + temphist[j] = hist[j]; } - - float omax = hist[0]; - for (int i = 1; i < n; i++) - omax = max(omax, hist[i]); - - float mag_thr = (float)(omax * OriPeakRatio); - int l, r; for (int j = 0; j < n; j++) { - l = (j == 0) ? n - 1 : j - 1; - r = (j + 1) % n; - if (hist[j] > hist[l] && - hist[j] > hist[r] && - hist[j] >= mag_thr) { - if (*counter < max_feat) { - float bin = j + 0.5f * (hist[l] - hist[r]) / - (hist[l] - 2.0f*hist[j] + hist[r]); - bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin; - float ori = 360.f - ((360.f/n) * bin); - - float new_real_x = real_x; - float new_real_y = real_y; - float new_size = size; - - if (double_input) { - float scale = 0.5f; - new_real_x *= scale; - new_real_y *= scale; - new_size *= scale; - } + float prev = (j == 0) ? temphist[n-1] : temphist[j-1]; + float next = (j+1 == n) ? temphist[0] : temphist[j+1]; + hist[j] = 0.25f * prev + 0.5f * temphist[j] + 0.25f * next; + } + } - x_out[*counter] = new_real_x; - y_out[*counter] = new_real_y; - layer_out[*counter] = layer; - response_out[*counter] = response; - size_out[*counter] = new_size; - ori_out[*counter] = ori; - (*counter)++; + float omax = hist[0]; + for (int i = 1; i < n; i++) + omax = max(omax, hist[i]); + + float mag_thr = (float)(omax * OriPeakRatio); + int l, r; + for (int j = 0; j < n; j++) { + l = (j == 0) ? 
n - 1 : j - 1; + r = (j + 1) % n; + if (hist[j] > hist[l] && + hist[j] > hist[r] && + hist[j] >= mag_thr) { + if (*counter < max_feat) { + float bin = j + 0.5f * (hist[l] - hist[r]) / + (hist[l] - 2.0f*hist[j] + hist[r]); + bin = (bin < 0.0f) ? bin + n : (bin >= n) ? bin - n : bin; + float ori = 360.f - ((360.f/n) * bin); + + float new_real_x = real_x; + float new_real_y = real_y; + float new_size = size; + + if (double_input) { + float scale = 0.5f; + new_real_x *= scale; + new_real_y *= scale; + new_size *= scale; } + + x_out[*counter] = new_real_x; + y_out[*counter] = new_real_y; + layer_out[*counter] = layer; + response_out[*counter] = response; + size_out[*counter] = new_size; + ori_out[*counter] = ori; + (*counter)++; } } } } +} - void normalizeDesc( - float* desc, - const int histlen) - { - float len_sq = 0.0f; +void normalizeDesc( + float* desc, + const int histlen) +{ + float len_sq = 0.0f; - for (int i = 0; i < histlen; i++) - len_sq += desc[i] * desc[i]; + for (int i = 0; i < histlen; i++) + len_sq += desc[i] * desc[i]; - float len_inv = 1.0f / sqrt(len_sq); + float len_inv = 1.0f / sqrt(len_sq); - for (int i = 0; i < histlen; i++) { - desc[i] *= len_inv; - } + for (int i = 0; i < histlen; i++) { + desc[i] *= len_inv; } +} // Computes feature descriptors for features in an array. Based on Section 6 // of Lowe's paper. - template - void computeDescriptor( - float* desc_out, - const unsigned desc_len, - const float* x_in, - const float* y_in, - const unsigned* layer_in, - const float* response_in, - const float* size_in, - const float* ori_in, - const unsigned total_feat, - const std::vector< Array >& gauss_pyr, - const int d, - const int n, - const float scale, - const unsigned octave, - const unsigned n_layers) - { - float desc[128]; - - for (unsigned f = 0; f < total_feat; f++) { - const unsigned layer = layer_in[f]; - float ori = (360.f - ori_in[f]) * PI_VAL / 180.f; - ori = (ori > PI_VAL) ? 
ori - PI_VAL*2 : ori; - const float size = size_in[f]; - const int fx = round(x_in[f] * scale); - const int fy = round(y_in[f] * scale); - - // Points img to correct Gaussian pyramid layer - Array img = gauss_pyr[octave*(n_layers+3) + layer]; - const T* img_ptr = img.get(); - af::dim4 idims = img.dims(); - - float cos_t = cos(ori); - float sin_t = sin(ori); - float bins_per_rad = n / (PI_VAL * 2.f); - float exp_denom = d * d * 0.5f; - float hist_width = DescrSclFctr * size * scale * 0.5f; - int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f; - - int len = radius*2+1; - - for (int i = 0; i < (int)desc_len; i++) - desc[i] = 0.f; - - // Calculate orientation histogram - for (int l = 0; l < len*len; l++) { - int i = l / len - radius; - int j = l % len - radius; - - int y = fy + i; - int x = fx + j; - - float x_rot = (j * cos_t - i * sin_t) / hist_width; - float y_rot = (j * sin_t + i * cos_t) / hist_width; - float xbin = x_rot + d/2 - 0.5f; - float ybin = y_rot + d/2 - 0.5f; - - if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d && - y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) { - float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y)); - float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1)); - - float grad_mag = sqrt(dx*dx + dy*dy); - float grad_ori = atan2(dy, dx) - ori; - while (grad_ori < 0.0f) - grad_ori += PI_VAL*2; - while (grad_ori >= PI_VAL*2) - grad_ori -= PI_VAL*2; - - float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom); - float obin = grad_ori * bins_per_rad; - float mag = grad_mag*w; - - int x0 = floor(xbin); - int y0 = floor(ybin); - int o0 = floor(obin); - xbin -= x0; - ybin -= y0; - obin -= o0; - - for (int yl = 0; yl <= 1; yl++) { - int yb = y0 + yl; - if (yb >= 0 && yb < d) { - float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin); - for (int xl = 0; xl <= 1; xl++) { - int xb = x0 + xl; - if (xb >= 0 && xb < d) { - float v_x = v_y * ((xl == 0) ? 
1.0f - xbin : xbin); - for (int ol = 0; ol <= 1; ol++) { - int ob = (o0 + ol) % n; - float v_o = v_x * ((ol == 0) ? 1.0f - obin : obin); - desc[(yb*d + xb)*n + ob] += v_o; - } - } - } - } - } - } - } +template +void computeDescriptor( + float* desc_out, + const unsigned desc_len, + const float* x_in, + const float* y_in, + const unsigned* layer_in, + const float* response_in, + const float* size_in, + const float* ori_in, + const unsigned total_feat, + const std::vector< Array >& gauss_pyr, + const int d, + const int n, + const float scale, + const unsigned octave, + const unsigned n_layers) +{ + float desc[128]; + + for (unsigned f = 0; f < total_feat; f++) { + const unsigned layer = layer_in[f]; + float ori = (360.f - ori_in[f]) * PI_VAL / 180.f; + ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori; + const float size = size_in[f]; + const int fx = round(x_in[f] * scale); + const int fy = round(y_in[f] * scale); + + // Points img to correct Gaussian pyramid layer + Array img = gauss_pyr[octave*(n_layers+3) + layer]; + const T* img_ptr = img.get(); + af::dim4 idims = img.dims(); - normalizeDesc(desc, desc_len); + float cos_t = cos(ori); + float sin_t = sin(ori); + float bins_per_rad = n / (PI_VAL * 2.f); + float exp_denom = d * d * 0.5f; + float hist_width = DescrSclFctr * size * scale * 0.5f; + int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f; - for (int i = 0; i < (int)desc_len; i++) - desc[i] = min(desc[i], DescrMagThr); + int len = radius*2+1; - normalizeDesc(desc, desc_len); + for (int i = 0; i < (int)desc_len; i++) + desc[i] = 0.f; - // Calculate final descriptor values - for (int k = 0; k < (int)desc_len; k++) { - desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr)); - } - } - } + // Calculate orientation histogram + for (int l = 0; l < len*len; l++) { + int i = l / len - radius; + int j = l % len - radius; -// Computes GLOH feature descriptors for features in an array. Based on Section III-B -// of Mikolajczyk and Schmid paper. 
- template - void computeGLOHDescriptor( - float* desc_out, - const unsigned desc_len, - const float* x_in, - const float* y_in, - const unsigned* layer_in, - const float* response_in, - const float* size_in, - const float* ori_in, - const unsigned total_feat, - const std::vector< Array >& gauss_pyr, - const int d, - const unsigned rb, - const unsigned ab, - const unsigned hb, - const float scale, - const unsigned octave, - const unsigned n_layers) - { - float desc[272]; - - for (unsigned f = 0; f < total_feat; f++) { - const unsigned layer = layer_in[f]; - float ori = (360.f - ori_in[f]) * PI_VAL / 180.f; - ori = (ori > PI_VAL) ? ori - PI_VAL*2 : ori; - const float size = size_in[f]; - const int fx = round(x_in[f] * scale); - const int fy = round(y_in[f] * scale); - - // Points img to correct Gaussian pyramid layer - Array img = gauss_pyr[octave*(n_layers+3) + layer]; - const T* img_ptr = img.get(); - af::dim4 idims = img.dims(); - - float cos_t = cos(ori); - float sin_t = sin(ori); - float hist_bins_per_rad = hb / (PI_VAL * 2.f); - float polar_bins_per_rad = ab / (PI_VAL * 2.f); - float exp_denom = GLOHRadii[rb-1] * 0.5f; - - float hist_width = DescrSclFctr * size * scale * 0.5f; - - // Keep same descriptor radius used for SIFT - int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f; - - // Alternative radius size calculation, changing the radius weight - // (rw) in the range of 0.25f-0.75f gives different results, - // increasing it tends to show a better recall rate but with a - // smaller amount of correct matches - //float rw = 0.5f; - //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f; - - int len = radius*2+1; - - for (int i = 0; i < (int)desc_len; i++) - desc[i] = 0.f; - - // Calculate orientation histogram - for (int l = 0; l < len*len; l++) { - int i = l / len - radius; - int j = l % len - radius; - - int y = fy + i; - int x = fx + j; - - float x_rot = (j * cos_t - i * sin_t); - float y_rot = (j * sin_t + i * cos_t); - - float r = 
sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1]; - float theta = atan2(y_rot, x_rot); - while (theta < 0.0f) - theta += PI_VAL*2; - while (theta >= PI_VAL*2) - theta -= PI_VAL*2; - - float tbin = theta * polar_bins_per_rad; - float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] : - ((r < GLOHRadii[1]) ? 1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) : - min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON)); - - if (r <= GLOHRadii[rb-1] && - y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) { - float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y)); - float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1)); - - float grad_mag = sqrt(dx*dx + dy*dy); - float grad_ori = atan2(dy, dx) - ori; - while (grad_ori < 0.0f) - grad_ori += PI_VAL*2; - while (grad_ori >= PI_VAL*2) - grad_ori -= PI_VAL*2; - - float w = exp(-r / exp_denom); - float obin = grad_ori * hist_bins_per_rad; - float mag = grad_mag*w; - - int t0 = floor(tbin); - int r0 = floor(rbin); - int o0 = floor(obin); - tbin -= t0; - rbin -= r0; - obin -= o0; - - for (int rl = 0; rl <= 1; rl++) { - int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl); - float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin); - if (rb >= 0 && rb <= 2) { - for (int tl = 0; tl <= 1; tl++) { - int tb = (t0 + tl) % ab; - float v_t = v_r * ((tl == 0) ? 
1.0f - tbin : tbin); + int y = fy + i; + int x = fx + j; + + float x_rot = (j * cos_t - i * sin_t) / hist_width; + float y_rot = (j * sin_t + i * cos_t) / hist_width; + float xbin = x_rot + d/2 - 0.5f; + float ybin = y_rot + d/2 - 0.5f; + + if (ybin > -1.0f && ybin < d && xbin > -1.0f && xbin < d && + y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) { + float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y)); + float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1)); + + float grad_mag = sqrt(dx*dx + dy*dy); + float grad_ori = atan2(dy, dx) - ori; + while (grad_ori < 0.0f) + grad_ori += PI_VAL*2; + while (grad_ori >= PI_VAL*2) + grad_ori -= PI_VAL*2; + + float w = exp(-(x_rot*x_rot + y_rot*y_rot) / exp_denom); + float obin = grad_ori * bins_per_rad; + float mag = grad_mag*w; + + int x0 = floor(xbin); + int y0 = floor(ybin); + int o0 = floor(obin); + xbin -= x0; + ybin -= y0; + obin -= o0; + + for (int yl = 0; yl <= 1; yl++) { + int yb = y0 + yl; + if (yb >= 0 && yb < d) { + float v_y = mag * ((yl == 0) ? 1.0f - ybin : ybin); + for (int xl = 0; xl <= 1; xl++) { + int xb = x0 + xl; + if (xb >= 0 && xb < d) { + float v_x = v_y * ((xl == 0) ? 1.0f - xbin : xbin); for (int ol = 0; ol <= 1; ol++) { - int ob = (o0 + ol) % hb; - float v_o = v_t * ((ol == 0) ? 1.0f - obin : obin); - unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob; - desc[idx] += v_o; + int ob = (o0 + ol) % n; + float v_o = v_x * ((ol == 0) ? 
1.0f - obin : obin); + desc[(yb*d + xb)*n + ob] += v_o; } } } } } } + } - normalizeDesc(desc, desc_len); + normalizeDesc(desc, desc_len); - for (int i = 0; i < (int)desc_len; i++) - desc[i] = min(desc[i], DescrMagThr); + for (int i = 0; i < (int)desc_len; i++) + desc[i] = min(desc[i], DescrMagThr); - normalizeDesc(desc, desc_len); + normalizeDesc(desc, desc_len); - // Calculate final descriptor values - for (int k = 0; k < (int)desc_len; k++) { - desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr)); - } + // Calculate final descriptor values + for (int k = 0; k < (int)desc_len; k++) { + desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr)); } } +} -#undef IPTR - - template - Array createInitialImage( - const Array& img, - const float init_sigma, - const bool double_input) - { +// Computes GLOH feature descriptors for features in an array. Based on Section III-B +// of Mikolajczyk and Schmid paper. +template +void computeGLOHDescriptor( + float* desc_out, + const unsigned desc_len, + const float* x_in, + const float* y_in, + const unsigned* layer_in, + const float* response_in, + const float* size_in, + const float* ori_in, + const unsigned total_feat, + const std::vector< Array >& gauss_pyr, + const int d, + const unsigned rb, + const unsigned ab, + const unsigned hb, + const float scale, + const unsigned octave, + const unsigned n_layers) +{ + float desc[272]; + + for (unsigned f = 0; f < total_feat; f++) { + const unsigned layer = layer_in[f]; + float ori = (360.f - ori_in[f]) * PI_VAL / 180.f; + ori = (ori > PI_VAL) ? 
ori - PI_VAL*2 : ori; + const float size = size_in[f]; + const int fx = round(x_in[f] * scale); + const int fy = round(y_in[f] * scale); + + // Points img to correct Gaussian pyramid layer + Array img = gauss_pyr[octave*(n_layers+3) + layer]; + const T* img_ptr = img.get(); af::dim4 idims = img.dims(); - Array init_img = createEmptyArray(af::dim4()); + float cos_t = cos(ori); + float sin_t = sin(ori); + float hist_bins_per_rad = hb / (PI_VAL * 2.f); + float polar_bins_per_rad = ab / (PI_VAL * 2.f); + float exp_denom = GLOHRadii[rb-1] * 0.5f; - float s = (double_input) ? std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4), 0.1f) - : std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma), 0.1f); + float hist_width = DescrSclFctr * size * scale * 0.5f; - Array filter = gauss_filter(s); + // Keep same descriptor radius used for SIFT + int radius = hist_width * sqrt(2.f) * (d + 1.f) * 0.5f + 0.5f; - if (double_input) { - Array double_img = resize(img, idims[0] * 2, idims[1] * 2, AF_INTERP_BILINEAR); - init_img = convolve2(double_img, filter, filter); - } - else { - init_img = convolve2(img, filter, filter); - } + // Alternative radius size calculation, changing the radius weight + // (rw) in the range of 0.25f-0.75f gives different results, + // increasing it tends to show a better recall rate but with a + // smaller amount of correct matches + //float rw = 0.5f; + //int radius = hist_width * GLOHRadii[rb-1] * rw + 0.5f; - return init_img; - } + int len = radius*2+1; - template - std::vector< Array > buildGaussPyr( - const Array& init_img, - const unsigned n_octaves, - const unsigned n_layers, - const float init_sigma) - { - // Precompute Gaussian sigmas using the following formula: - // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2 - std::vector sig_layers(n_layers + 3); - sig_layers[0] = init_sigma; - float k = std::pow(2.0f, 1.0f / n_layers); - for (unsigned i = 1; i < n_layers + 3; i++) { - float sig_prev = std::pow(k, i-1) * 
init_sigma; - float sig_total = sig_prev * k; - sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev); - } + for (int i = 0; i < (int)desc_len; i++) + desc[i] = 0.f; - // Gaussian Pyramid - std::vector< Array > gauss_pyr(n_octaves * (n_layers+3), createEmptyArray(af::dim4())); - for (unsigned o = 0; o < n_octaves; o++) { - for (unsigned l = 0; l < n_layers+3; l++) { - unsigned src_idx = (l == 0) ? (o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1; - unsigned idx = o*(n_layers+3) + l; + // Calculate orientation histogram + for (int l = 0; l < len*len; l++) { + int i = l / len - radius; + int j = l % len - radius; - if (o == 0 && l == 0) { - gauss_pyr[idx] = init_img; - } - else if (l == 0) { - af::dim4 sdims = gauss_pyr[src_idx].dims(); - gauss_pyr[idx] = resize(gauss_pyr[src_idx], sdims[0] / 2, sdims[1] / 2, AF_INTERP_BILINEAR); - } - else { - Array filter = gauss_filter(sig_layers[l]); + int y = fy + i; + int x = fx + j; + + float x_rot = (j * cos_t - i * sin_t); + float y_rot = (j * sin_t + i * cos_t); + + float r = sqrt(x_rot*x_rot + y_rot*y_rot) / radius * GLOHRadii[rb-1]; + float theta = atan2(y_rot, x_rot); + while (theta < 0.0f) + theta += PI_VAL*2; + while (theta >= PI_VAL*2) + theta -= PI_VAL*2; + + float tbin = theta * polar_bins_per_rad; + float rbin = (r < GLOHRadii[0]) ? r / GLOHRadii[0] : + ((r < GLOHRadii[1]) ? 
1 + (r - GLOHRadii[0]) / (float)(GLOHRadii[1] - GLOHRadii[0]) : + min(2 + (r - GLOHRadii[1]) / (float)(GLOHRadii[2] - GLOHRadii[1]), 3.f-FLT_EPSILON)); - gauss_pyr[idx] = convolve2(gauss_pyr[src_idx], filter, filter); + if (r <= GLOHRadii[rb-1] && + y > 0 && y < idims[0] - 1 && x > 0 && x < idims[1] - 1) { + float dx = (float)(IPTR(x+1, y) - IPTR(x-1, y)); + float dy = (float)(IPTR(x, y-1) - IPTR(x, y+1)); + + float grad_mag = sqrt(dx*dx + dy*dy); + float grad_ori = atan2(dy, dx) - ori; + while (grad_ori < 0.0f) + grad_ori += PI_VAL*2; + while (grad_ori >= PI_VAL*2) + grad_ori -= PI_VAL*2; + + float w = exp(-r / exp_denom); + float obin = grad_ori * hist_bins_per_rad; + float mag = grad_mag*w; + + int t0 = floor(tbin); + int r0 = floor(rbin); + int o0 = floor(obin); + tbin -= t0; + rbin -= r0; + obin -= o0; + + for (int rl = 0; rl <= 1; rl++) { + int rb = (rbin > 0.5f) ? (r0 + rl) : (r0 - rl); + float v_r = mag * ((rl == 0) ? 1.0f - rbin : rbin); + if (rb >= 0 && rb <= 2) { + for (int tl = 0; tl <= 1; tl++) { + int tb = (t0 + tl) % ab; + float v_t = v_r * ((tl == 0) ? 1.0f - tbin : tbin); + for (int ol = 0; ol <= 1; ol++) { + int ob = (o0 + ol) % hb; + float v_o = v_t * ((ol == 0) ? 
1.0f - obin : obin); + unsigned idx = (rb > 0) * (hb + ((rb-1) * ab + tb)*hb) + ob; + desc[idx] += v_o; + } + } + } } } } - return gauss_pyr; - } + normalizeDesc(desc, desc_len); - template - std::vector< Array > buildDoGPyr( - std::vector< Array >& gauss_pyr, - const unsigned n_octaves, - const unsigned n_layers) - { - // DoG Pyramid - std::vector< Array > dog_pyr(n_octaves * (n_layers+2), createEmptyArray(af::dim4())); - for (unsigned o = 0; o < n_octaves; o++) { - for (unsigned l = 0; l < n_layers+2; l++) { - unsigned idx = o*(n_layers+2) + l; - unsigned bottom = o*(n_layers+3) + l; - unsigned top = o*(n_layers+3) + l+1; + for (int i = 0; i < (int)desc_len; i++) + desc[i] = min(desc[i], DescrMagThr); - dog_pyr[idx] = createEmptyArray(gauss_pyr[bottom].dims()); + normalizeDesc(desc, desc_len); - sub(dog_pyr[idx], gauss_pyr[top], gauss_pyr[bottom]); - } + // Calculate final descriptor values + for (int k = 0; k < (int)desc_len; k++) { + desc_out[f*desc_len+k] = round(min(255.f, desc[k] * IntDescrFctr)); } - - return dog_pyr; } +} +#undef IPTR - template - unsigned sift_impl(Array& x, Array& y, Array& score, - Array& ori, Array& size, Array& desc, - const Array& in, const unsigned n_layers, - const float contrast_thr, const float edge_thr, - const float init_sigma, const bool double_input, - const float img_scale, const float feature_ratio, - const bool compute_GLOH) - { - in.eval(); - af::dim4 idims = in.dims(); +template +Array createInitialImage( + const Array& img, + const float init_sigma, + const bool double_input) +{ + af::dim4 idims = img.dims(); - const unsigned min_dim = (double_input) ? min(idims[0]*2, idims[1]*2) - : min(idims[0], idims[1]); - const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2; + Array init_img = createEmptyArray(af::dim4()); - Array init_img = createInitialImage(in, init_sigma, double_input); + float s = (double_input) ? 
std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma * 4), 0.1f) + : std::max((float)sqrt(init_sigma * init_sigma - InitSigma * InitSigma), 0.1f); - std::vector< Array > gauss_pyr = buildGaussPyr(init_img, n_octaves, n_layers, init_sigma); + Array filter = gauss_filter(s); - std::vector< Array > dog_pyr = buildDoGPyr(gauss_pyr, n_octaves, n_layers); + if (double_input) { + Array double_img = resize(img, idims[0] * 2, idims[1] * 2, AF_INTERP_BILINEAR); + init_img = convolve2(double_img, filter, filter); + } + else { + init_img = convolve2(img, filter, filter); + } - std::vector x_pyr(n_octaves, NULL); - std::vector y_pyr(n_octaves, NULL); - std::vector response_pyr(n_octaves, NULL); - std::vector size_pyr(n_octaves, NULL); - std::vector ori_pyr(n_octaves, NULL); - std::vector desc_pyr(n_octaves, NULL); - std::vector feat_pyr(n_octaves, 0); - unsigned total_feat = 0; + return init_img; +} - const unsigned d = DescrWidth; - const unsigned n = DescrHistBins; - const unsigned rb = GLOHRadialBins; - const unsigned ab = GLOHAngularBins; - const unsigned hb = GLOHHistBins; - const unsigned desc_len = (compute_GLOH) ? 
(1 + (rb-1) * ab) * hb : d*d*n; +template +std::vector< Array > buildGaussPyr( + const Array& init_img, + const unsigned n_octaves, + const unsigned n_layers, + const float init_sigma) +{ + // Precompute Gaussian sigmas using the following formula: + // \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2 + std::vector sig_layers(n_layers + 3); + sig_layers[0] = init_sigma; + float k = std::pow(2.0f, 1.0f / n_layers); + for (unsigned i = 1; i < n_layers + 3; i++) { + float sig_prev = std::pow(k, i-1) * init_sigma; + float sig_total = sig_prev * k; + sig_layers[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev); + } - for (unsigned i = 0; i < n_octaves; i++) { - af::dim4 ddims = dog_pyr[i*(n_layers+2)].dims(); - if (ddims[0]-2*ImgBorder < 1 || - ddims[1]-2*ImgBorder < 1) - continue; + // Gaussian Pyramid + std::vector< Array > gauss_pyr(n_octaves * (n_layers+3), createEmptyArray(af::dim4())); + for (unsigned o = 0; o < n_octaves; o++) { + for (unsigned l = 0; l < n_layers+3; l++) { + unsigned src_idx = (l == 0) ? 
(o-1)*(n_layers+3) + n_layers : o*(n_layers+3) + l-1; + unsigned idx = o*(n_layers+3) + l; - const unsigned imel = ddims[0] * ddims[1]; - const unsigned max_feat = ceil(imel * feature_ratio); + if (o == 0 && l == 0) { + gauss_pyr[idx] = init_img; + } + else if (l == 0) { + af::dim4 sdims = gauss_pyr[src_idx].dims(); + gauss_pyr[idx] = resize(gauss_pyr[src_idx], sdims[0] / 2, sdims[1] / 2, AF_INTERP_BILINEAR); + } + else { + Array filter = gauss_filter(sig_layers[l]); - float* extrema_x = memAlloc(max_feat); - float* extrema_y = memAlloc(max_feat); - unsigned* extrema_layer = memAlloc(max_feat); - unsigned extrema_feat = 0; + gauss_pyr[idx] = convolve2(gauss_pyr[src_idx], filter, filter); + } + } + } - for (unsigned j = 1; j <= n_layers; j++) { - unsigned prev = i*(n_layers+2) + j-1; - unsigned center = i*(n_layers+2) + j; - unsigned next = i*(n_layers+2) + j+1; + return gauss_pyr; +} - unsigned layer = j; +template +std::vector< Array > buildDoGPyr( + std::vector< Array >& gauss_pyr, + const unsigned n_octaves, + const unsigned n_layers) +{ + // DoG Pyramid + std::vector< Array > dog_pyr(n_octaves * (n_layers+2), createEmptyArray(af::dim4())); + for (unsigned o = 0; o < n_octaves; o++) { + for (unsigned l = 0; l < n_layers+2; l++) { + unsigned idx = o*(n_layers+2) + l; + unsigned bottom = o*(n_layers+3) + l; + unsigned top = o*(n_layers+3) + l+1; - float extrema_thr = 0.5f * contrast_thr / n_layers; - detectExtrema(extrema_x, extrema_y, extrema_layer, &extrema_feat, - dog_pyr[prev], dog_pyr[center], dog_pyr[next], - layer, max_feat, extrema_thr); - } + dog_pyr[idx] = createEmptyArray(gauss_pyr[bottom].dims()); - extrema_feat = min(extrema_feat, max_feat); + sub(dog_pyr[idx], gauss_pyr[top], gauss_pyr[bottom]); + } + } - if (extrema_feat == 0) { - memFree(extrema_x); - memFree(extrema_y); - memFree(extrema_layer); + return dog_pyr; +} - continue; - } - unsigned interp_feat = 0; +template +unsigned sift_impl(Array& x, Array& y, Array& score, + Array& ori, Array& 
size, Array& desc, + const Array& in, const unsigned n_layers, + const float contrast_thr, const float edge_thr, + const float init_sigma, const bool double_input, + const float img_scale, const float feature_ratio, + const bool compute_GLOH) +{ + in.eval(); + af::dim4 idims = in.dims(); + + const unsigned min_dim = (double_input) ? min(idims[0]*2, idims[1]*2) + : min(idims[0], idims[1]); + const unsigned n_octaves = floor(log(min_dim) / log(2)) - 2; + + Array init_img = createInitialImage(in, init_sigma, double_input); + + std::vector< Array > gauss_pyr = buildGaussPyr(init_img, n_octaves, n_layers, init_sigma); + + std::vector< Array > dog_pyr = buildDoGPyr(gauss_pyr, n_octaves, n_layers); + + std::vector x_pyr(n_octaves, NULL); + std::vector y_pyr(n_octaves, NULL); + std::vector response_pyr(n_octaves, NULL); + std::vector size_pyr(n_octaves, NULL); + std::vector ori_pyr(n_octaves, NULL); + std::vector desc_pyr(n_octaves, NULL); + std::vector feat_pyr(n_octaves, 0); + unsigned total_feat = 0; + + const unsigned d = DescrWidth; + const unsigned n = DescrHistBins; + const unsigned rb = GLOHRadialBins; + const unsigned ab = GLOHAngularBins; + const unsigned hb = GLOHHistBins; + const unsigned desc_len = (compute_GLOH) ? 
(1 + (rb-1) * ab) * hb : d*d*n; + + for (unsigned i = 0; i < n_octaves; i++) { + af::dim4 ddims = dog_pyr[i*(n_layers+2)].dims(); + if (ddims[0]-2*ImgBorder < 1 || + ddims[1]-2*ImgBorder < 1) + continue; + + const unsigned imel = ddims[0] * ddims[1]; + const unsigned max_feat = ceil(imel * feature_ratio); + + float* extrema_x = memAlloc(max_feat); + float* extrema_y = memAlloc(max_feat); + unsigned* extrema_layer = memAlloc(max_feat); + unsigned extrema_feat = 0; + + for (unsigned j = 1; j <= n_layers; j++) { + unsigned prev = i*(n_layers+2) + j-1; + unsigned center = i*(n_layers+2) + j; + unsigned next = i*(n_layers+2) + j+1; + + unsigned layer = j; + + float extrema_thr = 0.5f * contrast_thr / n_layers; + detectExtrema(extrema_x, extrema_y, extrema_layer, &extrema_feat, + dog_pyr[prev], dog_pyr[center], dog_pyr[next], + layer, max_feat, extrema_thr); + } + + extrema_feat = min(extrema_feat, max_feat); - float* interp_x = memAlloc(extrema_feat); - float* interp_y = memAlloc(extrema_feat); - unsigned* interp_layer = memAlloc(extrema_feat); - float* interp_response = memAlloc(extrema_feat); - float* interp_size = memAlloc(extrema_feat); + if (extrema_feat == 0) { + memFree(extrema_x); + memFree(extrema_y); + memFree(extrema_layer); - interpolateExtrema(interp_x, interp_y, interp_layer, - interp_response, interp_size, &interp_feat, - extrema_x, extrema_y, extrema_layer, extrema_feat, - dog_pyr, max_feat, i, n_layers, - contrast_thr, edge_thr, init_sigma, img_scale); + continue; + } - interp_feat = min(interp_feat, max_feat); + unsigned interp_feat = 0; - if (interp_feat == 0) { - memFree(interp_x); - memFree(interp_y); - memFree(interp_layer); - memFree(interp_response); - memFree(interp_size); + float* interp_x = memAlloc(extrema_feat); + float* interp_y = memAlloc(extrema_feat); + unsigned* interp_layer = memAlloc(extrema_feat); + float* interp_response = memAlloc(extrema_feat); + float* interp_size = memAlloc(extrema_feat); - continue; - } + 
interpolateExtrema(interp_x, interp_y, interp_layer, + interp_response, interp_size, &interp_feat, + extrema_x, extrema_y, extrema_layer, extrema_feat, + dog_pyr, max_feat, i, n_layers, + contrast_thr, edge_thr, init_sigma, img_scale); - std::vector sorted_feat; - array_to_feat(sorted_feat, interp_x, interp_y, interp_layer, interp_response, interp_size, interp_feat); - std::stable_sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp); + interp_feat = min(interp_feat, max_feat); + if (interp_feat == 0) { memFree(interp_x); memFree(interp_y); memFree(interp_layer); memFree(interp_response); memFree(interp_size); - unsigned nodup_feat = 0; - - float* nodup_x = memAlloc(interp_feat); - float* nodup_y = memAlloc(interp_feat); - unsigned* nodup_layer = memAlloc(interp_feat); - float* nodup_response = memAlloc(interp_feat); - float* nodup_size = memAlloc(interp_feat); - - removeDuplicates(nodup_x, nodup_y, nodup_layer, - nodup_response, nodup_size, &nodup_feat, - sorted_feat); - - const unsigned max_oriented_feat = nodup_feat * 3; - - float* oriented_x = memAlloc(max_oriented_feat); - float* oriented_y = memAlloc(max_oriented_feat); - unsigned* oriented_layer = memAlloc(max_oriented_feat); - float* oriented_response = memAlloc(max_oriented_feat); - float* oriented_size = memAlloc(max_oriented_feat); - float* oriented_ori = memAlloc(max_oriented_feat); - - unsigned oriented_feat = 0; - - calcOrientation(oriented_x, oriented_y, oriented_layer, - oriented_response, oriented_size, oriented_ori, &oriented_feat, - nodup_x, nodup_y, nodup_layer, - nodup_response, nodup_size, nodup_feat, - gauss_pyr, max_oriented_feat, i, n_layers, double_input); - - memFree(nodup_x); - memFree(nodup_y); - memFree(nodup_layer); - memFree(nodup_response); - memFree(nodup_size); - - if (oriented_feat == 0) { - memFree(oriented_x); - memFree(oriented_y); - memFree(oriented_layer); - memFree(oriented_response); - memFree(oriented_size); - memFree(oriented_ori); + continue; + } - continue; - } + 
std::vector sorted_feat; + array_to_feat(sorted_feat, interp_x, interp_y, interp_layer, interp_response, interp_size, interp_feat); + std::stable_sort(sorted_feat.begin(), sorted_feat.end(), feat_cmp); + + memFree(interp_x); + memFree(interp_y); + memFree(interp_layer); + memFree(interp_response); + memFree(interp_size); + + unsigned nodup_feat = 0; + + float* nodup_x = memAlloc(interp_feat); + float* nodup_y = memAlloc(interp_feat); + unsigned* nodup_layer = memAlloc(interp_feat); + float* nodup_response = memAlloc(interp_feat); + float* nodup_size = memAlloc(interp_feat); + + removeDuplicates(nodup_x, nodup_y, nodup_layer, + nodup_response, nodup_size, &nodup_feat, + sorted_feat); + + const unsigned max_oriented_feat = nodup_feat * 3; + + float* oriented_x = memAlloc(max_oriented_feat); + float* oriented_y = memAlloc(max_oriented_feat); + unsigned* oriented_layer = memAlloc(max_oriented_feat); + float* oriented_response = memAlloc(max_oriented_feat); + float* oriented_size = memAlloc(max_oriented_feat); + float* oriented_ori = memAlloc(max_oriented_feat); + + unsigned oriented_feat = 0; + + calcOrientation(oriented_x, oriented_y, oriented_layer, + oriented_response, oriented_size, oriented_ori, &oriented_feat, + nodup_x, nodup_y, nodup_layer, + nodup_response, nodup_size, nodup_feat, + gauss_pyr, max_oriented_feat, i, n_layers, double_input); + + memFree(nodup_x); + memFree(nodup_y); + memFree(nodup_layer); + memFree(nodup_response); + memFree(nodup_size); + + if (oriented_feat == 0) { + memFree(oriented_x); + memFree(oriented_y); + memFree(oriented_layer); + memFree(oriented_response); + memFree(oriented_size); + memFree(oriented_ori); + + continue; + } - float* desc = memAlloc(oriented_feat * desc_len); + float* desc = memAlloc(oriented_feat * desc_len); - float scale = 1.f/(1 << i); - if (double_input) scale *= 2.f; + float scale = 1.f/(1 << i); + if (double_input) scale *= 2.f; - if (compute_GLOH) - computeGLOHDescriptor(desc, desc_len, - oriented_x, 
oriented_y, oriented_layer, - oriented_response, oriented_size, oriented_ori, - oriented_feat, gauss_pyr, d, rb, ab, hb, - scale, i, n_layers); - else - computeDescriptor(desc, desc_len, + if (compute_GLOH) + computeGLOHDescriptor(desc, desc_len, oriented_x, oriented_y, oriented_layer, oriented_response, oriented_size, oriented_ori, - oriented_feat, gauss_pyr, d, n, scale, i, n_layers); - - total_feat += oriented_feat; - feat_pyr[i] = oriented_feat; - - if (oriented_feat > 0) { - x_pyr[i] = oriented_x; - y_pyr[i] = oriented_y; - response_pyr[i] = oriented_response; - ori_pyr[i] = oriented_ori; - size_pyr[i] = oriented_size; - desc_pyr[i] = desc; - } + oriented_feat, gauss_pyr, d, rb, ab, hb, + scale, i, n_layers); + else + computeDescriptor(desc, desc_len, + oriented_x, oriented_y, oriented_layer, + oriented_response, oriented_size, oriented_ori, + oriented_feat, gauss_pyr, d, n, scale, i, n_layers); + + total_feat += oriented_feat; + feat_pyr[i] = oriented_feat; + + if (oriented_feat > 0) { + x_pyr[i] = oriented_x; + y_pyr[i] = oriented_y; + response_pyr[i] = oriented_response; + ori_pyr[i] = oriented_ori; + size_pyr[i] = oriented_size; + desc_pyr[i] = desc; } + } - if (total_feat > 0) { - const af::dim4 total_feat_dims(total_feat); - const af::dim4 desc_dims(desc_len, total_feat); - - // Allocate output memory - x = createEmptyArray(total_feat_dims); - y = createEmptyArray(total_feat_dims); - score = createEmptyArray(total_feat_dims); - ori = createEmptyArray(total_feat_dims); - size = createEmptyArray(total_feat_dims); - desc = createEmptyArray(desc_dims); - - float* x_ptr = x.get(); - float* y_ptr = y.get(); - float* score_ptr = score.get(); - float* ori_ptr = ori.get(); - float* size_ptr = size.get(); - float* desc_ptr = desc.get(); - - unsigned offset = 0; - for (unsigned i = 0; i < n_octaves; i++) { - if (feat_pyr[i] == 0) - continue; - - memcpy(x_ptr+offset, x_pyr[i], feat_pyr[i] * sizeof(float)); - memcpy(y_ptr+offset, y_pyr[i], feat_pyr[i] * 
sizeof(float)); - memcpy(score_ptr+offset, response_pyr[i], feat_pyr[i] * sizeof(float)); - memcpy(ori_ptr+offset, ori_pyr[i], feat_pyr[i] * sizeof(float)); - memcpy(size_ptr+offset, size_pyr[i], feat_pyr[i] * sizeof(float)); - - memcpy(desc_ptr+(offset*desc_len), desc_pyr[i], feat_pyr[i] * desc_len * sizeof(float)); - - memFree(x_pyr[i]); - memFree(y_pyr[i]); - memFree(response_pyr[i]); - memFree(ori_pyr[i]); - memFree(size_pyr[i]); - memFree(desc_pyr[i]); - - offset += feat_pyr[i]; - } - } + if (total_feat > 0) { + const af::dim4 total_feat_dims(total_feat); + const af::dim4 desc_dims(desc_len, total_feat); + + // Allocate output memory + x = createEmptyArray(total_feat_dims); + y = createEmptyArray(total_feat_dims); + score = createEmptyArray(total_feat_dims); + ori = createEmptyArray(total_feat_dims); + size = createEmptyArray(total_feat_dims); + desc = createEmptyArray(desc_dims); + + float* x_ptr = x.get(); + float* y_ptr = y.get(); + float* score_ptr = score.get(); + float* ori_ptr = ori.get(); + float* size_ptr = size.get(); + float* desc_ptr = desc.get(); + + unsigned offset = 0; + for (unsigned i = 0; i < n_octaves; i++) { + if (feat_pyr[i] == 0) + continue; + + memcpy(x_ptr+offset, x_pyr[i], feat_pyr[i] * sizeof(float)); + memcpy(y_ptr+offset, y_pyr[i], feat_pyr[i] * sizeof(float)); + memcpy(score_ptr+offset, response_pyr[i], feat_pyr[i] * sizeof(float)); + memcpy(ori_ptr+offset, ori_pyr[i], feat_pyr[i] * sizeof(float)); + memcpy(size_ptr+offset, size_pyr[i], feat_pyr[i] * sizeof(float)); + + memcpy(desc_ptr+(offset*desc_len), desc_pyr[i], feat_pyr[i] * desc_len * sizeof(float)); - return total_feat; + memFree(x_pyr[i]); + memFree(y_pyr[i]); + memFree(response_pyr[i]); + memFree(ori_pyr[i]); + memFree(size_pyr[i]); + memFree(desc_pyr[i]); + + offset += feat_pyr[i]; + } } + + return total_feat; +} + } diff --git a/src/backend/cpu/sobel.cpp b/src/backend/cpu/sobel.cpp index 9f683fc450..ba47ba9fd6 100644 --- a/src/backend/cpu/sobel.cpp +++ 
b/src/backend/cpu/sobel.cpp @@ -91,6 +91,7 @@ template std::pair< Array, Array > sobelDerivatives(const Array &img, const unsigned &ker_size) { + img.eval(); // ket_size is for future proofing, this argument is not used // currently Array dx = createEmptyArray(img.dims()); diff --git a/src/backend/cpu/solve.cpp b/src/backend/cpu/solve.cpp index b279971c7b..0243088fb3 100644 --- a/src/backend/cpu/solve.cpp +++ b/src/backend/cpu/solve.cpp @@ -75,6 +75,10 @@ template Array solveLU(const Array &A, const Array &pivot, const Array &b, const af_mat_prop options) { + A.eval(); + pivot.eval(); + b.eval(); + int N = A.dims()[0]; int NRHS = b.dims()[1]; Array< T > B = copyArray(b); @@ -114,9 +118,10 @@ Array triangleSolve(const Array &A, const Array &b, const af_mat_prop o template Array solve(const Array &a, const Array &b, const af_mat_prop options) { + a.eval(); + b.eval(); - if (options & AF_MAT_UPPER || - options & AF_MAT_LOWER) { + if (options & AF_MAT_UPPER || options & AF_MAT_LOWER) { return triangleSolve(a, b, options); } @@ -178,6 +183,7 @@ Array solve(const Array &a, const Array &b, const af_mat_prop options) namespace cpu { + #define INSTANTIATE_SOLVE(T) \ template Array solve(const Array &a, const Array &b, \ const af_mat_prop options); \ @@ -188,4 +194,5 @@ INSTANTIATE_SOLVE(float) INSTANTIATE_SOLVE(cfloat) INSTANTIATE_SOLVE(double) INSTANTIATE_SOLVE(cdouble) + } diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp index 94d70a8e49..cbdb50e987 100644 --- a/src/backend/cpu/sort.cpp +++ b/src/backend/cpu/sort.cpp @@ -25,65 +25,69 @@ using std::function; namespace cpu { - /////////////////////////////////////////////////////////////////////////// - // Kernel Functions - /////////////////////////////////////////////////////////////////////////// - // Based off of http://stackoverflow.com/a/12399290 - template - void sort0(Array val) - { - // initialize original index locations - T *val_ptr = val.get(); 
+/////////////////////////////////////////////////////////////////////////// +// Kernel Functions +/////////////////////////////////////////////////////////////////////////// - function op = greater(); - if(isAscending) { op = less(); } +// Based off of http://stackoverflow.com/a/12399290 +template +void sort0(Array val) +{ + // initialize original index locations + T *val_ptr = val.get(); + + function op = greater(); + if(isAscending) { op = less(); } - T *comp_ptr = nullptr; - for(dim_t w = 0; w < val.dims()[3]; w++) { - dim_t valW = w * val.strides()[3]; - for(dim_t z = 0; z < val.dims()[2]; z++) { - dim_t valWZ = valW + z * val.strides()[2]; - for(dim_t y = 0; y < val.dims()[1]; y++) { + T *comp_ptr = nullptr; + for(dim_t w = 0; w < val.dims()[3]; w++) { + dim_t valW = w * val.strides()[3]; + for(dim_t z = 0; z < val.dims()[2]; z++) { + dim_t valWZ = valW + z * val.strides()[2]; + for(dim_t y = 0; y < val.dims()[1]; y++) { - dim_t valOffset = valWZ + y * val.strides()[1]; + dim_t valOffset = valWZ + y * val.strides()[1]; - comp_ptr = val_ptr + valOffset; - std::sort(comp_ptr, comp_ptr + val.dims()[0], op); - } + comp_ptr = val_ptr + valOffset; + std::sort(comp_ptr, comp_ptr + val.dims()[0], op); } } - return; } + return; +} - /////////////////////////////////////////////////////////////////////////// - // Wrapper Functions - /////////////////////////////////////////////////////////////////////////// - template - Array sort(const Array &in, const unsigned dim) - { - Array out = copyArray(in); - switch(dim) { - case 0: getQueue().enqueue(sort0, out); break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); - } - return out; +/////////////////////////////////////////////////////////////////////////// +// Wrapper Functions +/////////////////////////////////////////////////////////////////////////// +template +Array sort(const Array &in, const unsigned dim) +{ + in.eval(); + + Array out = copyArray(in); + switch(dim) { + case 0: getQueue().enqueue(sort0, 
out); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } + return out; +} #define INSTANTIATE(T) \ template Array sort(const Array &in, const unsigned dim); \ template Array sort(const Array &in, const unsigned dim); \ - INSTANTIATE(float) - INSTANTIATE(double) - //INSTANTIATE(cfloat) - //INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(char) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) - INSTANTIATE(intl) - INSTANTIATE(uintl) +INSTANTIATE(float) +INSTANTIATE(double) +//INSTANTIATE(cfloat) +//INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(char) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) +INSTANTIATE(intl) +INSTANTIATE(uintl) + } diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp index f07d585b41..f9415345ae 100644 --- a/src/backend/cpu/sort_index.cpp +++ b/src/backend/cpu/sort_index.cpp @@ -23,68 +23,71 @@ using std::sort; namespace cpu { - /////////////////////////////////////////////////////////////////////////// - // Kernel Functions - /////////////////////////////////////////////////////////////////////////// - template - void sort0_index(Array &val, Array &idx, const Array &in) - { - // initialize original index locations - uint *idx_ptr = idx.get(); - T *val_ptr = val.get(); - const T *in_ptr = in.get(); - function op = greater(); - if(isAscending) { op = less(); } - - std::vector seq_vec(idx.dims()[0]); - std::iota(seq_vec.begin(), seq_vec.end(), 0); - - const T *comp_ptr = nullptr; - auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);}; - - for(dim_t w = 0; w < in.dims()[3]; w++) { - dim_t valW = w * val.strides()[3]; - dim_t idxW = w * idx.strides()[3]; - dim_t inW = w * in.strides()[3]; - for(dim_t z = 0; z < in.dims()[2]; z++) { - dim_t valWZ = valW + z * val.strides()[2]; - dim_t idxWZ = idxW + z * idx.strides()[2]; - dim_t inWZ = inW + z * in.strides()[2]; - for(dim_t y = 0; y < 
in.dims()[1]; y++) { - - dim_t valOffset = valWZ + y * val.strides()[1]; - dim_t idxOffset = idxWZ + y * idx.strides()[1]; - dim_t inOffset = inWZ + y * in.strides()[1]; - - uint *ptr = idx_ptr + idxOffset; - std::copy(seq_vec.begin(), seq_vec.end(), ptr); - - comp_ptr = in_ptr + inOffset; - std::stable_sort(ptr, ptr + in.dims()[0], comparator); - - for (dim_t i = 0; i < val.dims()[0]; ++i){ - val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]]; - } + +/////////////////////////////////////////////////////////////////////////// +// Kernel Functions +/////////////////////////////////////////////////////////////////////////// +template +void sort0_index(Array &val, Array &idx, const Array &in) +{ + // initialize original index locations + uint *idx_ptr = idx.get(); + T *val_ptr = val.get(); + const T *in_ptr = in.get(); + function op = greater(); + if(isAscending) { op = less(); } + + std::vector seq_vec(idx.dims()[0]); + std::iota(seq_vec.begin(), seq_vec.end(), 0); + + const T *comp_ptr = nullptr; + auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);}; + + for(dim_t w = 0; w < in.dims()[3]; w++) { + dim_t valW = w * val.strides()[3]; + dim_t idxW = w * idx.strides()[3]; + dim_t inW = w * in.strides()[3]; + for(dim_t z = 0; z < in.dims()[2]; z++) { + dim_t valWZ = valW + z * val.strides()[2]; + dim_t idxWZ = idxW + z * idx.strides()[2]; + dim_t inWZ = inW + z * in.strides()[2]; + for(dim_t y = 0; y < in.dims()[1]; y++) { + + dim_t valOffset = valWZ + y * val.strides()[1]; + dim_t idxOffset = idxWZ + y * idx.strides()[1]; + dim_t inOffset = inWZ + y * in.strides()[1]; + + uint *ptr = idx_ptr + idxOffset; + std::copy(seq_vec.begin(), seq_vec.end(), ptr); + + comp_ptr = in_ptr + inOffset; + std::stable_sort(ptr, ptr + in.dims()[0], comparator); + + for (dim_t i = 0; i < val.dims()[0]; ++i){ + val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]]; } } } - - return; } - 
/////////////////////////////////////////////////////////////////////////// - // Wrapper Functions - /////////////////////////////////////////////////////////////////////////// - template - void sort_index(Array &val, Array &idx, const Array &in, const uint dim) - { - val = createEmptyArray(in.dims()); - idx = createEmptyArray(in.dims()); - switch(dim) { - case 0: getQueue().enqueue(sort0_index, val, idx, in); break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); - } + return; +} + +/////////////////////////////////////////////////////////////////////////// +// Wrapper Functions +/////////////////////////////////////////////////////////////////////////// +template +void sort_index(Array &val, Array &idx, const Array &in, const uint dim) +{ + in.eval(); + + val = createEmptyArray(in.dims()); + idx = createEmptyArray(in.dims()); + switch(dim) { + case 0: getQueue().enqueue(sort0_index, val, idx, in); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } +} #define INSTANTIATE(T) \ template void sort_index(Array &val, Array &idx, const Array &in, \ @@ -92,16 +95,17 @@ namespace cpu template void sort_index(Array &val, Array &idx, const Array &in, \ const uint dim); \ - INSTANTIATE(float) - INSTANTIATE(double) - //INSTANTIATE(cfloat) - //INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(char) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) - INSTANTIATE(intl) - INSTANTIATE(uintl) +INSTANTIATE(float) +INSTANTIATE(double) +//INSTANTIATE(cfloat) +//INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(char) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) +INSTANTIATE(intl) +INSTANTIATE(uintl) + } diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp index e2c908c378..c278908e40 100644 --- a/src/backend/cpu/susan.cpp +++ b/src/backend/cpu/susan.cpp @@ -106,6 +106,8 @@ unsigned susan(Array &x_out, Array &y_out, Array &resp_out, const unsigned radius, const float 
diff_thr, const float geom_thr, const float feature_ratio, const unsigned edge) { + in.eval(); + dim4 idims = in.dims(); const unsigned corner_lim = in.elements() * feature_ratio; diff --git a/src/backend/cpu/svd.cpp b/src/backend/cpu/svd.cpp index 39cbb66343..92912ca616 100644 --- a/src/backend/cpu/svd.cpp +++ b/src/backend/cpu/svd.cpp @@ -30,101 +30,106 @@ namespace cpu #if defined(USE_MKL) || defined(__APPLE__) - template - using svd_func_def = int (*)(ORDER_TYPE, - char jobz, - int m, int n, - T* in, int ldin, - Tr* s, - T* u, int ldu, - T* vt, int ldvt); - - SVD_FUNC_DEF( gesdd ) - SVD_FUNC(gesdd, float , float , s) - SVD_FUNC(gesdd, double , double, d) - SVD_FUNC(gesdd, cfloat , float , c) - SVD_FUNC(gesdd, cdouble, double, z) +template +using svd_func_def = int (*)(ORDER_TYPE, + char jobz, + int m, int n, + T* in, int ldin, + Tr* s, + T* u, int ldu, + T* vt, int ldvt); + +SVD_FUNC_DEF( gesdd ) +SVD_FUNC(gesdd, float , float , s) +SVD_FUNC(gesdd, double , double, d) +SVD_FUNC(gesdd, cfloat , float , c) +SVD_FUNC(gesdd, cdouble, double, z) #else // Atlas causes memory freeing issues with using gesdd - template - using svd_func_def = int (*)(ORDER_TYPE, - char jobu, char jobvt, - int m, int n, - T* in, int ldin, - Tr* s, - T* u, int ldu, - T* vt, int ldvt, - Tr *superb); - - SVD_FUNC_DEF( gesvd ) - SVD_FUNC(gesvd, float , float , s) - SVD_FUNC(gesvd, double , double, d) - SVD_FUNC(gesvd, cfloat , float , c) - SVD_FUNC(gesvd, cdouble, double, z) +template +using svd_func_def = int (*)(ORDER_TYPE, + char jobu, char jobvt, + int m, int n, + T* in, int ldin, + Tr* s, + T* u, int ldu, + T* vt, int ldvt, + Tr *superb); + +SVD_FUNC_DEF( gesvd ) +SVD_FUNC(gesvd, float , float , s) +SVD_FUNC(gesvd, double , double, d) +SVD_FUNC(gesvd, cfloat , float , c) +SVD_FUNC(gesvd, cdouble, double, z) #endif - template - void svdInPlace(Array &s, Array &u, Array &vt, Array &in) - { - s.eval(); - u.eval(); - vt.eval(); - in.eval(); +template +void svdInPlace(Array &s, Array &u, 
Array &vt, Array &in) +{ + s.eval(); + u.eval(); + vt.eval(); + in.eval(); - auto func = [=] (Array s, Array u, Array vt, Array in) { - dim4 iDims = in.dims(); - int M = iDims[0]; - int N = iDims[1]; + auto func = [=] (Array s, Array u, Array vt, Array in) { + dim4 iDims = in.dims(); + int M = iDims[0]; + int N = iDims[1]; #if defined(USE_MKL) || defined(__APPLE__) - svd_func()(AF_LAPACK_COL_MAJOR, 'A', M, N, in.get(), in.strides()[1], - s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1]); + svd_func()(AF_LAPACK_COL_MAJOR, 'A', M, N, in.get(), in.strides()[1], + s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1]); #else - std::vector superb(std::min(M, N)); - svd_func()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, in.get(), in.strides()[1], - s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1], &superb[0]); + std::vector superb(std::min(M, N)); + svd_func()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, in.get(), in.strides()[1], + s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1], &superb[0]); #endif - }; - getQueue().enqueue(func, s, u, vt, in); - } - - template - void svd(Array &s, Array &u, Array &vt, const Array &in) - { - Array in_copy = copyArray(in); - svdInPlace(s, u, vt, in_copy); - } + }; + getQueue().enqueue(func, s, u, vt, in); +} + +template +void svd(Array &s, Array &u, Array &vt, const Array &in) +{ + Array in_copy = copyArray(in); + svdInPlace(s, u, vt, in_copy); +} + } #else namespace cpu { - template - void svd(Array &s, Array &u, Array &vt, const Array &in) - { - AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED); - } - - template - void svdInPlace(Array &s, Array &u, Array &vt, Array &in) - { - AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED); - } + +template +void svd(Array &s, Array &u, Array &vt, const Array &in) +{ + AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED); +} + +template +void svdInPlace(Array &s, Array &u, Array &vt, Array &in) +{ + AF_ERROR("Linear 
Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED); +} + } #endif -namespace cpu { +namespace cpu +{ #define INSTANTIATE_SVD(T, Tr) \ template void svd(Array & s, Array & u, Array & vt, const Array &in); \ template void svdInPlace(Array & s, Array & u, Array & vt, Array &in); - INSTANTIATE_SVD(float , float ) - INSTANTIATE_SVD(double , double) - INSTANTIATE_SVD(cfloat , float ) - INSTANTIATE_SVD(cdouble, double) +INSTANTIATE_SVD(float , float ) +INSTANTIATE_SVD(double , double) +INSTANTIATE_SVD(cfloat , float ) +INSTANTIATE_SVD(cdouble, double) + } diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp index f4a05148c5..a7287ceea0 100644 --- a/src/backend/cpu/transform.cpp +++ b/src/backend/cpu/transform.cpp @@ -107,8 +107,10 @@ template Array transform(const Array &in, const Array &transform, const af::dim4 &odims, const af_interp_type method, const bool inverse) { - Array out = createEmptyArray(odims); in.eval(); + transform.eval(); + + Array out = createEmptyArray(odims); switch(method) { case AF_INTERP_NEAREST : diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp index c3a8a37a72..7e7eec1747 100644 --- a/src/backend/cpu/transpose.cpp +++ b/src/backend/cpu/transpose.cpp @@ -171,5 +171,4 @@ INSTANTIATE(uintl ) INSTANTIATE(short) INSTANTIATE(ushort) - } diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp index ed7f348bad..13bee164eb 100644 --- a/src/backend/cpu/triangle.cpp +++ b/src/backend/cpu/triangle.cpp @@ -66,6 +66,7 @@ void triangle(Array &out, const Array &in) template Array triangle(const Array &in) { + in.eval(); Array out = createEmptyArray(in.dims()); triangle(out, in); return out; @@ -81,17 +82,17 @@ Array triangle(const Array &in) template Array triangle(const Array &in); \ template Array triangle(const Array &in); \ - INSTANTIATE(float) - INSTANTIATE(double) - INSTANTIATE(cfloat) - INSTANTIATE(cdouble) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) - 
INSTANTIATE(char) - INSTANTIATE(uchar) - INSTANTIATE(short) - INSTANTIATE(ushort) +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(cfloat) +INSTANTIATE(cdouble) +INSTANTIATE(int) +INSTANTIATE(uint) +INSTANTIATE(intl) +INSTANTIATE(uintl) +INSTANTIATE(char) +INSTANTIATE(uchar) +INSTANTIATE(short) +INSTANTIATE(ushort) } diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp index efb46be7f4..41423c746c 100644 --- a/src/backend/cpu/unwrap.cpp +++ b/src/backend/cpu/unwrap.cpp @@ -83,8 +83,9 @@ template Array unwrap(const Array &in, const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column) { - af::dim4 idims = in.dims(); + in.eval(); + af::dim4 idims = in.dims(); dim_t nx = (idims[0] + 2 * px - wx) / sx + 1; dim_t ny = (idims[1] + 2 * py - wy) / sy + 1; diff --git a/src/backend/cpu/where.cpp b/src/backend/cpu/where.cpp index e6a4817f31..441c7ff239 100644 --- a/src/backend/cpu/where.cpp +++ b/src/backend/cpu/where.cpp @@ -23,61 +23,62 @@ using af::dim4; namespace cpu { - template - Array where(const Array &in) - { - evalArray(in); - getQueue().sync(); - const dim_t *dims = in.dims().get(); - const dim_t *strides = in.strides().get(); - static const T zero = scalar(0); +template +Array where(const Array &in) +{ + evalArray(in); + getQueue().sync(); + + const dim_t *dims = in.dims().get(); + const dim_t *strides = in.strides().get(); + static const T zero = scalar(0); - const T *iptr = in.get(); - uint *out_vec = memAlloc(in.elements()); + const T *iptr = in.get(); + uint *out_vec = memAlloc(in.elements()); - dim_t count = 0; - dim_t idx = 0; - for (dim_t w = 0; w < dims[3]; w++) { - uint offw = w * strides[3]; + dim_t count = 0; + dim_t idx = 0; + for (dim_t w = 0; w < dims[3]; w++) { + uint offw = w * strides[3]; - for (dim_t z = 0; z < dims[2]; z++) { - uint offz = offw + z * strides[2]; + for (dim_t z = 0; z < dims[2]; z++) { + uint offz = offw + z * strides[2]; - for (dim_t y = 0; y < 
dims[1]; y++) { - uint offy = y * strides[1] + offz; + for (dim_t y = 0; y < dims[1]; y++) { + uint offy = y * strides[1] + offz; - for (dim_t x = 0; x < dims[0]; x++) { + for (dim_t x = 0; x < dims[0]; x++) { - T val = iptr[offy + x]; - if (val != zero) { - out_vec[count] = idx; - count++; - } - idx++; + T val = iptr[offy + x]; + if (val != zero) { + out_vec[count] = idx; + count++; } + idx++; } } } - - Array out = createDeviceDataArray(dim4(count), out_vec); - return out; } + Array out = createDeviceDataArray(dim4(count), out_vec); + return out; +} + #define INSTANTIATE(T) \ template Array where(const Array &in); \ - INSTANTIATE(float ) - INSTANTIATE(cfloat ) - INSTANTIATE(double ) - INSTANTIATE(cdouble) - INSTANTIATE(char ) - INSTANTIATE(int ) - INSTANTIATE(uint ) - INSTANTIATE(intl ) - INSTANTIATE(uintl ) - INSTANTIATE(uchar ) - INSTANTIATE(short ) - INSTANTIATE(ushort ) +INSTANTIATE(float ) +INSTANTIATE(cfloat ) +INSTANTIATE(double ) +INSTANTIATE(cdouble) +INSTANTIATE(char ) +INSTANTIATE(int ) +INSTANTIATE(uint ) +INSTANTIATE(intl ) +INSTANTIATE(uintl ) +INSTANTIATE(uchar ) +INSTANTIATE(short ) +INSTANTIATE(ushort ) } From 8cc9c9cd4f4b23a0ddb58e3feee55c3eccd0b6be Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 17 Dec 2015 18:04:12 -0500 Subject: [PATCH 072/288] threads library is now a submodule in cpu backend --- .gitmodules | 3 +++ src/backend/cpu/CMakeLists.txt | 15 +-------------- src/backend/cpu/threads | 1 + 3 files changed, 5 insertions(+), 14 deletions(-) create mode 160000 src/backend/cpu/threads diff --git a/.gitmodules b/.gitmodules index 395881a861..1d89315347 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "test/gtest"] path = test/gtest url = https://chromium.googlesource.com/external/googletest +[submodule "src/backend/cpu/threads"] + path = src/backend/cpu/threads + url = git@github.com:alltheflops/threads.git diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt index 57cf3dfe61..62f0b3a55e 100644 
--- a/src/backend/cpu/CMakeLists.txt +++ b/src/backend/cpu/CMakeLists.txt @@ -47,24 +47,12 @@ IF(NOT UNIX) ADD_DEFINITIONS(-DAFDLL) ENDIF() -INCLUDE(ExternalProject) -ExternalProject_Add( - threads - PREFIX ${CMAKE_BINARY_DIR}/third_party/threads - GIT_REPOSITORY https://github.com/alltheflops/threads.git - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory - /threads ${CMAKE_BINARY_DIR}/third_party/threads - LOG_DOWNLOAD ON - LOG_INSTALL ON - ) INCLUDE_DIRECTORIES( ${CMAKE_INCLUDE_PATH} "${CMAKE_SOURCE_DIR}/src/backend/cpu" + "${CMAKE_SOURCE_DIR}/src/backend/cpu/threads" ${FFTW_INCLUDES} ${CBLAS_INCLUDE_DIR} - ${CMAKE_BINARY_DIR}/third_party/threads/src/threads ) IF(LAPACK_FOUND) @@ -164,7 +152,6 @@ TARGET_LINK_LIBRARIES(afcpu PRIVATE ${CBLAS_LIBRARIES} PRIVATE ${FFTW_LIBRARIES}) -ADD_DEPENDENCIES(afcpu threads) IF(FORGE_FOUND AND NOT USE_SYSTEM_FORGE) ADD_DEPENDENCIES(afcpu forge) ENDIF() diff --git a/src/backend/cpu/threads b/src/backend/cpu/threads new file mode 160000 index 0000000000..5e778ce0a7 --- /dev/null +++ b/src/backend/cpu/threads @@ -0,0 +1 @@ +Subproject commit 5e778ce0a7f0f80af9d32ea3569df3dbec834f59 From abce1e8bf6e3088d41ed87c07048a337a82242cc Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 17 Dec 2015 18:26:15 -0500 Subject: [PATCH 073/288] Moved fns enqueued onto async queue to separate folder --- src/backend/cpu/approx.cpp | 293 +---------------------------- src/backend/cpu/kernel/approx1.hpp | 141 ++++++++++++++ src/backend/cpu/kernel/approx2.hpp | 169 +++++++++++++++++ 3 files changed, 318 insertions(+), 285 deletions(-) create mode 100644 src/backend/cpu/kernel/approx1.hpp create mode 100644 src/backend/cpu/kernel/approx2.hpp diff --git a/src/backend/cpu/approx.cpp b/src/backend/cpu/approx.cpp index 7988863d4d..7e65486a66 100644 --- a/src/backend/cpu/approx.cpp +++ b/src/backend/cpu/approx.cpp @@ -9,142 +9,17 @@ #include #include -#include -#include -#include +#include +#include #include #include 
namespace cpu { -/////////////////////////////////////////////////////////////////////////// -// Approx1 -/////////////////////////////////////////////////////////////////////////// -template -struct approx1_op -{ - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - return; - } -}; - -template -struct approx1_op -{ - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - dim_t pmId = idx; - if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1]; - - const Tp x = pos[pmId]; - bool gFlag = false; - if (x < 0 || idims[0] < x+1) { // No need to check y - gFlag = true; - } - - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] - + idy * ostrides[1] + idx; - if(gFlag) { - out[omId] = scalar(offGrid); - } else { - dim_t ioff = idw * istrides[3] + idz * istrides[2] - + idy * istrides[1]; - const dim_t iMem = round(x) + ioff; - - out[omId] = in[iMem]; - } - } -}; - -template -struct approx1_op -{ - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - dim_t pmId = idx; - if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + 
idy * pstrides[1]; - - const Tp x = pos[pmId]; - bool gFlag = false; - if (x < 0 || idims[0] < x+1) { - gFlag = true; - } - - const dim_t grid_x = floor(x); // nearest grid - const Tp off_x = x - grid_x; // fractional offset - - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] - + idy * ostrides[1] + idx; - if(gFlag) { - out[omId] = scalar(offGrid); - } else { - dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x; - - // Check if x and x + 1 are both valid indices - bool cond = (x < idims[0] - 1); - // Compute Left and Right Weighted Values - Ty yl = ((Tp)1.0 - off_x) * in[ioff]; - Ty yr = cond ? (off_x) * in[ioff + 1] : scalar(0); - Ty yo = yl + yr; - // Compute Weight used - Tp wt = cond ? (Tp)1.0 : (Tp)(1.0 - off_x); - // Write final value - out[omId] = (yo / wt); - } - } -}; - -template -void approx1_(Array output, Array const input, - Array const position, float const offGrid) -{ - Ty * out = output.get(); - Ty const * const in = input.get(); - Tp const * const pos = position.get(); - dim4 const odims = output.dims(); - dim4 const idims = input.dims(); - dim4 const pdims = position.dims(); - dim4 const ostrides = output.strides(); - dim4 const istrides = input.strides(); - dim4 const pstrides = position.strides(); - dim_t const oElems = output.elements(); - dim_t const iElems = input.elements(); - - approx1_op op; - bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1); - - for(dim_t w = 0; w < odims[3]; w++) { - for(dim_t z = 0; z < odims[2]; z++) { - for(dim_t y = 0; y < odims[1]; y++) { - for(dim_t x = 0; x < odims[0]; x++) { - op(out, odims, oElems, in, idims, iElems, pos, pdims, - ostrides, istrides, pstrides, offGrid, pBatch, x, y, z, w); - } - } - } - } -} - template Array approx1(const Array &in, const Array &pos, - const af_interp_type method, const float offGrid) + const af_interp_type method, const float offGrid) { in.eval(); pos.eval(); @@ -152,16 +27,15 @@ Array approx1(const Array &in, const Array 
&pos, af::dim4 odims = in.dims(); odims[0] = pos.dims()[0]; - // Create output placeholder Array out = createEmptyArray(odims); switch(method) { case AF_INTERP_NEAREST: - getQueue().enqueue(approx1_, + getQueue().enqueue(kernel::approx1, out, in, pos, offGrid); break; case AF_INTERP_LINEAR: - getQueue().enqueue(approx1_, + getQueue().enqueue(kernel::approx1, out, in, pos, offGrid); break; default: @@ -170,161 +44,10 @@ Array approx1(const Array &in, const Array &pos, return out; } -/////////////////////////////////////////////////////////////////////////// -// Approx2 -/////////////////////////////////////////////////////////////////////////// -template -struct approx2_op -{ - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 &qstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - return; - } -}; - -template -struct approx2_op -{ - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 &qstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - dim_t pmId = idy * pstrides[1] + idx; - dim_t qmId = idy * qstrides[1] + idx; - if(pBatch) { - pmId += idw * pstrides[3] + idz * pstrides[2]; - qmId += idw * qstrides[3] + idz * qstrides[2]; - } - - bool gFlag = false; - const Tp x = pos[pmId], y = qos[qmId]; - if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { - gFlag = true; - } - - const dim_t omId = idw * ostrides[3] + idz * 
ostrides[2] - + idy * ostrides[1] + idx; - if(gFlag) { - out[omId] = scalar(offGrid); - } else { - const dim_t grid_x = round(x), grid_y = round(y); // nearest grid - const dim_t imId = idw * istrides[3] + idz * istrides[2] + - grid_y * istrides[1] + grid_x; - out[omId] = in[imId]; - } - } -}; - -template -struct approx2_op -{ - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 &qstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) - { - dim_t pmId = idy * pstrides[1] + idx; - dim_t qmId = idy * qstrides[1] + idx; - if(pBatch) { - pmId += idw * pstrides[3] + idz * pstrides[2]; - qmId += idw * qstrides[3] + idz * qstrides[2]; - } - - bool gFlag = false; - const Tp x = pos[pmId], y = qos[qmId]; - if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { - gFlag = true; - } - - const dim_t grid_x = floor(x), grid_y = floor(y); // nearest grid - const Tp off_x = x - grid_x, off_y = y - grid_y; // fractional offset - - // Check if pVal and pVal + 1 are both valid indices - bool condY = (y < idims[1] - 1); - bool condX = (x < idims[0] - 1); - - // Compute wieghts used - Tp wt00 = ((Tp)1.0 - off_x) * ((Tp)1.0 - off_y); - Tp wt10 = (condY) ? ((Tp)1.0 - off_x) * (off_y) : 0; - Tp wt01 = (condX) ? (off_x) * ((Tp)1.0 - off_y) : 0; - Tp wt11 = (condX && condY) ? (off_x) * (off_y) : 0; - - Tp wt = wt00 + wt10 + wt01 + wt11; - Ty zero = scalar(0); - - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] - + idy * ostrides[1] + idx; - if(gFlag) { - out[omId] = scalar(offGrid); - } else { - dim_t ioff = idw * istrides[3] + idz * istrides[2] - + grid_y * istrides[1] + grid_x; - - // Compute Weighted Values - Ty y00 = wt00 * in[ioff]; - Ty y10 = (condY) ? 
wt10 * in[ioff + istrides[1]] : zero; - Ty y01 = (condX) ? wt01 * in[ioff + 1] : zero; - Ty y11 = (condX && condY) ? wt11 * in[ioff + istrides[1] + 1] : zero; - - Ty yo = y00 + y10 + y01 + y11; - - // Write Final Value - out[omId] = (yo / wt); - } - } -}; - -template -void approx2_(Array output, Array const input, - Array const position, Array const qosition, - float const offGrid) -{ - Ty * out = output.get(); - Ty const * const in = input.get(); - Tp const * const pos = position.get(); - Tp const * const qos = qosition.get(); - dim4 const odims = output.dims(); - dim4 const idims = input.dims(); - dim4 const pdims = position.dims(); - dim4 const qdims = qosition.dims(); - dim4 const ostrides = output.strides(); - dim4 const istrides = input.strides(); - dim4 const pstrides = position.strides(); - dim4 const qstrides = qosition.strides(); - dim_t const oElems = output.elements(); - dim_t const iElems = input.elements(); - - approx2_op op; - bool pBatch = !(pdims[2] == 1 && pdims[3] == 1); - - for(dim_t w = 0; w < odims[3]; w++) { - for(dim_t z = 0; z < odims[2]; z++) { - for(dim_t y = 0; y < odims[1]; y++) { - for(dim_t x = 0; x < odims[0]; x++) { - op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims, - ostrides, istrides, pstrides, qstrides, offGrid, pBatch, x, y, z, w); - } - } - } - } -} template Array approx2(const Array &in, const Array &pos0, const Array &pos1, - const af_interp_type method, const float offGrid) + const af_interp_type method, const float offGrid) { in.eval(); pos0.eval(); @@ -338,11 +61,11 @@ Array approx2(const Array &in, const Array &pos0, const Array &p switch(method) { case AF_INTERP_NEAREST: - getQueue().enqueue(approx2_, + getQueue().enqueue(kernel::approx2, out, in, pos0, pos1, offGrid); break; case AF_INTERP_LINEAR: - getQueue().enqueue(approx2_, + getQueue().enqueue(kernel::approx2, out, in, pos0, pos1, offGrid); break; default: diff --git a/src/backend/cpu/kernel/approx1.hpp b/src/backend/cpu/kernel/approx1.hpp new 
file mode 100644 index 0000000000..9dc681c8fa --- /dev/null +++ b/src/backend/cpu/kernel/approx1.hpp @@ -0,0 +1,141 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include + +namespace kernel +{ + +using af::dim4; +using cpu::scalar; +using cpu::Array; + +template +struct approx1_op +{ + void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, + const Ty *in, const af::dim4 &idims, const dim_t iElems, + const Tp *pos, const af::dim4 &pdims, + const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, + const float offGrid, const bool pBatch, + const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) + { + return; + } +}; + +template +struct approx1_op +{ + void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, + const Ty *in, const af::dim4 &idims, const dim_t iElems, + const Tp *pos, const af::dim4 &pdims, + const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, + const float offGrid, const bool pBatch, + const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) + { + dim_t pmId = idx; + if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1]; + + const Tp x = pos[pmId]; + bool gFlag = false; + if (x < 0 || idims[0] < x+1) { // No need to check y + gFlag = true; + } + + const dim_t omId = idw * ostrides[3] + idz * ostrides[2] + + idy * ostrides[1] + idx; + if(gFlag) { + out[omId] = scalar(offGrid); + } else { + dim_t ioff = idw * istrides[3] + idz * istrides[2] + + idy * istrides[1]; + const dim_t iMem = round(x) + ioff; + + out[omId] = in[iMem]; + } + } +}; + +template +struct approx1_op +{ + void operator()(Ty *out, const af::dim4 &odims, const dim_t 
oElems, + const Ty *in, const af::dim4 &idims, const dim_t iElems, + const Tp *pos, const af::dim4 &pdims, + const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, + const float offGrid, const bool pBatch, + const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) + { + dim_t pmId = idx; + if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1]; + + const Tp x = pos[pmId]; + bool gFlag = false; + if (x < 0 || idims[0] < x+1) { + gFlag = true; + } + + const dim_t grid_x = floor(x); // nearest grid + const Tp off_x = x - grid_x; // fractional offset + + const dim_t omId = idw * ostrides[3] + idz * ostrides[2] + + idy * ostrides[1] + idx; + if(gFlag) { + out[omId] = scalar(offGrid); + } else { + dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x; + + // Check if x and x + 1 are both valid indices + bool cond = (x < idims[0] - 1); + // Compute Left and Right Weighted Values + Ty yl = ((Tp)1.0 - off_x) * in[ioff]; + Ty yr = cond ? (off_x) * in[ioff + 1] : scalar(0); + Ty yo = yl + yr; + // Compute Weight used + Tp wt = cond ? 
(Tp)1.0 : (Tp)(1.0 - off_x); + // Write final value + out[omId] = (yo / wt); + } + } +}; + +template +void approx1(Array output, Array const input, + Array const position, float const offGrid) +{ + Ty * out = output.get(); + Ty const * const in = input.get(); + Tp const * const pos = position.get(); + dim4 const odims = output.dims(); + dim4 const idims = input.dims(); + dim4 const pdims = position.dims(); + dim4 const ostrides = output.strides(); + dim4 const istrides = input.strides(); + dim4 const pstrides = position.strides(); + dim_t const oElems = output.elements(); + dim_t const iElems = input.elements(); + + approx1_op op; + bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1); + + for(dim_t w = 0; w < odims[3]; w++) { + for(dim_t z = 0; z < odims[2]; z++) { + for(dim_t y = 0; y < odims[1]; y++) { + for(dim_t x = 0; x < odims[0]; x++) { + op(out, odims, oElems, in, idims, iElems, pos, pdims, + ostrides, istrides, pstrides, offGrid, pBatch, x, y, z, w); + } + } + } + } +} + +} diff --git a/src/backend/cpu/kernel/approx2.hpp b/src/backend/cpu/kernel/approx2.hpp new file mode 100644 index 0000000000..8f57b5cd64 --- /dev/null +++ b/src/backend/cpu/kernel/approx2.hpp @@ -0,0 +1,169 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include + +namespace kernel +{ + +using af::dim4; +using cpu::scalar; +using cpu::Array; + +template +struct approx2_op +{ + void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, + const Ty *in, const af::dim4 &idims, const dim_t iElems, + const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const af::dim4 &pstrides, const af::dim4 &qstrides, + const float offGrid, const bool pBatch, + const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) + { + return; + } +}; + +template +struct approx2_op +{ + void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, + const Ty *in, const af::dim4 &idims, const dim_t iElems, + const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const af::dim4 &pstrides, const af::dim4 &qstrides, + const float offGrid, const bool pBatch, + const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) + { + dim_t pmId = idy * pstrides[1] + idx; + dim_t qmId = idy * qstrides[1] + idx; + if(pBatch) { + pmId += idw * pstrides[3] + idz * pstrides[2]; + qmId += idw * qstrides[3] + idz * qstrides[2]; + } + + bool gFlag = false; + const Tp x = pos[pmId], y = qos[qmId]; + if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { + gFlag = true; + } + + const dim_t omId = idw * ostrides[3] + idz * ostrides[2] + + idy * ostrides[1] + idx; + if(gFlag) { + out[omId] = scalar(offGrid); + } else { + const dim_t grid_x = round(x), grid_y = round(y); // nearest grid + const dim_t imId = idw * istrides[3] + idz * istrides[2] + + grid_y * istrides[1] + grid_x; + out[omId] = in[imId]; + } + } +}; + +template +struct approx2_op +{ + void operator()(Ty *out, const af::dim4 &odims, const dim_t 
oElems, + const Ty *in, const af::dim4 &idims, const dim_t iElems, + const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const af::dim4 &pstrides, const af::dim4 &qstrides, + const float offGrid, const bool pBatch, + const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) + { + dim_t pmId = idy * pstrides[1] + idx; + dim_t qmId = idy * qstrides[1] + idx; + if(pBatch) { + pmId += idw * pstrides[3] + idz * pstrides[2]; + qmId += idw * qstrides[3] + idz * qstrides[2]; + } + + bool gFlag = false; + const Tp x = pos[pmId], y = qos[qmId]; + if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { + gFlag = true; + } + + const dim_t grid_x = floor(x), grid_y = floor(y); // nearest grid + const Tp off_x = x - grid_x, off_y = y - grid_y; // fractional offset + + // Check if pVal and pVal + 1 are both valid indices + bool condY = (y < idims[1] - 1); + bool condX = (x < idims[0] - 1); + + // Compute wieghts used + Tp wt00 = ((Tp)1.0 - off_x) * ((Tp)1.0 - off_y); + Tp wt10 = (condY) ? ((Tp)1.0 - off_x) * (off_y) : 0; + Tp wt01 = (condX) ? (off_x) * ((Tp)1.0 - off_y) : 0; + Tp wt11 = (condX && condY) ? (off_x) * (off_y) : 0; + + Tp wt = wt00 + wt10 + wt01 + wt11; + Ty zero = scalar(0); + + const dim_t omId = idw * ostrides[3] + idz * ostrides[2] + + idy * ostrides[1] + idx; + if(gFlag) { + out[omId] = scalar(offGrid); + } else { + dim_t ioff = idw * istrides[3] + idz * istrides[2] + + grid_y * istrides[1] + grid_x; + + // Compute Weighted Values + Ty y00 = wt00 * in[ioff]; + Ty y10 = (condY) ? wt10 * in[ioff + istrides[1]] : zero; + Ty y01 = (condX) ? wt01 * in[ioff + 1] : zero; + Ty y11 = (condX && condY) ? 
wt11 * in[ioff + istrides[1] + 1] : zero; + + Ty yo = y00 + y10 + y01 + y11; + + // Write Final Value + out[omId] = (yo / wt); + } + } +}; + +template +void approx2(Array output, Array const input, + Array const position, Array const qosition, + float const offGrid) +{ + Ty * out = output.get(); + Ty const * const in = input.get(); + Tp const * const pos = position.get(); + Tp const * const qos = qosition.get(); + dim4 const odims = output.dims(); + dim4 const idims = input.dims(); + dim4 const pdims = position.dims(); + dim4 const qdims = qosition.dims(); + dim4 const ostrides = output.strides(); + dim4 const istrides = input.strides(); + dim4 const pstrides = position.strides(); + dim4 const qstrides = qosition.strides(); + dim_t const oElems = output.elements(); + dim_t const iElems = input.elements(); + + approx2_op op; + bool pBatch = !(pdims[2] == 1 && pdims[3] == 1); + + for(dim_t w = 0; w < odims[3]; w++) { + for(dim_t z = 0; z < odims[2]; z++) { + for(dim_t y = 0; y < odims[1]; y++) { + for(dim_t x = 0; x < odims[0]; x++) { + op(out, odims, oElems, in, idims, iElems, pos, pdims, qos, qdims, + ostrides, istrides, pstrides, qstrides, offGrid, pBatch, x, y, z, w); + } + } + } + } +} + +} From 5f2f155f01a714017e190ed9e3564c036889360d Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 17 Dec 2015 19:07:37 -0500 Subject: [PATCH 074/288] Removed obsolete fn of evalArray from all backends Array::eval is already available, thus making this function redundant --- src/api/c/data.cpp | 2 +- src/api/c/moddims.cpp | 2 +- src/backend/cpu/Array.cpp | 10 ---------- src/backend/cpu/Array.hpp | 4 ---- src/backend/cpu/where.cpp | 2 +- src/backend/cuda/Array.cpp | 7 ------- src/backend/cuda/Array.hpp | 4 ---- src/backend/cuda/copy.cu | 2 +- src/backend/opencl/Array.cpp | 7 ------- src/backend/opencl/Array.hpp | 4 ---- 10 files changed, 4 insertions(+), 40 deletions(-) diff --git a/src/api/c/data.cpp b/src/api/c/data.cpp index 4d77fb279e..2de2f139e3 100644 ---
a/src/api/c/data.cpp +++ b/src/api/c/data.cpp @@ -594,7 +594,7 @@ af_err af_get_numdims(unsigned *nd, const af_array in) template static inline void eval(af_array arr) { - evalArray(getArray(arr)); + getArray(arr).eval(); return; } diff --git a/src/api/c/moddims.cpp b/src/api/c/moddims.cpp index bb156ffc2c..4b7a179a95 100644 --- a/src/api/c/moddims.cpp +++ b/src/api/c/moddims.cpp @@ -23,7 +23,7 @@ template Array modDims(const Array& in, const af::dim4 &newDims) { //FIXME: Figure out a better way - evalArray(in); + in.eval(); Array Out = in; diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 9c15bc46c6..64fca1aa6b 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -181,7 +181,6 @@ createEmptyArray(const dim4 &size) template Array *initArray() { return new Array(dim4(0, 0, 0, 0)); } - template Array createNodeArray(const dim4 &dims, Node_ptr node) @@ -203,7 +202,6 @@ createNodeArray(const dim4 &dims, Node_ptr node) return out; } - template Array createSubArray(const Array& parent, const std::vector &index, @@ -240,13 +238,6 @@ destroyArray(Array *A) delete A; } - -template -void evalArray(const Array &A) -{ - A.eval(); -} - template void writeHostDataArray(Array &arr, const T * const data, const size_t bytes) @@ -277,7 +268,6 @@ writeDeviceDataArray(Array &arr, const void * const data, const size_t bytes) const std::vector &index, \ bool copy); \ template void destroyArray (Array *A); \ - template void evalArray (const Array &A); \ template Array createNodeArray (const dim4 &size, TNJ::Node_ptr node); \ template void Array::eval(); \ template void Array::eval() const; \ diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp index 2b9cbb4fed..ece989e2d8 100644 --- a/src/backend/cpu/Array.hpp +++ b/src/backend/cpu/Array.hpp @@ -65,9 +65,6 @@ namespace cpu const std::vector &index, bool copy=true); - template - void evalArray(const Array &A); - // Creates a new Array object on the heap and returns a reference to 
it. template void destroyArray(Array *A); @@ -208,7 +205,6 @@ namespace cpu bool copy); friend void destroyArray(Array *arr); - friend void evalArray(const Array &arr); friend void *getDevicePtr(const Array& arr); }; diff --git a/src/backend/cpu/where.cpp b/src/backend/cpu/where.cpp index 441c7ff239..018cbdfc36 100644 --- a/src/backend/cpu/where.cpp +++ b/src/backend/cpu/where.cpp @@ -27,7 +27,7 @@ namespace cpu template Array where(const Array &in) { - evalArray(in); + in.eval(); getQueue().sync(); const dim_t *dims = in.dims().get(); diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp index 275ea13a99..39cd06c43b 100644 --- a/src/backend/cuda/Array.cpp +++ b/src/backend/cuda/Array.cpp @@ -228,12 +228,6 @@ namespace cuda delete A; } - template - void evalArray(const Array &A) - { - A.eval(); - } - template void writeHostDataArray(Array &arr, const T * const data, const size_t bytes) @@ -279,7 +273,6 @@ namespace cuda const std::vector &index, \ bool copy); \ template void destroyArray (Array *A); \ - template void evalArray (const Array &A); \ template Array createNodeArray (const dim4 &size, JIT::Node_ptr node); \ template Array::Array(af::dim4 dims, const T * const in_data, \ bool is_device, bool copy_device); \ diff --git a/src/backend/cuda/Array.hpp b/src/backend/cuda/Array.hpp index 598fdfd35e..638b745d09 100644 --- a/src/backend/cuda/Array.hpp +++ b/src/backend/cuda/Array.hpp @@ -78,9 +78,6 @@ namespace cuda const std::vector &index, bool copy=true); - template - void evalArray(const Array &A); - // Creates a new Array object on the heap and returns a reference to it. 
template void destroyArray(Array *A); @@ -234,7 +231,6 @@ namespace cuda bool copy); friend void destroyArray(Array *arr); - friend void evalArray(const Array &arr); friend void *getDevicePtr(const Array& arr); }; diff --git a/src/backend/cuda/copy.cu b/src/backend/cuda/copy.cu index 90f9970239..71893b8c16 100644 --- a/src/backend/cuda/copy.cu +++ b/src/backend/cuda/copy.cu @@ -23,7 +23,7 @@ namespace cuda void copyData(T *data, const Array &A) { // FIXME: Merge this with copyArray - evalArray(A); + A.eval(); Array out = A; const T *ptr = NULL; diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp index 0860098c9f..207a4b0de7 100644 --- a/src/backend/opencl/Array.cpp +++ b/src/backend/opencl/Array.cpp @@ -258,12 +258,6 @@ namespace opencl delete A; } - template - void evalArray(const Array &A) - { - A.eval(); - } - template void writeHostDataArray(Array &arr, const T * const data, const size_t bytes) @@ -312,7 +306,6 @@ namespace opencl const std::vector &index, \ bool copy); \ template void destroyArray (Array *A); \ - template void evalArray (const Array &A); \ template Array createNodeArray (const dim4 &size, JIT::Node_ptr node); \ template Array::Array(af::dim4 dims, cl_mem mem, size_t src_offset, bool copy); \ template Array::~Array (); \ diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp index 1db0ab6347..5f86d6d0b6 100644 --- a/src/backend/opencl/Array.hpp +++ b/src/backend/opencl/Array.hpp @@ -69,9 +69,6 @@ namespace opencl const std::vector &index, bool copy=true); - template - void evalArray(const Array &A); - // Creates a new Array object on the heap and returns a reference to it. 
template void destroyArray(Array *A); @@ -226,7 +223,6 @@ namespace opencl bool copy); friend void destroyArray(Array *arr); - friend void evalArray(const Array &arr); friend void *getDevicePtr(const Array& arr); }; From e651cad87b6a703119587c27626dbd6fc2c404b5 Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 17 Dec 2015 19:32:22 -0500 Subject: [PATCH 075/288] cpu::Array::eval queue work moved to kernel namespace --- src/backend/cpu/Array.cpp | 38 ++-------------------- src/backend/cpu/Array.hpp | 14 ++++++++ src/backend/cpu/kernel/Array.hpp | 56 ++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 36 deletions(-) create mode 100644 src/backend/cpu/kernel/Array.hpp diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 64fca1aa6b..40d25aca6f 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -77,42 +78,7 @@ void Array::eval() data = std::shared_ptr(memAlloc(elements()), memFree); - auto func = [] (Array in) { - in.setId(getActiveDeviceId()); - T *ptr = in.data.get(); - - dim4 odims = in.dims(); - dim4 ostrs = in.strides(); - - bool is_linear = in.node->isLinear(odims.get()); - - if (is_linear) { - int num = in.elements(); - for (int i = 0; i < num; i++) { - ptr[i] = *(T *)in.node->calc(i); - } - } else { - for (int w = 0; w < (int)odims[3]; w++) { - dim_t offw = w * ostrs[3]; - - for (int z = 0; z < (int)odims[2]; z++) { - dim_t offz = z * ostrs[2] + offw; - - for (int y = 0; y < (int)odims[1]; y++) { - dim_t offy = y * ostrs[1] + offz; - - for (int x = 0; x < (int)odims[0]; x++) { - dim_t id = x + offy; - - ptr[id] = *(T *)in.node->calc(x, y, z, w); - } - } - } - } - } - }; - - getQueue().enqueue(func, *this); + getQueue().enqueue(kernel::evalArray, *this); ready = true; Node_ptr prev = node; diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp index ece989e2d8..437c47f786 100644 --- a/src/backend/cpu/Array.hpp
+++ b/src/backend/cpu/Array.hpp @@ -23,6 +23,18 @@ #include #include +// cpu::Array class forward declaration +namespace cpu +{ +template class Array; +} + +// kernel::evalArray fn forward declaration +namespace kernel +{ +template void evalArray(cpu::Array in); +} + namespace cpu { @@ -204,6 +216,8 @@ namespace cpu const std::vector &index, bool copy); + friend void kernel::evalArray(Array in); + friend void destroyArray(Array *arr); friend void *getDevicePtr(const Array& arr); }; diff --git a/src/backend/cpu/kernel/Array.hpp b/src/backend/cpu/kernel/Array.hpp new file mode 100644 index 0000000000..0666d43602 --- /dev/null +++ b/src/backend/cpu/kernel/Array.hpp @@ -0,0 +1,56 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include + +namespace kernel +{ + +using af::dim4; +using cpu::Array; + +template +void evalArray(Array in) +{ + in.setId(cpu::getActiveDeviceId()); + T *ptr = in.data.get(); + + dim4 odims = in.dims(); + dim4 ostrs = in.strides(); + + bool is_linear = in.node->isLinear(odims.get()); + + if (is_linear) { + int num = in.elements(); + for (int i = 0; i < num; i++) { + ptr[i] = *(T *)in.node->calc(i); + } + } else { + for (int w = 0; w < (int)odims[3]; w++) { + dim_t offw = w * ostrs[3]; + + for (int z = 0; z < (int)odims[2]; z++) { + dim_t offz = z * ostrs[2] + offw; + + for (int y = 0; y < (int)odims[1]; y++) { + dim_t offy = y * ostrs[1] + offz; + + for (int x = 0; x < (int)odims[0]; x++) { + dim_t id = x + offy; + + ptr[id] = *(T *)in.node->calc(x, y, z, w); + } + } + } + } + } +} + +} From 3cddae24f55870d565361de75c1fee55ae2ce19a Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 18 Dec 2015 15:45:36 -0500 Subject: [PATCH 076/288] moved assign 
cpu async fn to kernel space --- src/backend/cpu/assign.cpp | 83 ++-------------------------- src/backend/cpu/kernel/Array.hpp | 2 +- src/backend/cpu/kernel/assign.hpp | 91 +++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 79 deletions(-) create mode 100644 src/backend/cpu/kernel/assign.hpp diff --git a/src/backend/cpu/assign.cpp b/src/backend/cpu/assign.cpp index c5d733bb17..95bb7e5dd4 100644 --- a/src/backend/cpu/assign.cpp +++ b/src/backend/cpu/assign.cpp @@ -12,31 +12,16 @@ #include #include #include +#include #include -#include #include #include -using af::dim4; -using std::ref; -using std::copy; -using std::vector; - namespace cpu { -static inline -dim_t trimIndex(int idx, const dim_t &len) -{ - int ret_val = idx; - int offset = abs(ret_val)%len; - if (ret_val<0) { - ret_val = offset-1; - } else if (ret_val>=(int)len) { - ret_val = len-offset-1; - } - return ret_val; -} +using af::dim4; +using std::vector; template void assign(Array& out, const af_index_t idxrs[], const Array& rhs) @@ -63,66 +48,8 @@ void assign(Array& out, const af_index_t idxrs[], const Array& rhs) } } - auto func = [=] (Array out, const Array rhs, - const vector isSeq, - const vector seqs, - const vector< Array > idxArrs) { - - dim4 dDims = out.getDataDims(); - dim4 pDims = out.dims(); - // retrieve dimensions & strides for array to which rhs is being copied to - dim4 dst_offsets = toOffset(seqs, dDims); - dim4 dst_strides = toStride(seqs, dDims); - // retrieve rhs array dimenesions & strides - dim4 src_dims = rhs.dims(); - dim4 src_strides = rhs.strides(); - // declare pointers to af_array index data - const uint* ptr0 = idxArrs[0].get(); - const uint* ptr1 = idxArrs[1].get(); - const uint* ptr2 = idxArrs[2].get(); - const uint* ptr3 = idxArrs[3].get(); - - const T * src= rhs.get(); - T * dst = out.get(); - - for(dim_t l=0; l, out, rhs, std::move(isSeq), + std::move(seqs), std::move(idxArrs)); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/kernel/Array.hpp 
b/src/backend/cpu/kernel/Array.hpp index 0666d43602..b3a02004d1 100644 --- a/src/backend/cpu/kernel/Array.hpp +++ b/src/backend/cpu/kernel/Array.hpp @@ -1,5 +1,5 @@ /******************************************************* - * Copyright (c) 2014, ArrayFire + * Copyright (c) 2015, ArrayFire * All rights reserved. * * This file is distributed under 3-clause BSD license. diff --git a/src/backend/cpu/kernel/assign.hpp b/src/backend/cpu/kernel/assign.hpp new file mode 100644 index 0000000000..16a623f704 --- /dev/null +++ b/src/backend/cpu/kernel/assign.hpp @@ -0,0 +1,91 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include + +namespace kernel +{ + +using af::dim4; +using cpu::Array; +using std::vector; + +inline +dim_t trimIndex(int idx, const dim_t &len) +{ + int ret_val = idx; + int offset = abs(ret_val)%len; + if (ret_val<0) { + ret_val = offset-1; + } else if (ret_val>=(int)len) { + ret_val = len-offset-1; + } + return ret_val; +} + +template +void assign(Array out, const Array rhs, const vector isSeq, + const vector seqs, const vector< Array > idxArrs) +{ + dim4 dDims = out.getDataDims(); + dim4 pDims = out.dims(); + // retrieve dimensions & strides for array to which rhs is being copied to + dim4 dst_offsets = toOffset(seqs, dDims); + dim4 dst_strides = toStride(seqs, dDims); + // retrieve rhs array dimenesions & strides + dim4 src_dims = rhs.dims(); + dim4 src_strides = rhs.strides(); + // declare pointers to af_array index data + const uint* ptr0 = idxArrs[0].get(); + const uint* ptr1 = idxArrs[1].get(); + const uint* ptr2 = idxArrs[2].get(); + const uint* ptr3 = idxArrs[3].get(); + + const T * src= rhs.get(); + T * dst = out.get(); + + for(dim_t l=0; l 
Date: Fri, 18 Dec 2015 15:54:35 -0500 Subject: [PATCH 077/288] moved kernel namespace in cpu backend inside cpu namespace --- src/backend/cpu/Array.hpp | 3 +-- src/backend/cpu/kernel/Array.hpp | 4 +++- src/backend/cpu/kernel/approx1.hpp | 6 +++--- src/backend/cpu/kernel/approx2.hpp | 5 +++-- src/backend/cpu/kernel/assign.hpp | 4 +++- 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp index 437c47f786..adb72dc6c5 100644 --- a/src/backend/cpu/Array.hpp +++ b/src/backend/cpu/Array.hpp @@ -27,13 +27,12 @@ namespace cpu { template class Array; -} - // kernel::evalArray fn forward declaration namespace kernel { template void evalArray(cpu::Array in); } +} namespace cpu { diff --git a/src/backend/cpu/kernel/Array.hpp b/src/backend/cpu/kernel/Array.hpp index b3a02004d1..e492b92ff0 100644 --- a/src/backend/cpu/kernel/Array.hpp +++ b/src/backend/cpu/kernel/Array.hpp @@ -10,11 +10,12 @@ #include #include +namespace cpu +{ namespace kernel { using af::dim4; -using cpu::Array; template void evalArray(Array in) @@ -54,3 +55,4 @@ void evalArray(Array in) } } +} diff --git a/src/backend/cpu/kernel/approx1.hpp b/src/backend/cpu/kernel/approx1.hpp index 9dc681c8fa..63bae2d237 100644 --- a/src/backend/cpu/kernel/approx1.hpp +++ b/src/backend/cpu/kernel/approx1.hpp @@ -9,13 +9,12 @@ #include #include - +namespace cpu +{ namespace kernel { using af::dim4; -using cpu::scalar; -using cpu::Array; template struct approx1_op @@ -139,3 +138,4 @@ void approx1(Array output, Array const input, } } +} diff --git a/src/backend/cpu/kernel/approx2.hpp b/src/backend/cpu/kernel/approx2.hpp index 8f57b5cd64..f80dae17bb 100644 --- a/src/backend/cpu/kernel/approx2.hpp +++ b/src/backend/cpu/kernel/approx2.hpp @@ -10,12 +10,12 @@ #include #include +namespace cpu +{ namespace kernel { using af::dim4; -using cpu::scalar; -using cpu::Array; template struct approx2_op @@ -167,3 +167,4 @@ void approx2(Array output, Array const input, } } +} diff 
--git a/src/backend/cpu/kernel/assign.hpp b/src/backend/cpu/kernel/assign.hpp index 16a623f704..2621ba741f 100644 --- a/src/backend/cpu/kernel/assign.hpp +++ b/src/backend/cpu/kernel/assign.hpp @@ -10,11 +10,12 @@ #include #include +namespace cpu +{ namespace kernel { using af::dim4; -using cpu::Array; using std::vector; inline @@ -89,3 +90,4 @@ void assign(Array out, const Array rhs, const vector isSeq, } } +} From d03bb75f24481953484443cbed46df62264a5008 Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 18 Dec 2015 17:09:11 -0500 Subject: [PATCH 078/288] moved bilateral, convolve, fftconvolve to cpu kernel namespace --- src/backend/cpu/bilateral.cpp | 77 +------ src/backend/cpu/convolve.cpp | 247 +---------------------- src/backend/cpu/fftconvolve.cpp | 236 ++-------------------- src/backend/cpu/kernel/approx1.hpp | 1 + src/backend/cpu/kernel/bilateral.hpp | 90 +++++++++ src/backend/cpu/kernel/convolve.hpp | 267 +++++++++++++++++++++++++ src/backend/cpu/kernel/fftconvolve.hpp | 227 +++++++++++++++++++++ 7 files changed, 611 insertions(+), 534 deletions(-) create mode 100644 src/backend/cpu/kernel/bilateral.hpp create mode 100644 src/backend/cpu/kernel/convolve.hpp create mode 100644 src/backend/cpu/kernel/fftconvolve.hpp diff --git a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/bilateral.cpp index ea38ea7dd7..c751f992d9 100644 --- a/src/backend/cpu/bilateral.cpp +++ b/src/backend/cpu/bilateral.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -22,87 +23,13 @@ using af::dim4; namespace cpu { -static inline dim_t clamp(int a, dim_t mn, dim_t mx) -{ - return (a < (int)mn ? mn : (a > (int)mx ? 
mx : a)); -} - -static inline unsigned getIdx(const dim4 &strides, - int i, int j = 0, int k = 0, int l = 0) -{ - return (l * strides[3] + - k * strides[2] + - j * strides[1] + - i * strides[0]); -} - -template -void bilateral_(Array out, const Array in, float s_sigma, float c_sigma) -{ - const dim4 dims = in.dims(); - const dim4 istrides = in.strides(); - - const dim4 ostrides = out.strides(); - - outType *outData = out.get(); - const inType *inData = in.get(); - - // clamp spatical and chromatic sigma's - float space_ = std::min(11.5f, std::max(s_sigma, 0.f)); - float color_ = std::max(c_sigma, 0.f); - const dim_t radius = std::max((dim_t)(space_ * 1.5f), (dim_t)1); - const float svar = space_*space_; - const float cvar = color_*color_; - - for(dim_t b3=0; b3 Array bilateral(const Array &in, const float &s_sigma, const float &c_sigma) { in.eval(); const dim4 dims = in.dims(); Array out = createEmptyArray(dims); - getQueue().enqueue(bilateral_, out, in, s_sigma, c_sigma); + getQueue().enqueue(kernel::bilateral, out, in, s_sigma, c_sigma); return out; } diff --git a/src/backend/cpu/convolve.cpp b/src/backend/cpu/convolve.cpp index 239b4f0924..218ba8e3c0 100644 --- a/src/backend/cpu/convolve.cpp +++ b/src/backend/cpu/convolve.cpp @@ -16,170 +16,13 @@ #include #include #include +#include using af::dim4; namespace cpu { -template -void one2one_1d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims, - dim4 const &sDims, dim4 const &fDims, dim4 const &sStrides) -{ - dim_t start = (expand ? 0 : fDims[0]/2); - dim_t end = (expand ? oDims[0] : start + sDims[0]); - for(dim_t i=start; i=0 &&iIdx -void one2one_2d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims, - dim4 const &sDims, dim4 const &fDims, dim4 const &oStrides, - dim4 const &sStrides, dim4 const &fStrides) -{ - dim_t jStart = (expand ? 0 : fDims[1]/2); - dim_t jEnd = (expand ? oDims[1] : jStart + sDims[1]); - dim_t iStart = (expand ? 0 : fDims[0]/2); - dim_t iEnd = (expand ? 
oDims[0] : iStart + sDims[0]); - - for(dim_t j=jStart; j=0 && jIdx=0 && iIdx -void one2one_3d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims, - dim4 const &sDims, dim4 const &fDims, dim4 const &oStrides, - dim4 const &sStrides, dim4 const &fStrides) -{ - dim_t kStart = (expand ? 0 : fDims[2]/2); - dim_t kEnd = (expand ? oDims[2] : kStart + sDims[2]); - dim_t jStart = (expand ? 0 : fDims[1]/2); - dim_t jEnd = (expand ? oDims[1] : jStart + sDims[1]); - dim_t iStart = (expand ? 0 : fDims[0]/2); - dim_t iEnd = (expand ? oDims[0] : iStart + sDims[0]); - - for(dim_t k=kStart; k=0 && kIdx=0 && jIdx=0 && iIdx -void convolve_nd(T *optr, T const *iptr, accT const *fptr, - dim4 const &oDims, dim4 const &sDims, dim4 const &fDims, - dim4 const &oStrides, dim4 const &sStrides, dim4 const &fStrides, - ConvolveBatchKind kind) -{ - dim_t out_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ - dim_t in_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ - dim_t filt_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ - dim_t batch[4] = {0, 1, 1, 1}; /* first value is never used, and declared for code simplicity */ - - for (dim_t i=1; i<4; ++i) { - switch(kind) { - case CONVOLVE_BATCH_SIGNAL: - out_step[i] = oStrides[i]; - in_step[i] = sStrides[i]; - if (i>=baseDim) batch[i] = sDims[i]; - break; - case CONVOLVE_BATCH_SAME: - out_step[i] = oStrides[i]; - in_step[i] = sStrides[i]; - filt_step[i] = fStrides[i]; - if (i>=baseDim) batch[i] = sDims[i]; - break; - case CONVOLVE_BATCH_KERNEL: - out_step[i] = oStrides[i]; - filt_step[i] = fStrides[i]; - if (i>=baseDim) batch[i] = fDims[i]; - break; - default: - break; - } - } - - for (dim_t b3=0; b3(out, in, filt, oDims, sDims, fDims, sStrides); break; - case 2: one2one_2d(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break; - case 3: one2one_3d(out, in, filt, oDims, sDims, fDims, oStrides, 
sStrides, fStrides); break; - } - } - } - } -} - template Array convolve(Array const& signal, Array const& filter, ConvolveBatchKind kind) { @@ -188,7 +31,6 @@ Array convolve(Array const& signal, Array const& filter, ConvolveBat auto sDims = signal.dims(); auto fDims = filter.dims(); - auto sStrides = signal.strides(); dim4 oDims(1); if (expand) { @@ -209,52 +51,11 @@ Array convolve(Array const& signal, Array const& filter, ConvolveBat Array out = createEmptyArray(oDims); - getQueue().enqueue(convolve_nd,out.get(), signal.get(), filter.get(), - oDims, sDims, fDims, out.strides(), sStrides, filter.strides(), kind); + getQueue().enqueue(kernel::convolve_nd,out, signal, filter, kind); return out; } -template -void convolve2_separable(T *optr, T const *iptr, accT const *fptr, - dim4 const &oDims, dim4 const &sDims, dim4 const &orgDims, dim_t fDim, - dim4 const &oStrides, dim4 const &sStrides, dim_t fStride) -{ - for(dim_t j=0; j>1); - - for(dim_t i=0; i>1); - - accT accum = scalar(0); - - for(dim_t f=0; f=0 && offi=0 && cj(0)); - } else { - dim_t offj = cj - f; - bool isCIValid = ci>=0 && ci=0 && offj(0)); - } - - accum += accT(s_val * f_val); - } - optr[iOff+jOff] = T(accum); - } - } -} - template Array convolve2(Array const& signal, Array const& c_filter, Array const& r_filter) { @@ -262,18 +63,16 @@ Array convolve2(Array const& signal, Array const& c_filter, Array convolve2(Array const& signal, Array const& c_filter, Array out = createEmptyArray(oDims); - auto func = [=] (Array out) { - Array temp = createEmptyArray(tDims); - auto tStrides = temp.strides(); - auto oStrides = out.strides(); - - for (dim_t b3=0; b3(tptr, iptr, c_filter.get(), - tDims, sDims, sDims, cflen, - tStrides, sStrides, c_filter.strides()[0]); - - convolve2_separable(optr, tptr, r_filter.get(), - oDims, tDims, sDims, rflen, - oStrides, tStrides, r_filter.strides()[0]); - } - } - }; - - getQueue().enqueue(func, out); + getQueue().enqueue(kernel::convolve2, out, signal, c_filter, r_filter, 
tDims); return out; } diff --git a/src/backend/cpu/fftconvolve.cpp b/src/backend/cpu/fftconvolve.cpp index 6172af86a6..2678c7b6f0 100644 --- a/src/backend/cpu/fftconvolve.cpp +++ b/src/backend/cpu/fftconvolve.cpp @@ -19,216 +19,11 @@ #include #include #include +#include namespace cpu { -template -void packData(Array out, const af::dim4 od, const af::dim4 os, Array const in) -{ - To* out_ptr = out.get(); - - const af::dim4 id = in.dims(); - const af::dim4 is = in.strides(); - const Ti* in_ptr = in.get(); - - int id0_half = divup(id[0], 2); - bool odd_id0 = (id[0] % 2 == 1); - - for (int d3 = 0; d3 < (int)od[3]; d3++) { - for (int d2 = 0; d2 < (int)od[2]; d2++) { - for (int d1 = 0; d1 < (int)od[1]; d1++) { - for (int d0 = 0; d0 < (int)od[0] / 2; d0++) { - const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; - - if (d0 < (int)id0_half && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) { - const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0; - out_ptr[oidx] = (To)in_ptr[iidx]; - if (d0 == id0_half-1 && odd_id0) - out_ptr[oidx+1] = (To)0; - else - out_ptr[oidx+1] = (To)in_ptr[iidx+id0_half]; - } - else { - // Pad remaining elements with 0s - out_ptr[oidx] = (To)0; - out_ptr[oidx+1] = (To)0; - } - } - } - } - } -} - -template -void padArray_(Array out, const af::dim4 od, const af::dim4 os, - Array const in, const dim_t offset) -{ - To* out_ptr = out.get() + offset; - const af::dim4 id = in.dims(); - const af::dim4 is = in.strides(); - const Ti* in_ptr = in.get(); - - for (int d3 = 0; d3 < (int)od[3]; d3++) { - for (int d2 = 0; d2 < (int)od[2]; d2++) { - for (int d1 = 0; d1 < (int)od[1]; d1++) { - for (int d0 = 0; d0 < (int)od[0] / 2; d0++) { - const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; - - if (d0 < (int)id[0] && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) { - // Copy input elements to real elements, set imaginary elements to 0 - const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0; - out_ptr[oidx] = (To)in_ptr[iidx]; - 
out_ptr[oidx+1] = (To)0; - } - else { - // Pad remaining of the matrix to 0s - out_ptr[oidx] = (To)0; - out_ptr[oidx+1] = (To)0; - } - } - } - } - } -} - -template -void complexMultiply(Array packed, const af::dim4 sig_dims, const af::dim4 sig_strides, - const af::dim4 fit_dims, const af::dim4 fit_strides, - ConvolveBatchKind kind, const dim_t offset) -{ - T* out_ptr = packed.get() + (kind==CONVOLVE_BATCH_KERNEL? offset : 0); - T* in1_ptr = packed.get(); - T* in2_ptr = packed.get() + offset; - - const dim4& od = (kind==CONVOLVE_BATCH_KERNEL ? fit_dims : sig_dims); - const dim4& os = (kind==CONVOLVE_BATCH_KERNEL ? fit_strides : sig_strides); - const dim4& i1d = sig_dims; - const dim4& i2d = fit_dims; - const dim4& i1s = sig_strides; - const dim4& i2s = fit_strides; - - for (int d3 = 0; d3 < (int)od[3]; d3++) { - for (int d2 = 0; d2 < (int)od[2]; d2++) { - for (int d1 = 0; d1 < (int)od[1]; d1++) { - for (int d0 = 0; d0 < (int)od[0] / 2; d0++) { - if (kind == CONVOLVE_BATCH_NONE || kind == CONVOLVE_BATCH_SAME) { - // Complex multiply each signal to equivalent filter - const int ridx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; - const int iidx = ridx + 1; - - T a = in1_ptr[ridx]; - T b = in1_ptr[iidx]; - T c = in2_ptr[ridx]; - T d = in2_ptr[iidx]; - - T ac = a*c; - T bd = b*d; - - out_ptr[ridx] = ac - bd; - out_ptr[iidx] = (a+b) * (c+d) - ac - bd; - } - else if (kind == CONVOLVE_BATCH_SIGNAL) { - // Complex multiply all signals to filter - const int ridx1 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; - const int iidx1 = ridx1 + 1; - const int ridx2 = ridx1 % (i2s[3] * i2d[3]); - const int iidx2 = iidx1 % (i2s[3] * i2d[3]); - - T a = in1_ptr[ridx1]; - T b = in1_ptr[iidx1]; - T c = in2_ptr[ridx2]; - T d = in2_ptr[iidx2]; - - T ac = a*c; - T bd = b*d; - - out_ptr[ridx1] = ac - bd; - out_ptr[iidx1] = (a+b) * (c+d) - ac - bd; - } - else if (kind == CONVOLVE_BATCH_KERNEL) { - // Complex multiply signal to all filters - const int ridx2 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; - const 
int iidx2 = ridx2 + 1; - const int ridx1 = ridx2 % (i1s[3] * i1d[3]); - const int iidx1 = iidx2 % (i1s[3] * i1d[3]); - - T a = in1_ptr[ridx1]; - T b = in1_ptr[iidx1]; - T c = in2_ptr[ridx2]; - T d = in2_ptr[iidx2]; - - T ac = a*c; - T bd = b*d; - - out_ptr[ridx2] = ac - bd; - out_ptr[iidx2] = (a+b) * (c+d) - ac - bd; - } - } - } - } - } -} - -template -void reorderOutput(To* out_ptr, const af::dim4& od, const af::dim4& os, - const Ti* in_ptr, const af::dim4& id, const af::dim4& is, - const af::dim4& fd, const int half_di0, const int baseDim, - const int fftScale, const bool expand) -{ - for (int d3 = 0; d3 < (int)od[3]; d3++) { - for (int d2 = 0; d2 < (int)od[2]; d2++) { - for (int d1 = 0; d1 < (int)od[1]; d1++) { - for (int d0 = 0; d0 < (int)od[0]; d0++) { - int id0, id1, id2, id3; - if (expand) { - id0 = d0; - id1 = d1 * is[1]; - id2 = d2 * is[2]; - id3 = d3 * is[3]; - } - else { - id0 = d0 + fd[0]/2; - id1 = (d1 + (baseDim > 1)*(fd[1]/2)) * is[1]; - id2 = (d2 + (baseDim > 2)*(fd[2]/2)) * is[2]; - id3 = d3 * is[3]; - } - - int oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0; - - // Divide output elements to cuFFT resulting scale, round result if output - // type is single or double precision floating-point - if (id0 < half_di0) { - // Copy top elements - int iidx = id3 + id2 + id1 + id0 * 2; - if (roundOut) - out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale)); - else - out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale); - } - else if (id0 < half_di0 + (int)fd[0] - 1) { - // Add signal and filter elements to central part - int iidx1 = id3 + id2 + id1 + id0 * 2; - int iidx2 = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1; - if (roundOut) - out_ptr[oidx] = (To)roundf((float)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale)); - else - out_ptr[oidx] = (To)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale); - } - else { - // Copy bottom elements - const int iidx = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1; - if (roundOut) - out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / 
fftScale)); - else - out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale); - } - } - } - } - } -} - template Array fftconvolve(Array const& signal, Array const& filter, const bool expand, ConvolveBatchKind kind) @@ -289,11 +84,11 @@ Array fftconvolve(Array const& signal, Array const& filter, // Pack signal in a complex matrix where first dimension is half the input // (allows faster FFT computation) and pad array to a power of 2 with 0s - getQueue().enqueue(packData, packed, sig_tmp_dims, sig_tmp_strides, signal); + getQueue().enqueue(kernel::packData, packed, sig_tmp_dims, sig_tmp_strides, signal); // Pad filter array with 0s const dim_t offset = sig_tmp_strides[3]*sig_tmp_dims[3]; - getQueue().enqueue(padArray_, packed, filter_tmp_dims, filter_tmp_strides, + getQueue().enqueue(kernel::padArray, packed, filter_tmp_dims, filter_tmp_strides, filter, offset); dim4 fftDims(1, 1, 1, 1); @@ -346,7 +141,7 @@ Array fftconvolve(Array const& signal, Array const& filter, getQueue().enqueue(upstream_dft, packed, fftDims); // Multiply filter and signal FFT arrays - getQueue().enqueue(complexMultiply, packed, + getQueue().enqueue(kernel::complexMultiply, packed, sig_tmp_dims, sig_tmp_strides, filter_tmp_dims, filter_tmp_strides, kind, offset); @@ -416,10 +211,10 @@ Array fftconvolve(Array const& signal, Array const& filter, Array out = createEmptyArray(oDims); - auto reorderFunc = [=] (Array out, Array packed, - const Array filter, const dim_t sig_hald_d0, const dim_t fftScale, - const dim4 sig_tmp_dims, const dim4 sig_tmp_strides, - const dim4 filter_tmp_dims, const dim4 filter_tmp_strides) { + auto reorderFunc = [=](Array out, Array packed, + const Array filter, const dim_t sig_hald_d0, const dim_t fftScale, + const dim4 sig_tmp_dims, const dim4 sig_tmp_strides, + const dim4 filter_tmp_dims, const dim4 filter_tmp_strides) { T* out_ptr = out.get(); const af::dim4 out_dims = out.dims(); const af::dim4 out_strides = out.strides(); @@ -432,17 +227,16 @@ Array fftconvolve(Array const& 
signal, Array const& filter, // Reorder the output if (kind == CONVOLVE_BATCH_KERNEL) { - reorderOutput - (out_ptr, out_dims, out_strides, - filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, - filter_dims, sig_half_d0, baseDim, fftScale, expand); + kernel::reorderHelper(out_ptr, out_dims, out_strides, + filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, + filter_dims, sig_half_d0, baseDim, fftScale, expand); } else { - reorderOutput - (out_ptr, out_dims, out_strides, - sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, - filter_dims, sig_half_d0, baseDim, fftScale, expand); + kernel::reorderHelper(out_ptr, out_dims, out_strides, + sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, + filter_dims, sig_half_d0, baseDim, fftScale, expand); } }; + getQueue().enqueue(reorderFunc, out, packed, filter, sig_half_d0, fftScale, sig_tmp_dims, sig_tmp_strides, filter_tmp_dims, filter_tmp_strides); diff --git a/src/backend/cpu/kernel/approx1.hpp b/src/backend/cpu/kernel/approx1.hpp index 63bae2d237..51c48048c1 100644 --- a/src/backend/cpu/kernel/approx1.hpp +++ b/src/backend/cpu/kernel/approx1.hpp @@ -9,6 +9,7 @@ #include #include + namespace cpu { namespace kernel diff --git a/src/backend/cpu/kernel/bilateral.hpp b/src/backend/cpu/kernel/bilateral.hpp new file mode 100644 index 0000000000..2b5764fd37 --- /dev/null +++ b/src/backend/cpu/kernel/bilateral.hpp @@ -0,0 +1,90 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + +inline +dim_t clamp(int a, dim_t mn, dim_t mx) +{ + return (a < (int)mn ? mn : (a > (int)mx ? 
mx : a)); +} + +inline +unsigned getIdx(const dim4 &strides, int i, int j = 0, int k = 0, int l = 0) +{ + return (l * strides[3] + k * strides[2] + j * strides[1] + i * strides[0]); +} + +template +void bilateral(Array out, const Array in, float s_sigma, float c_sigma) +{ + const dim4 dims = in.dims(); + const dim4 istrides = in.strides(); + + const dim4 ostrides = out.strides(); + + outType *outData = out.get(); + const inType *inData = in.get(); + + // clamp spatical and chromatic sigma's + float space_ = std::min(11.5f, std::max(s_sigma, 0.f)); + float color_ = std::max(c_sigma, 0.f); + const dim_t radius = std::max((dim_t)(space_ * 1.5f), (dim_t)1); + const float svar = space_*space_; + const float cvar = color_*color_; + + for(dim_t b3=0; b3 + +namespace cpu +{ +namespace kernel +{ + +using af::dim4; + +template +void one2one_1d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims, + dim4 const &sDims, dim4 const &fDims, dim4 const &sStrides) +{ + dim_t start = (expand ? 0 : fDims[0]/2); + dim_t end = (expand ? oDims[0] : start + sDims[0]); + for(dim_t i=start; i=0 &&iIdx +void one2one_2d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims, + dim4 const &sDims, dim4 const &fDims, dim4 const &oStrides, + dim4 const &sStrides, dim4 const &fStrides) +{ + dim_t jStart = (expand ? 0 : fDims[1]/2); + dim_t jEnd = (expand ? oDims[1] : jStart + sDims[1]); + dim_t iStart = (expand ? 0 : fDims[0]/2); + dim_t iEnd = (expand ? oDims[0] : iStart + sDims[0]); + + for(dim_t j=jStart; j=0 && jIdx=0 && iIdx +void one2one_3d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims, + dim4 const &sDims, dim4 const &fDims, dim4 const &oStrides, + dim4 const &sStrides, dim4 const &fStrides) +{ + dim_t kStart = (expand ? 0 : fDims[2]/2); + dim_t kEnd = (expand ? oDims[2] : kStart + sDims[2]); + dim_t jStart = (expand ? 0 : fDims[1]/2); + dim_t jEnd = (expand ? oDims[1] : jStart + sDims[1]); + dim_t iStart = (expand ? 0 : fDims[0]/2); + dim_t iEnd = (expand ? 
oDims[0] : iStart + sDims[0]); + + for(dim_t k=kStart; k=0 && kIdx=0 && jIdx=0 && iIdx +void convolve_nd(Array out, Array const signal, Array const filter, ConvolveBatchKind kind) +{ + T * optr = out.get(); + T const * const iptr = signal.get(); + accT const * const fptr = filter.get(); + + dim4 const oDims = out.dims(); + dim4 const sDims = signal.dims(); + dim4 const fDims = filter.dims(); + + dim4 const oStrides = out.strides(); + dim4 const sStrides = signal.strides(); + dim4 const fStrides = filter.strides(); + + dim_t out_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ + dim_t in_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ + dim_t filt_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ + dim_t batch[4] = {0, 1, 1, 1}; /* first value is never used, and declared for code simplicity */ + + for (dim_t i=1; i<4; ++i) { + switch(kind) { + case CONVOLVE_BATCH_SIGNAL: + out_step[i] = oStrides[i]; + in_step[i] = sStrides[i]; + if (i>=baseDim) batch[i] = sDims[i]; + break; + case CONVOLVE_BATCH_SAME: + out_step[i] = oStrides[i]; + in_step[i] = sStrides[i]; + filt_step[i] = fStrides[i]; + if (i>=baseDim) batch[i] = sDims[i]; + break; + case CONVOLVE_BATCH_KERNEL: + out_step[i] = oStrides[i]; + filt_step[i] = fStrides[i]; + if (i>=baseDim) batch[i] = fDims[i]; + break; + default: + break; + } + } + + for (dim_t b3=0; b3(out, in, filt, oDims, sDims, fDims, sStrides); break; + case 2: one2one_2d(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break; + case 3: one2one_3d(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break; + } + } + } + } +} + +template +void convolve2_separable(T *optr, T const *iptr, accT const *fptr, + dim4 const &oDims, dim4 const &sDims, dim4 const &orgDims, dim_t fDim, + dim4 const &oStrides, dim4 const &sStrides, dim_t fStride) +{ + for(dim_t j=0; j>1); + + for(dim_t i=0; i>1); + + accT 
accum = scalar(0); + + for(dim_t f=0; f=0 && offi=0 && cj(0)); + } else { + dim_t offj = cj - f; + bool isCIValid = ci>=0 && ci=0 && offj(0)); + } + + accum += accT(s_val * f_val); + } + optr[iOff+jOff] = T(accum); + } + } +} + +template +void convolve2(Array out, Array const signal, + Array const c_filter, Array const r_filter, + dim4 const tDims) +{ + Array temp = createEmptyArray(tDims); + + dim_t cflen = (dim_t)c_filter.elements(); + dim_t rflen = (dim_t)r_filter.elements(); + + auto oDims = out.dims(); + auto sDims = signal.dims(); + + auto oStrides = out.strides(); + auto sStrides = signal.strides(); + auto tStrides = temp.strides(); + + for (dim_t b3=0; b3(tptr, iptr, c_filter.get(), + tDims, sDims, sDims, cflen, + tStrides, sStrides, c_filter.strides()[0]); + + convolve2_separable(optr, tptr, r_filter.get(), + oDims, tDims, sDims, rflen, + oStrides, tStrides, r_filter.strides()[0]); + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/fftconvolve.hpp b/src/backend/cpu/kernel/fftconvolve.hpp new file mode 100644 index 0000000000..30bac668f1 --- /dev/null +++ b/src/backend/cpu/kernel/fftconvolve.hpp @@ -0,0 +1,227 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include + +namespace cpu +{ +namespace kernel +{ + +using af::dim4; + +template +void packData(Array out, const af::dim4 od, const af::dim4 os, Array const in) +{ + To* out_ptr = out.get(); + + const af::dim4 id = in.dims(); + const af::dim4 is = in.strides(); + const Ti* in_ptr = in.get(); + + int id0_half = divup(id[0], 2); + bool odd_id0 = (id[0] % 2 == 1); + + for (int d3 = 0; d3 < (int)od[3]; d3++) { + for (int d2 = 0; d2 < (int)od[2]; d2++) { + for (int d1 = 0; d1 < (int)od[1]; d1++) { + for (int d0 = 0; d0 < (int)od[0] / 2; d0++) { + const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; + + if (d0 < (int)id0_half && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) { + const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0; + out_ptr[oidx] = (To)in_ptr[iidx]; + if (d0 == id0_half-1 && odd_id0) + out_ptr[oidx+1] = (To)0; + else + out_ptr[oidx+1] = (To)in_ptr[iidx+id0_half]; + } + else { + // Pad remaining elements with 0s + out_ptr[oidx] = (To)0; + out_ptr[oidx+1] = (To)0; + } + } + } + } + } +} + +template +void padArray(Array out, const af::dim4 od, const af::dim4 os, + Array const in, const dim_t offset) +{ + To* out_ptr = out.get() + offset; + const af::dim4 id = in.dims(); + const af::dim4 is = in.strides(); + const Ti* in_ptr = in.get(); + + for (int d3 = 0; d3 < (int)od[3]; d3++) { + for (int d2 = 0; d2 < (int)od[2]; d2++) { + for (int d1 = 0; d1 < (int)od[1]; d1++) { + for (int d0 = 0; d0 < (int)od[0] / 2; d0++) { + const dim_t oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; + + if (d0 < (int)id[0] && d1 < (int)id[1] && d2 < (int)id[2] && d3 < (int)id[3]) { + // Copy input elements to real elements, set imaginary elements to 0 + const dim_t iidx = d3*is[3] + d2*is[2] + d1*is[1] + d0; + out_ptr[oidx] = (To)in_ptr[iidx]; + out_ptr[oidx+1] = (To)0; + } + else { + // Pad 
remaining of the matrix to 0s + out_ptr[oidx] = (To)0; + out_ptr[oidx+1] = (To)0; + } + } + } + } + } +} + +template +void complexMultiply(Array packed, const af::dim4 sig_dims, const af::dim4 sig_strides, + const af::dim4 fit_dims, const af::dim4 fit_strides, + ConvolveBatchKind kind, const dim_t offset) +{ + T* out_ptr = packed.get() + (kind==CONVOLVE_BATCH_KERNEL? offset : 0); + T* in1_ptr = packed.get(); + T* in2_ptr = packed.get() + offset; + + const dim4& od = (kind==CONVOLVE_BATCH_KERNEL ? fit_dims : sig_dims); + const dim4& os = (kind==CONVOLVE_BATCH_KERNEL ? fit_strides : sig_strides); + const dim4& i1d = sig_dims; + const dim4& i2d = fit_dims; + const dim4& i1s = sig_strides; + const dim4& i2s = fit_strides; + + for (int d3 = 0; d3 < (int)od[3]; d3++) { + for (int d2 = 0; d2 < (int)od[2]; d2++) { + for (int d1 = 0; d1 < (int)od[1]; d1++) { + for (int d0 = 0; d0 < (int)od[0] / 2; d0++) { + if (kind == CONVOLVE_BATCH_NONE || kind == CONVOLVE_BATCH_SAME) { + // Complex multiply each signal to equivalent filter + const int ridx = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; + const int iidx = ridx + 1; + + T a = in1_ptr[ridx]; + T b = in1_ptr[iidx]; + T c = in2_ptr[ridx]; + T d = in2_ptr[iidx]; + + T ac = a*c; + T bd = b*d; + + out_ptr[ridx] = ac - bd; + out_ptr[iidx] = (a+b) * (c+d) - ac - bd; + } + else if (kind == CONVOLVE_BATCH_SIGNAL) { + // Complex multiply all signals to filter + const int ridx1 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; + const int iidx1 = ridx1 + 1; + const int ridx2 = ridx1 % (i2s[3] * i2d[3]); + const int iidx2 = iidx1 % (i2s[3] * i2d[3]); + + T a = in1_ptr[ridx1]; + T b = in1_ptr[iidx1]; + T c = in2_ptr[ridx2]; + T d = in2_ptr[iidx2]; + + T ac = a*c; + T bd = b*d; + + out_ptr[ridx1] = ac - bd; + out_ptr[iidx1] = (a+b) * (c+d) - ac - bd; + } + else if (kind == CONVOLVE_BATCH_KERNEL) { + // Complex multiply signal to all filters + const int ridx2 = d3*os[3] + d2*os[2] + d1*os[1] + d0*2; + const int iidx2 = ridx2 + 1; + const int ridx1 = 
ridx2 % (i1s[3] * i1d[3]); + const int iidx1 = iidx2 % (i1s[3] * i1d[3]); + + T a = in1_ptr[ridx1]; + T b = in1_ptr[iidx1]; + T c = in2_ptr[ridx2]; + T d = in2_ptr[iidx2]; + + T ac = a*c; + T bd = b*d; + + out_ptr[ridx2] = ac - bd; + out_ptr[iidx2] = (a+b) * (c+d) - ac - bd; + } + } + } + } + } +} + +template +void reorderHelper(To* out_ptr, const af::dim4& od, const af::dim4& os, + const Ti* in_ptr, const af::dim4& id, const af::dim4& is, + const af::dim4& fd, const int half_di0, const int baseDim, + const int fftScale, const bool expand) +{ + for (int d3 = 0; d3 < (int)od[3]; d3++) { + for (int d2 = 0; d2 < (int)od[2]; d2++) { + for (int d1 = 0; d1 < (int)od[1]; d1++) { + for (int d0 = 0; d0 < (int)od[0]; d0++) { + int id0, id1, id2, id3; + if (expand) { + id0 = d0; + id1 = d1 * is[1]; + id2 = d2 * is[2]; + id3 = d3 * is[3]; + } + else { + id0 = d0 + fd[0]/2; + id1 = (d1 + (baseDim > 1)*(fd[1]/2)) * is[1]; + id2 = (d2 + (baseDim > 2)*(fd[2]/2)) * is[2]; + id3 = d3 * is[3]; + } + + int oidx = d3*os[3] + d2*os[2] + d1*os[1] + d0; + + // Divide output elements to cuFFT resulting scale, round result if output + // type is single or double precision floating-point + if (id0 < half_di0) { + // Copy top elements + int iidx = id3 + id2 + id1 + id0 * 2; + if (roundOut) + out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale)); + else + out_ptr[oidx] = (To)(in_ptr[iidx] / fftScale); + } + else if (id0 < half_di0 + (int)fd[0] - 1) { + // Add signal and filter elements to central part + int iidx1 = id3 + id2 + id1 + id0 * 2; + int iidx2 = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1; + if (roundOut) + out_ptr[oidx] = (To)roundf((float)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale)); + else + out_ptr[oidx] = (To)((in_ptr[iidx1] + in_ptr[iidx2]) / fftScale); + } + else { + // Copy bottom elements + const int iidx = id3 + id2 + id1 + (id0 - half_di0) * 2 + 1; + if (roundOut) + out_ptr[oidx] = (To)roundf((float)(in_ptr[iidx] / fftScale)); + else + out_ptr[oidx] = 
(To)(in_ptr[iidx] / fftScale); + } + } + } + } + } +} + +} +} From e8f0242168e24f24d432606fc5974900e6ea206a Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 18 Dec 2015 17:33:19 -0500 Subject: [PATCH 079/288] moved copy queue fns from cpu backend to kernel namespace --- src/backend/cpu/copy.cpp | 79 ++--------------------------- src/backend/cpu/kernel/copy.hpp | 90 +++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 74 deletions(-) create mode 100644 src/backend/cpu/kernel/copy.hpp diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index eef5e0e302..84cb0d1a54 100644 --- a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp @@ -20,32 +20,11 @@ #include #include #include +#include namespace cpu { -template -static void stridedCopy(T* dst, const dim4& ostrides, const T* src, const dim4 &dims, const dim4 &strides, unsigned dim) -{ - if(dim == 0) { - if(strides[dim] == 1) { - //FIXME: Check for errors / exceptions - memcpy(dst, src, dims[dim] * sizeof(T)); - } else { - for(dim_t i = 0; i < dims[dim]; i++) { - dst[i] = src[strides[dim]*i]; - } - } - } else { - for(dim_t i = dims[dim]; i > 0; i--) { - stridedCopy(dst, ostrides, src, dims, strides, dim - 1); - src += strides[dim]; - dst += ostrides[dim]; - } - } -} - -// Assigns to single elements template void copyData(T *to, const Array &from) { @@ -56,7 +35,7 @@ void copyData(T *to, const Array &from) memcpy(to, from.get(), from.elements()*sizeof(T)); } else { dim4 ostrides = calcStrides(from.dims()); - stridedCopy(to, ostrides, from.get(), from.dims(), from.strides(), from.ndims() - 1); + kernel::stridedCopy(to, ostrides, from.get(), from.dims(), from.strides(), from.ndims() - 1); } } @@ -68,59 +47,11 @@ Array copyArray(const Array &A) return out; } -template -static void copy(Array dst, const Array src, outType default_value, double factor) -{ - dim4 src_dims = src.dims(); - dim4 dst_dims = dst.dims(); - dim4 src_strides = src.strides(); - dim4 dst_strides = dst.strides(); - - 
const inType * src_ptr = src.get(); - outType * dst_ptr = dst.get(); - - dim_t trgt_l = std::min(dst_dims[3], src_dims[3]); - dim_t trgt_k = std::min(dst_dims[2], src_dims[2]); - dim_t trgt_j = std::min(dst_dims[1], src_dims[1]); - dim_t trgt_i = std::min(dst_dims[0], src_dims[0]); - - for(dim_t l=0; l void multiply_inplace(Array &in, double val) { in.eval(); - getQueue().enqueue(copy, in, in, 0, val); + getQueue().enqueue(kernel::copy, in, in, 0, val); } template @@ -132,7 +63,7 @@ Array padArray(Array const &in, dim4 const &dims, in.eval(); // FIXME: getQueue().sync(); - getQueue().enqueue(copy, ret, in, outType(default_value), factor); + getQueue().enqueue(kernel::copy, ret, in, outType(default_value), factor); return ret; } @@ -141,7 +72,7 @@ void copyArray(Array &out, Array const &in) { out.eval(); in.eval(); - getQueue().enqueue(copy, out, in, scalar(0), 1.0); + getQueue().enqueue(kernel::copy, out, in, scalar(0), 1.0); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/kernel/copy.hpp b/src/backend/cpu/kernel/copy.hpp new file mode 100644 index 0000000000..063fb29f0c --- /dev/null +++ b/src/backend/cpu/kernel/copy.hpp @@ -0,0 +1,90 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + +using af::dim4; + +template +void stridedCopy(T* dst, const dim4& ostrides, const T* src, + const dim4 &dims, const dim4 &strides, unsigned dim) +{ + if(dim == 0) { + if(strides[dim] == 1) { + //FIXME: Check for errors / exceptions + memcpy(dst, src, dims[dim] * sizeof(T)); + } else { + for(dim_t i = 0; i < dims[dim]; i++) { + dst[i] = src[strides[dim]*i]; + } + } + } else { + for(dim_t i = dims[dim]; i > 0; i--) { + stridedCopy(dst, ostrides, src, dims, strides, dim - 1); + src += strides[dim]; + dst += ostrides[dim]; + } + } +} + +template +void copy(Array dst, const Array src, outType default_value, double factor) +{ + dim4 src_dims = src.dims(); + dim4 dst_dims = dst.dims(); + dim4 src_strides = src.strides(); + dim4 dst_strides = dst.strides(); + + const inType * src_ptr = src.get(); + outType * dst_ptr = dst.get(); + + dim_t trgt_l = std::min(dst_dims[3], src_dims[3]); + dim_t trgt_k = std::min(dst_dims[2], src_dims[2]); + dim_t trgt_j = std::min(dst_dims[1], src_dims[1]); + dim_t trgt_i = std::min(dst_dims[0], src_dims[0]); + + for(dim_t l=0; l Date: Fri, 18 Dec 2015 17:46:19 -0500 Subject: [PATCH 080/288] Moved diagonal cpu implementation to kernel namespace --- src/backend/cpu/diagonal.cpp | 43 ++---------------- src/backend/cpu/kernel/diagonal.hpp | 67 +++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 40 deletions(-) create mode 100644 src/backend/cpu/kernel/diagonal.hpp diff --git a/src/backend/cpu/diagonal.cpp b/src/backend/cpu/diagonal.cpp index 9af78459c1..6c20f2e7f2 100644 --- a/src/backend/cpu/diagonal.cpp +++ b/src/backend/cpu/diagonal.cpp @@ -17,6 +17,7 @@ #include #include #include +#include namespace cpu { @@ -30,25 +31,7 @@ Array diagCreate(const Array &in, const int num) int batch = in.dims()[1]; Array out = 
createEmptyArray(dim4(size, size, batch)); - auto func = [=] (Array out, const Array in) { - const T *iptr = in.get(); - T *optr = out.get(); - - for (int k = 0; k < batch; k++) { - for (int j = 0; j < size; j++) { - for (int i = 0; i < size; i++) { - T val = scalar(0); - if (i == j - num) { - val = (num > 0) ? iptr[i] : iptr[j]; - } - optr[i + j * out.strides()[1]] = val; - } - } - optr += out.strides()[2]; - iptr += in.strides()[1]; - } - }; - getQueue().enqueue(func, out, in); + getQueue().enqueue(kernel::diagCreate, out, in, num); return out; } @@ -62,27 +45,7 @@ Array diagExtract(const Array &in, const int num) dim_t size = std::max(idims[0], idims[1]) - std::abs(num); Array out = createEmptyArray(dim4(size, 1, idims[2], idims[3])); - auto func = [=] (Array out, const Array in) { - const dim4 odims = out.dims(); - - const int i_off = (num > 0) ? (num * in.strides()[1]) : (-num); - - for (int l = 0; l < (int)odims[3]; l++) { - - for (int k = 0; k < (int)odims[2]; k++) { - const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off; - T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2]; - - for (int i = 0; i < (int)odims[0]; i++) { - T val = scalar(0); - if (i < idims[0] && i < idims[1]) val = iptr[i * in.strides()[1] + i]; - optr[i] = val; - } - } - } - }; - - getQueue().enqueue(func, out, in); + getQueue().enqueue(kernel::diagExtract, out, in, num); return out; } diff --git a/src/backend/cpu/kernel/diagonal.hpp b/src/backend/cpu/kernel/diagonal.hpp new file mode 100644 index 0000000000..596080b108 --- /dev/null +++ b/src/backend/cpu/kernel/diagonal.hpp @@ -0,0 +1,67 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + +using af::dim4; + +template +void diagCreate(Array out, Array const in, int const num) +{ + int batch = in.dims()[1]; + int size = out.dims()[0]; + + const T *iptr = in.get(); + T *optr = out.get(); + + for (int k = 0; k < batch; k++) { + for (int j = 0; j < size; j++) { + for (int i = 0; i < size; i++) { + T val = scalar(0); + if (i == j - num) { + val = (num > 0) ? iptr[i] : iptr[j]; + } + optr[i + j * out.strides()[1]] = val; + } + } + optr += out.strides()[2]; + iptr += in.strides()[1]; + } +} + +template +void diagExtract(Array out, Array const in, int const num) +{ + const dim4 odims = out.dims(); + const dim4 idims = in.dims(); + + const int i_off = (num > 0) ? (num * in.strides()[1]) : (-num); + + for (int l = 0; l < (int)odims[3]; l++) { + + for (int k = 0; k < (int)odims[2]; k++) { + const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off; + T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2]; + + for (int i = 0; i < (int)odims[0]; i++) { + T val = scalar(0); + if (i < idims[0] && i < idims[1]) val = iptr[i * in.strides()[1] + i]; + optr[i] = val; + } + } + } +} + +} +} From 71298c69887f23b3f18354098b6cadd62968158b Mon Sep 17 00:00:00 2001 From: pradeep Date: Sat, 19 Dec 2015 00:43:53 -0500 Subject: [PATCH 081/288] moved diff, fast, gradient, harris, histogram to kernel namespace --- src/backend/cpu/diff.cpp | 75 +-------- src/backend/cpu/fast.cpp | 227 +------------------------- src/backend/cpu/gradient.cpp | 69 +------- src/backend/cpu/harris.cpp | 133 +--------------- src/backend/cpu/histogram.cpp | 32 +--- src/backend/cpu/kernel/diff.hpp | 91 +++++++++++ src/backend/cpu/kernel/fast.hpp | 228 +++++++++++++++++++++++++++ src/backend/cpu/kernel/gradient.hpp | 87 ++++++++++ src/backend/cpu/kernel/harris.hpp 
| 139 ++++++++++++++++ src/backend/cpu/kernel/histogram.hpp | 47 ++++++ 10 files changed, 611 insertions(+), 517 deletions(-) create mode 100644 src/backend/cpu/kernel/diff.hpp create mode 100644 src/backend/cpu/kernel/fast.hpp create mode 100644 src/backend/cpu/kernel/gradient.hpp create mode 100644 src/backend/cpu/kernel/harris.hpp create mode 100644 src/backend/cpu/kernel/histogram.hpp diff --git a/src/backend/cpu/diff.cpp b/src/backend/cpu/diff.cpp index 8f9c0f13be..3f639ca46f 100644 --- a/src/backend/cpu/diff.cpp +++ b/src/backend/cpu/diff.cpp @@ -9,62 +9,25 @@ #include #include -#include -#include #include #include +#include namespace cpu { -unsigned getIdx(af::dim4 strides, af::dim4 offs, int i, int j = 0, int k = 0, int l = 0) -{ - return (l * strides[3] + - k * strides[2] + - j * strides[1] + - i); -} - template Array diff1(const Array &in, const int dim) { in.eval(); - // Bool for dimension - bool is_dim0 = dim == 0; - bool is_dim1 = dim == 1; - bool is_dim2 = dim == 2; - bool is_dim3 = dim == 3; // Decrement dimension of select dimension af::dim4 dims = in.dims(); dims[dim]--; - // Create output placeholder Array outArray = createEmptyArray(dims); - auto func = [=] (Array outArray, Array in) { - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = outArray.get(); - - // TODO: Improve this - for(dim_t l = 0; l < dims[3]; l++) { - for(dim_t k = 0; k < dims[2]; k++) { - for(dim_t j = 0; j < dims[1]; j++) { - for(dim_t i = 0; i < dims[0]; i++) { - // Operation: out[index] = in[index + 1 * dim_size] - in[index] - int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); - int jdx = getIdx(in.strides(), in.offsets(), - i + is_dim0, j + is_dim1, - k + is_dim2, l + is_dim3); - int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l); - outPtr[odx] = inPtr[jdx] - inPtr[idx]; - } - } - } - } - }; - getQueue().enqueue(func, outArray, in); + getQueue().enqueue(kernel::diff1, outArray, in, dim); return outArray; } @@ -73,46 +36,14 @@ 
template Array diff2(const Array &in, const int dim) { in.eval(); - // Bool for dimension - bool is_dim0 = dim == 0; - bool is_dim1 = dim == 1; - bool is_dim2 = dim == 2; - bool is_dim3 = dim == 3; // Decrement dimension of select dimension af::dim4 dims = in.dims(); dims[dim] -= 2; - // Create output placeholder Array outArray = createEmptyArray(dims); - auto func = [=] (Array outArray, Array in) { - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = outArray.get(); - - // TODO: Improve this - for(dim_t l = 0; l < dims[3]; l++) { - for(dim_t k = 0; k < dims[2]; k++) { - for(dim_t j = 0; j < dims[1]; j++) { - for(dim_t i = 0; i < dims[0]; i++) { - // Operation: out[index] = in[index + 1 * dim_size] - in[index] - int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); - int jdx = getIdx(in.strides(), in.offsets(), - i + is_dim0, j + is_dim1, - k + is_dim2, l + is_dim3); - int kdx = getIdx(in.strides(), in.offsets(), - i + 2 * is_dim0, j + 2 * is_dim1, - k + 2 * is_dim2, l + 2 * is_dim3); - int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l); - outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx]; - } - } - } - } - }; - - getQueue().enqueue(func, outArray, in); + getQueue().enqueue(kernel::diff2, outArray, in, dim); return outArray; } diff --git a/src/backend/cpu/fast.cpp b/src/backend/cpu/fast.cpp index c8b0514610..fe02387102 100644 --- a/src/backend/cpu/fast.cpp +++ b/src/backend/cpu/fast.cpp @@ -16,234 +16,13 @@ #include #include #include +#include using af::dim4; namespace cpu { -inline int clamp(int f, int a, int b) -{ - return std::max(a, std::min(f, b)); -} - -inline int idx_y(int i) -{ - if (i >= 8) - return clamp(-(i-8-4), -3, 3); - - return clamp(i-4, -3, 3); -} - -inline int idx_x(int i) -{ - if (i < 12) - return idx_y(i+4); - - return idx_y(i-12); -} - -inline int idx(int y, int x, unsigned idim0) -{ - return x * idim0 + y; -} - -// test_greater() -// Tests if a pixel x > p + thr -inline int 
test_greater(float x, float p, float thr) -{ - return (x >= p + thr); -} - -// test_smaller() -// Tests if a pixel x < p - thr -inline int test_smaller(float x, float p, float thr) -{ - return (x <= p - thr); -} - -// test_pixel() -// Returns -1 when x < p - thr -// Returns 0 when x >= p - thr && x <= p + thr -// Returns 1 when x > p + thr -template -inline int test_pixel(const T* image, const float p, float thr, int y, int x, unsigned idim0) -{ - return -test_smaller((float)image[idx(y,x,idim0)], p, thr) | test_greater((float)image[idx(y,x,idim0)], p, thr); -} - -// abs_diff() -// Returns absolute difference of x and y -inline int abs_diff(int x, int y) -{ - return abs(x - y); -} -inline unsigned abs_diff(unsigned x, unsigned y) -{ - return (unsigned)abs((int)x - (int)y); -} -inline float abs_diff(float x, float y) -{ - return fabs(x - y); -} -inline double abs_diff(double x, double y) -{ - return fabs(x - y); -} - -template -void locate_features( - const Array &in, - Array &score, - Array &x_out, - Array &y_out, - Array &score_out, - unsigned* count, - const float thr, - const unsigned arc_length, - const unsigned nonmax, - const unsigned max_feat, - const unsigned edge) -{ - dim4 in_dims = in.dims(); - const T* in_ptr = in.get(); - - for (int y = edge; y < (int)(in_dims[0] - edge); y++) { - for (int x = edge; x < (int)(in_dims[1] - edge); x++) { - float p = in_ptr[idx(y, x, in_dims[0])]; - - // Start by testing opposite pixels of the circle that will result in - // a non-kepoint - int d; - d = test_pixel(in_ptr, p, thr, y-3, x, in_dims[0]) | test_pixel(in_ptr, p, thr, y+3, x, in_dims[0]); - if (d == 0) - continue; - - d &= test_pixel(in_ptr, p, thr, y-2, x+2, in_dims[0]) | test_pixel(in_ptr, p, thr, y+2, x-2, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y , x+3, in_dims[0]) | test_pixel(in_ptr, p, thr, y , x-3, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y+2, x+2, in_dims[0]) | test_pixel(in_ptr, p, thr, y-2, x-2, in_dims[0]); - if (d == 0) - continue; - - 
d &= test_pixel(in_ptr, p, thr, y-3, x+1, in_dims[0]) | test_pixel(in_ptr, p, thr, y+3, x-1, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y-1, x+3, in_dims[0]) | test_pixel(in_ptr, p, thr, y+1, x-3, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y+1, x+3, in_dims[0]) | test_pixel(in_ptr, p, thr, y-1, x-3, in_dims[0]); - d &= test_pixel(in_ptr, p, thr, y+3, x+1, in_dims[0]) | test_pixel(in_ptr, p, thr, y-3, x-1, in_dims[0]); - if (d == 0) - continue; - - int sum = 0; - - // Sum responses [-1, 0 or 1] of first arc_length pixels - for (int i = 0; i < static_cast(arc_length); i++) - sum += test_pixel(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]); - - // Test maximum and mininmum responses of first segment of arc_length - // pixels - int max_sum = 0, min_sum = 0; - max_sum = std::max(max_sum, sum); - min_sum = std::min(min_sum, sum); - - // Sum responses and test the remaining 16-arc_length pixels of the circle - for (int i = arc_length; i < 16; i++) { - sum -= test_pixel(in_ptr, p, thr, y+idx_y(i-arc_length), x+idx_x(i-arc_length), in_dims[0]); - sum += test_pixel(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]); - max_sum = std::max(max_sum, sum); - min_sum = std::min(min_sum, sum); - } - - // To completely test all possible segments, it's necessary to test - // segments that include the top junction of the circle - for (int i = 0; i < static_cast(arc_length-1); i++) { - sum -= test_pixel(in_ptr, p, thr, y+idx_y(16-arc_length+i), x+idx_x(16-arc_length+i), in_dims[0]); - sum += test_pixel(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]); - max_sum = std::max(max_sum, sum); - min_sum = std::min(min_sum, sum); - } - - float s_bright = 0, s_dark = 0; - for (int i = 0; i < 16; i++) { - float p_x = (float)in_ptr[idx(y+idx_y(i), x+idx_x(i), in_dims[0])]; - - s_bright += test_greater(p_x, p, thr) * (abs_diff(p_x, p) - thr); - s_dark += test_smaller(p_x, p, thr) * (abs_diff(p, p_x) - thr); - } - - // If sum at some point was equal to (+-)arc_length, there is a 
segment - // that for which all pixels are much brighter or much brighter than - // central pixel p. - if (max_sum == static_cast(arc_length) || min_sum == -static_cast(arc_length)) { - unsigned j = *count; - ++*count; - if (j < max_feat) { - float *x_out_ptr = x_out.get(); - float *y_out_ptr = y_out.get(); - float *score_out_ptr = score_out.get(); - x_out_ptr[j] = static_cast(x); - y_out_ptr[j] = static_cast(y); - score_out_ptr[j] = static_cast(std::max(s_bright, s_dark)); - if (nonmax == 1) { - float* score_ptr = score.get(); - score_ptr[idx(y, x, in_dims[0])] = std::max(s_bright, s_dark); - } - } - } - } - } -} - -void non_maximal( - const Array &score, - const Array &x_in, - const Array &y_in, - Array &x_out, - Array &y_out, - Array &score_out, - unsigned* count, - const unsigned total_feat, - const unsigned edge) -{ - const float *score_ptr = score.get(); - const float *x_in_ptr = x_in.get(); - const float *y_in_ptr = y_in.get(); - - dim4 score_dims = score.dims(); - - for (unsigned k = 0; k < total_feat; k++) { - unsigned x = static_cast(round(x_in_ptr[k])); - unsigned y = static_cast(round(y_in_ptr[k])); - - float v = score_ptr[y + score_dims[0] * x]; - float max_v; - max_v = std::max(score_ptr[y-1 + score_dims[0] * (x-1)], score_ptr[y-1 + score_dims[0] * x]); - max_v = std::max(max_v, score_ptr[y-1 + score_dims[0] * (x+1)]); - max_v = std::max(max_v, score_ptr[y + score_dims[0] * (x-1)]); - max_v = std::max(max_v, score_ptr[y + score_dims[0] * (x+1)]); - max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x-1)]); - max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x) ]); - max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x+1)]); - - if (y >= score_dims[1] - edge - 1 || y <= edge + 1 || - x >= score_dims[0] - edge - 1 || x <= edge + 1) - continue; - - // Stores keypoint to feat_out if it's response is maximum compared to - // its 8-neighborhood - if (v > max_v) { - unsigned j = *count; - ++*count; - - float *x_out_ptr = x_out.get(); - 
float *y_out_ptr = y_out.get(); - float *score_out_ptr = score_out.get(); - - x_out_ptr[j] = static_cast(x); - y_out_ptr[j] = static_cast(y); - score_out_ptr[j] = static_cast(v); - } - } -} - template unsigned fast(Array &x_out, Array &y_out, Array &score_out, const Array &in, const float thr, const unsigned arc_length, @@ -274,7 +53,7 @@ unsigned fast(Array &x_out, Array &y_out, Array &score_out, // Feature counter unsigned count = 0; - locate_features(in, V, x, y, score, &count, thr, arc_length, + kernel::locate_features(in, V, x, y, score, &count, thr, arc_length, nonmax, max_feat, edge); // If more features than max_feat were detected, feat wasn't populated @@ -293,7 +72,7 @@ unsigned fast(Array &x_out, Array &y_out, Array &score_out, score_total = createEmptyArray(feat_found_dims); count = 0; - non_maximal(V, x, y, + kernel::non_maximal(V, x, y, x_total, y_total, score_total, &count, feat_found, edge); diff --git a/src/backend/cpu/gradient.cpp b/src/backend/cpu/gradient.cpp index 06c15cff4e..d1a8b0d2c9 100644 --- a/src/backend/cpu/gradient.cpp +++ b/src/backend/cpu/gradient.cpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace cpu { @@ -25,73 +26,7 @@ void gradient(Array &grad0, Array &grad1, const Array &in) grad1.eval(); in.eval(); - auto func = [=] (Array grad0, Array grad1, const Array in) { - const af::dim4 dims = in.dims(); - - T *d_grad0 = grad0.get(); - T *d_grad1 = grad1.get(); - const T *d_in = in.get(); - - const af::dim4 inst = in.strides(); - const af::dim4 g0st = grad0.strides(); - const af::dim4 g1st = grad1.strides(); - - T v5 = scalar(0.5); - T v1 = scalar(1.0); - - for(dim_t idw = 0; idw < dims[3]; idw++) { - const dim_t inW = idw * inst[3]; - const dim_t g0W = idw * g0st[3]; - const dim_t g1W = idw * g1st[3]; - for(dim_t idz = 0; idz < dims[2]; idz++) { - const dim_t inZW = inW + idz * inst[2]; - const dim_t g0ZW = g0W + idz * g0st[2]; - const dim_t g1ZW = g1W + idz * g1st[2]; - dim_t xl, xr, yl,yr; - T f0, f1; - for(dim_t idy 
= 0; idy < dims[1]; idy++) { - const dim_t inYZW = inZW + idy * inst[1]; - const dim_t g0YZW = g0ZW + idy * g0st[1]; - const dim_t g1YZW = g1ZW + idy * g1st[1]; - if(idy == 0) { - yl = inYZW + inst[1]; - yr = inYZW; - f1 = v1; - } else if(idy == dims[1] - 1) { - yl = inYZW; - yr = inYZW - inst[1]; - f1 = v1; - } else { - yl = inYZW + inst[1]; - yr = inYZW - inst[1]; - f1 = v5; - } - for(dim_t idx = 0; idx < dims[0]; idx++) { - const dim_t inMem = inYZW + idx; - const dim_t g0Mem = g0YZW + idx; - const dim_t g1Mem = g1YZW + idx; - if(idx == 0) { - xl = inMem + 1; - xr = inMem; - f0 = v1; - } else if(idx == dims[0] - 1) { - xl = inMem; - xr = inMem - 1; - f0 = v1; - } else { - xl = inMem + 1; - xr = inMem - 1; - f0 = v5; - } - - d_grad0[g0Mem] = f0 * (d_in[xl] - d_in[xr]); - d_grad1[g1Mem] = f1 * (d_in[yl + idx] - d_in[yr + idx]); - } - } - } - } - }; - getQueue().enqueue(func, grad0, grad1, in); + getQueue().enqueue(kernel::gradient, grad0, grad1, in); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/harris.cpp b/src/backend/cpu/harris.cpp index b57b94025d..e5ff906dd6 100644 --- a/src/backend/cpu/harris.cpp +++ b/src/backend/cpu/harris.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -21,133 +20,13 @@ #include #include #include +#include using af::dim4; namespace cpu { -template -void gaussian1D(T* out, const int dim, double sigma=0.0) -{ - if(!(sigma>0)) sigma = 0.25*dim; - - T sum = (T)0; - for(int i=0;i -void second_order_deriv(Array ixx, Array ixy, Array iyy, - const unsigned in_len, const Array ix, const Array iy) -{ - T* ixx_out = ixx.get(); - T* ixy_out = ixy.get(); - T* iyy_out = iyy.get(); - const T* ix_in = ix.get(); - const T* iy_in = iy.get(); - for (unsigned x = 0; x < in_len; x++) { - ixx_out[x] = ix_in[x] * ix_in[x]; - ixy_out[x] = ix_in[x] * iy_in[x]; - iyy_out[x] = iy_in[x] * iy_in[x]; - } -} - -template -void harris_responses(Array resp, const unsigned idim0, const unsigned idim1, - const Array ixx, 
const Array ixy, const Array iyy, - const float k_thr, const unsigned border_len) -{ - T* resp_out = resp.get(); - const T* ixx_in = ixx.get(); - const T* ixy_in = ixy.get(); - const T* iyy_in = iyy.get(); - const unsigned r = border_len; - - for (unsigned x = r; x < idim1 - r; x++) { - for (unsigned y = r; y < idim0 - r; y++) { - const unsigned idx = x * idim0 + y; - - // Calculates matrix trace and determinant - T tr = ixx_in[idx] + iyy_in[idx]; - T det = ixx_in[idx] * iyy_in[idx] - ixy_in[idx] * ixy_in[idx]; - - // Calculates local Harris response - resp_out[idx] = det - k_thr * (tr*tr); - } - } -} - -template -void non_maximal(Array xOut, Array yOut, Array respOut, unsigned* count, - const unsigned idim0, const unsigned idim1, const Array respIn, - const float min_resp, const unsigned border_len, const unsigned max_corners) -{ - float* x_out = xOut.get(); - float* y_out = yOut.get(); - float* resp_out = respOut.get(); - const T* resp_in = respIn.get(); - // Responses on the border don't have 8-neighbors to compare, discard them - const unsigned r = border_len + 1; - - for (unsigned x = r; x < idim1 - r; x++) { - for (unsigned y = r; y < idim0 - r; y++) { - const T v = resp_in[x * idim0 + y]; - - // Find maximum neighborhood response - T max_v; - max_v = max(resp_in[(x-1) * idim0 + y-1], resp_in[x * idim0 + y-1]); - max_v = max(max_v, resp_in[(x+1) * idim0 + y-1]); - max_v = max(max_v, resp_in[(x-1) * idim0 + y ]); - max_v = max(max_v, resp_in[(x+1) * idim0 + y ]); - max_v = max(max_v, resp_in[(x-1) * idim0 + y+1]); - max_v = max(max_v, resp_in[(x) * idim0 + y+1]); - max_v = max(max_v, resp_in[(x+1) * idim0 + y+1]); - - // Stores corner to {x,y,resp}_out if it's response is maximum compared - // to its 8-neighborhood and greater or equal minimum response - if (v > max_v && v >= (T)min_resp) { - const unsigned idx = *count; - *count += 1; - if (idx < max_corners) { - x_out[idx] = (float)x; - y_out[idx] = (float)y; - resp_out[idx] = (float)v; - } - } - } - } -} - 
-static void keep_corners(Array xOut, Array yOut, Array respOut, - const Array xIn, const Array yIn, - const Array respIn, const Array respIdx, - const unsigned n_corners) -{ - float* x_out = xOut.get(); - float* y_out = yOut.get(); - float* resp_out = respOut.get(); - const float* x_in = xIn.get(); - const float* y_in = yIn.get(); - const float* resp_in = respIn.get(); - const uint* resp_idx = respIdx.get(); - - // Keep only the first n_feat features - for (unsigned f = 0; f < n_corners; f++) { - x_out[f] = x_in[resp_idx[f]]; - y_out[f] = y_in[resp_idx[f]]; - resp_out[f] = resp_in[f]; - } -} - template unsigned harris(Array &x_out, Array &y_out, Array &resp_out, const Array &in, const unsigned max_corners, const float min_response, @@ -164,7 +43,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out for (unsigned i = 0; i < filter_len; i++) h_filter[i] = (T)1.f / (filter_len); } else { - gaussian1D(h_filter, (int)filter_len, sigma); + kernel::gaussian1D(h_filter, (int)filter_len, sigma); } Array filter = createDeviceDataArray(dim4(filter_len), (const void*)h_filter); @@ -181,7 +60,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out Array iyy = createEmptyArray(idims); // Compute second-order derivatives - getQueue().enqueue(second_order_deriv, ixx, ixy, iyy, in.elements(), ix, iy); + getQueue().enqueue(kernel::second_order_deriv, ixx, ixy, iyy, in.elements(), ix, iy); // Convolve second-order derivatives with proper window filter ixx = convolve2(ixx, filter, filter); @@ -192,7 +71,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out Array responses = createEmptyArray(dim4(in.elements())); - getQueue().enqueue(harris_responses, responses, idims[0], idims[1], + getQueue().enqueue(kernel::harris_responses, responses, idims[0], idims[1], ixx, ixy, iyy, k_thr, border_len); Array xCorners = createEmptyArray(dim4(corner_lim)); @@ -204,7 +83,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out // Performs non-maximal 
suppression getQueue().sync(); unsigned corners_found = 0; - non_maximal(xCorners, yCorners, respCorners, &corners_found, + kernel::non_maximal(xCorners, yCorners, respCorners, &corners_found, idims[0], idims[1], responses, min_r, border_len, corner_lim); const unsigned corners_out = (max_corners > 0) ? @@ -226,7 +105,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out resp_out = createEmptyArray(dim4(corners_out)); // Keep only the corners with higher Harris responses - getQueue().enqueue(keep_corners, x_out, y_out, resp_out, xCorners, yCorners, + getQueue().enqueue(kernel::keep_corners, x_out, y_out, resp_out, xCorners, yCorners, harris_sorted, harris_idx, corners_out); } else if (max_corners == 0 && corners_found < corner_lim) { x_out = createEmptyArray(dim4(corners_out)); diff --git a/src/backend/cpu/histogram.cpp b/src/backend/cpu/histogram.cpp index 8fb3e43544..7e20247231 100644 --- a/src/backend/cpu/histogram.cpp +++ b/src/backend/cpu/histogram.cpp @@ -14,6 +14,7 @@ #include #include #include +#include using af::dim4; @@ -21,7 +22,8 @@ namespace cpu { template -Array histogram(const Array &in, const unsigned &nbins, const double &minval, const double &maxval) +Array histogram(const Array &in, + const unsigned &nbins, const double &minval, const double &maxval) { in.eval(); @@ -30,32 +32,8 @@ Array histogram(const Array &in, const unsigned &nbins, const d Array out = createValueArray(outDims, outType(0)); out.eval(); - auto func = [=](Array out, const Array in, - const unsigned nbins, const double minval, const double maxval) { - const float step = (maxval - minval)/(float)nbins; - const dim4 inDims = in.dims(); - const dim4 iStrides = in.strides(); - const dim4 oStrides = out.strides(); - const dim_t nElems = inDims[0]*inDims[1]; - - outType *outData = out.get(); - const inType* inData= in.get(); - - for(dim_t b3 = 0; b3 < outDims[3]; b3++) { - for(dim_t b2 = 0; b2 < outDims[2]; b2++) { - for(dim_t i=0; i, + out, in, nbins, minval, maxval); 
return out; } diff --git a/src/backend/cpu/kernel/diff.hpp b/src/backend/cpu/kernel/diff.hpp new file mode 100644 index 0000000000..e0693b1349 --- /dev/null +++ b/src/backend/cpu/kernel/diff.hpp @@ -0,0 +1,91 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + +unsigned getIdx(af::dim4 strides, af::dim4 offs, int i, int j = 0, int k = 0, int l = 0) +{ + return (l * strides[3] + k * strides[2] + j * strides[1] + i); +} + + +template +void diff1(Array out, Array const in, int const dim) +{ + af::dim4 dims = out.dims(); + // Bool for dimension + bool is_dim0 = dim == 0; + bool is_dim1 = dim == 1; + bool is_dim2 = dim == 2; + bool is_dim3 = dim == 3; + + // Get pointers to raw data + const T *inPtr = in.get(); + T *outPtr = out.get(); + + // TODO: Improve this + for(dim_t l = 0; l < dims[3]; l++) { + for(dim_t k = 0; k < dims[2]; k++) { + for(dim_t j = 0; j < dims[1]; j++) { + for(dim_t i = 0; i < dims[0]; i++) { + // Operation: out[index] = in[index + 1 * dim_size] - in[index] + int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); + int jdx = getIdx(in.strides(), in.offsets(), + i + is_dim0, j + is_dim1, + k + is_dim2, l + is_dim3); + int odx = getIdx(out.strides(), out.offsets(), i, j, k, l); + outPtr[odx] = inPtr[jdx] - inPtr[idx]; + } + } + } + } +} + +template +void diff2(Array out, Array const in, int const dim) +{ + af::dim4 dims = out.dims(); + // Bool for dimension + bool is_dim0 = dim == 0; + bool is_dim1 = dim == 1; + bool is_dim2 = dim == 2; + bool is_dim3 = dim == 3; + + // Get pointers to raw data + const T *inPtr = in.get(); + T *outPtr = out.get(); + + // TODO: Improve this + for(dim_t l = 0; l < dims[3]; 
l++) { + for(dim_t k = 0; k < dims[2]; k++) { + for(dim_t j = 0; j < dims[1]; j++) { + for(dim_t i = 0; i < dims[0]; i++) { + // Operation: out[index] = in[index + 1 * dim_size] - in[index] + int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); + int jdx = getIdx(in.strides(), in.offsets(), + i + is_dim0, j + is_dim1, + k + is_dim2, l + is_dim3); + int kdx = getIdx(in.strides(), in.offsets(), + i + 2 * is_dim0, j + 2 * is_dim1, + k + 2 * is_dim2, l + 2 * is_dim3); + int odx = getIdx(out.strides(), out.offsets(), i, j, k, l); + outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx]; + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/fast.hpp b/src/backend/cpu/kernel/fast.hpp new file mode 100644 index 0000000000..a3971dd136 --- /dev/null +++ b/src/backend/cpu/kernel/fast.hpp @@ -0,0 +1,228 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + +using af::dim4; + +inline int clamp(int f, int a, int b) +{ + return std::max(a, std::min(f, b)); +} + +inline int idx_y(int i) +{ + if (i >= 8) + return clamp(-(i-8-4), -3, 3); + + return clamp(i-4, -3, 3); +} + +inline int idx_x(int i) +{ + if (i < 12) + return idx_y(i+4); + + return idx_y(i-12); +} + +inline int idx(int y, int x, unsigned idim0) +{ + return x * idim0 + y; +} + +// test_greater() +// Tests if a pixel x > p + thr +inline int test_greater(float x, float p, float thr) +{ + return (x >= p + thr); +} + +// test_smaller() +// Tests if a pixel x < p - thr +inline int test_smaller(float x, float p, float thr) +{ + return (x <= p - thr); +} + +// test_pixel() +// Returns -1 when x < p - thr +// Returns 0 when x >= p - thr && x <= p + thr +// Returns 1 when x > p + thr +template +inline int test_pixel(const T* image, const float p, float thr, int y, int x, unsigned idim0) +{ + return -test_smaller((float)image[idx(y,x,idim0)], p, thr) | test_greater((float)image[idx(y,x,idim0)], p, thr); +} + +// abs_diff() +// Returns absolute difference of x and y +inline int abs_diff(int x, int y) +{ + return abs(x - y); +} +inline unsigned abs_diff(unsigned x, unsigned y) +{ + return (unsigned)abs((int)x - (int)y); +} +inline float abs_diff(float x, float y) +{ + return fabs(x - y); +} +inline double abs_diff(double x, double y) +{ + return fabs(x - y); +} + +template +void locate_features(const Array &in, Array &score, + Array &x_out, Array &y_out, + Array &score_out, unsigned* count, const float thr, + const unsigned arc_length, const unsigned nonmax, + const unsigned max_feat, const unsigned edge) +{ + dim4 in_dims = in.dims(); + const T* in_ptr = in.get(); + + for (int y = edge; y < (int)(in_dims[0] - edge); y++) { + for (int x = edge; x < 
(int)(in_dims[1] - edge); x++) { + float p = in_ptr[idx(y, x, in_dims[0])]; + + // Start by testing opposite pixels of the circle that will result in + // a non-keypoint + int d; + d = test_pixel(in_ptr, p, thr, y-3, x, in_dims[0]) | test_pixel(in_ptr, p, thr, y+3, x, in_dims[0]); + if (d == 0) + continue; + + d &= test_pixel(in_ptr, p, thr, y-2, x+2, in_dims[0]) | test_pixel(in_ptr, p, thr, y+2, x-2, in_dims[0]); + d &= test_pixel(in_ptr, p, thr, y , x+3, in_dims[0]) | test_pixel(in_ptr, p, thr, y , x-3, in_dims[0]); + d &= test_pixel(in_ptr, p, thr, y+2, x+2, in_dims[0]) | test_pixel(in_ptr, p, thr, y-2, x-2, in_dims[0]); + if (d == 0) + continue; + + d &= test_pixel(in_ptr, p, thr, y-3, x+1, in_dims[0]) | test_pixel(in_ptr, p, thr, y+3, x-1, in_dims[0]); + d &= test_pixel(in_ptr, p, thr, y-1, x+3, in_dims[0]) | test_pixel(in_ptr, p, thr, y+1, x-3, in_dims[0]); + d &= test_pixel(in_ptr, p, thr, y+1, x+3, in_dims[0]) | test_pixel(in_ptr, p, thr, y-1, x-3, in_dims[0]); + d &= test_pixel(in_ptr, p, thr, y+3, x+1, in_dims[0]) | test_pixel(in_ptr, p, thr, y-3, x-1, in_dims[0]); + if (d == 0) + continue; + + int sum = 0; + + // Sum responses [-1, 0 or 1] of first arc_length pixels + for (int i = 0; i < static_cast(arc_length); i++) + sum += test_pixel(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]); + + // Test maximum and minimum responses of first segment of arc_length + // pixels + int max_sum = 0, min_sum = 0; + max_sum = std::max(max_sum, sum); + min_sum = std::min(min_sum, sum); + + // Sum responses and test the remaining 16-arc_length pixels of the circle + for (int i = arc_length; i < 16; i++) { + sum -= test_pixel(in_ptr, p, thr, y+idx_y(i-arc_length), x+idx_x(i-arc_length), in_dims[0]); + sum += test_pixel(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]); + max_sum = std::max(max_sum, sum); + min_sum = std::min(min_sum, sum); + } + + // To completely test all possible segments, it's necessary to test + // segments that include the top junction of the 
circle + for (int i = 0; i < static_cast(arc_length-1); i++) { + sum -= test_pixel(in_ptr, p, thr, y+idx_y(16-arc_length+i), x+idx_x(16-arc_length+i), in_dims[0]); + sum += test_pixel(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]); + max_sum = std::max(max_sum, sum); + min_sum = std::min(min_sum, sum); + } + + float s_bright = 0, s_dark = 0; + for (int i = 0; i < 16; i++) { + float p_x = (float)in_ptr[idx(y+idx_y(i), x+idx_x(i), in_dims[0])]; + + s_bright += test_greater(p_x, p, thr) * (abs_diff(p_x, p) - thr); + s_dark += test_smaller(p_x, p, thr) * (abs_diff(p, p_x) - thr); + } + + // If sum at some point was equal to (+-)arc_length, there is a segment + // for which all pixels are much brighter or much darker than + // central pixel p. + if (max_sum == static_cast(arc_length) || min_sum == -static_cast(arc_length)) { + unsigned j = *count; + ++*count; + if (j < max_feat) { + float *x_out_ptr = x_out.get(); + float *y_out_ptr = y_out.get(); + float *score_out_ptr = score_out.get(); + x_out_ptr[j] = static_cast(x); + y_out_ptr[j] = static_cast(y); + score_out_ptr[j] = static_cast(std::max(s_bright, s_dark)); + if (nonmax == 1) { + float* score_ptr = score.get(); + score_ptr[idx(y, x, in_dims[0])] = std::max(s_bright, s_dark); + } + } + } + } + } +} + +void non_maximal(const Array &score, const Array &x_in, const Array &y_in, + Array &x_out, Array &y_out, Array &score_out, + unsigned* count, const unsigned total_feat, const unsigned edge) +{ + const float *score_ptr = score.get(); + const float *x_in_ptr = x_in.get(); + const float *y_in_ptr = y_in.get(); + + dim4 score_dims = score.dims(); + + for (unsigned k = 0; k < total_feat; k++) { + unsigned x = static_cast(round(x_in_ptr[k])); + unsigned y = static_cast(round(y_in_ptr[k])); + + float v = score_ptr[y + score_dims[0] * x]; + float max_v; + max_v = std::max(score_ptr[y-1 + score_dims[0] * (x-1)], score_ptr[y-1 + score_dims[0] * x]); + max_v = std::max(max_v, score_ptr[y-1 + score_dims[0] * (x+1)]); 
+ max_v = std::max(max_v, score_ptr[y + score_dims[0] * (x-1)]); + max_v = std::max(max_v, score_ptr[y + score_dims[0] * (x+1)]); + max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x-1)]); + max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x) ]); + max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x+1)]); + + if (y >= score_dims[1] - edge - 1 || y <= edge + 1 || + x >= score_dims[0] - edge - 1 || x <= edge + 1) + continue; + + // Stores keypoint to {x,y,score}_out if its response is maximum compared to + // its 8-neighborhood + if (v > max_v) { + unsigned j = *count; + ++*count; + + float *x_out_ptr = x_out.get(); + float *y_out_ptr = y_out.get(); + float *score_out_ptr = score_out.get(); + + x_out_ptr[j] = static_cast(x); + y_out_ptr[j] = static_cast(y); + score_out_ptr[j] = static_cast(v); + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/gradient.hpp b/src/backend/cpu/kernel/gradient.hpp new file mode 100644 index 0000000000..c152fb343a --- /dev/null +++ b/src/backend/cpu/kernel/gradient.hpp @@ -0,0 +1,87 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + +template +void gradient(Array grad0, Array grad1, Array const in) +{ + const af::dim4 dims = in.dims(); + + T *d_grad0 = grad0.get(); + T *d_grad1 = grad1.get(); + const T *d_in = in.get(); + + const af::dim4 inst = in.strides(); + const af::dim4 g0st = grad0.strides(); + const af::dim4 g1st = grad1.strides(); + + T v5 = scalar(0.5); + T v1 = scalar(1.0); + + for(dim_t idw = 0; idw < dims[3]; idw++) { + const dim_t inW = idw * inst[3]; + const dim_t g0W = idw * g0st[3]; + const dim_t g1W = idw * g1st[3]; + for(dim_t idz = 0; idz < dims[2]; idz++) { + const dim_t inZW = inW + idz * inst[2]; + const dim_t g0ZW = g0W + idz * g0st[2]; + const dim_t g1ZW = g1W + idz * g1st[2]; + dim_t xl, xr, yl,yr; + T f0, f1; + for(dim_t idy = 0; idy < dims[1]; idy++) { + const dim_t inYZW = inZW + idy * inst[1]; + const dim_t g0YZW = g0ZW + idy * g0st[1]; + const dim_t g1YZW = g1ZW + idy * g1st[1]; + if(idy == 0) { + yl = inYZW + inst[1]; + yr = inYZW; + f1 = v1; + } else if(idy == dims[1] - 1) { + yl = inYZW; + yr = inYZW - inst[1]; + f1 = v1; + } else { + yl = inYZW + inst[1]; + yr = inYZW - inst[1]; + f1 = v5; + } + for(dim_t idx = 0; idx < dims[0]; idx++) { + const dim_t inMem = inYZW + idx; + const dim_t g0Mem = g0YZW + idx; + const dim_t g1Mem = g1YZW + idx; + if(idx == 0) { + xl = inMem + 1; + xr = inMem; + f0 = v1; + } else if(idx == dims[0] - 1) { + xl = inMem; + xr = inMem - 1; + f0 = v1; + } else { + xl = inMem + 1; + xr = inMem - 1; + f0 = v5; + } + + d_grad0[g0Mem] = f0 * (d_in[xl] - d_in[xr]); + d_grad1[g1Mem] = f1 * (d_in[yl + idx] - d_in[yr + idx]); + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/harris.hpp b/src/backend/cpu/kernel/harris.hpp new file mode 100644 index 0000000000..db6551bbde --- /dev/null +++ 
b/src/backend/cpu/kernel/harris.hpp @@ -0,0 +1,139 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + +template +void gaussian1D(T* out, const int dim, double sigma=0.0) +{ + if(!(sigma>0)) sigma = 0.25*dim; + + T sum = (T)0; + for(int i=0;i +void second_order_deriv(Array ixx, Array ixy, Array iyy, + const unsigned in_len, const Array ix, const Array iy) +{ + T* ixx_out = ixx.get(); + T* ixy_out = ixy.get(); + T* iyy_out = iyy.get(); + const T* ix_in = ix.get(); + const T* iy_in = iy.get(); + for (unsigned x = 0; x < in_len; x++) { + ixx_out[x] = ix_in[x] * ix_in[x]; + ixy_out[x] = ix_in[x] * iy_in[x]; + iyy_out[x] = iy_in[x] * iy_in[x]; + } +} + +template +void harris_responses(Array resp, const unsigned idim0, const unsigned idim1, + const Array ixx, const Array ixy, const Array iyy, + const float k_thr, const unsigned border_len) +{ + T* resp_out = resp.get(); + const T* ixx_in = ixx.get(); + const T* ixy_in = ixy.get(); + const T* iyy_in = iyy.get(); + const unsigned r = border_len; + + for (unsigned x = r; x < idim1 - r; x++) { + for (unsigned y = r; y < idim0 - r; y++) { + const unsigned idx = x * idim0 + y; + + // Calculates matrix trace and determinant + T tr = ixx_in[idx] + iyy_in[idx]; + T det = ixx_in[idx] * iyy_in[idx] - ixy_in[idx] * ixy_in[idx]; + + // Calculates local Harris response + resp_out[idx] = det - k_thr * (tr*tr); + } + } +} + +template +void non_maximal(Array xOut, Array yOut, Array respOut, unsigned* count, + const unsigned idim0, const unsigned idim1, const Array respIn, + const float min_resp, const unsigned border_len, const unsigned max_corners) +{ + float* x_out = xOut.get(); + float* y_out = 
yOut.get(); + float* resp_out = respOut.get(); + const T* resp_in = respIn.get(); + // Responses on the border don't have 8-neighbors to compare, discard them + const unsigned r = border_len + 1; + + for (unsigned x = r; x < idim1 - r; x++) { + for (unsigned y = r; y < idim0 - r; y++) { + const T v = resp_in[x * idim0 + y]; + + // Find maximum neighborhood response + T max_v; + max_v = max(resp_in[(x-1) * idim0 + y-1], resp_in[x * idim0 + y-1]); + max_v = max(max_v, resp_in[(x+1) * idim0 + y-1]); + max_v = max(max_v, resp_in[(x-1) * idim0 + y ]); + max_v = max(max_v, resp_in[(x+1) * idim0 + y ]); + max_v = max(max_v, resp_in[(x-1) * idim0 + y+1]); + max_v = max(max_v, resp_in[(x) * idim0 + y+1]); + max_v = max(max_v, resp_in[(x+1) * idim0 + y+1]); + + // Stores corner to {x,y,resp}_out if its response is maximum compared + // to its 8-neighborhood and greater than or equal to the minimum response + if (v > max_v && v >= (T)min_resp) { + const unsigned idx = *count; + *count += 1; + if (idx < max_corners) { + x_out[idx] = (float)x; + y_out[idx] = (float)y; + resp_out[idx] = (float)v; + } + } + } + } +} + +static void keep_corners(Array xOut, Array yOut, Array respOut, + const Array xIn, const Array yIn, + const Array respIn, const Array respIdx, + const unsigned n_corners) +{ + float* x_out = xOut.get(); + float* y_out = yOut.get(); + float* resp_out = respOut.get(); + const float* x_in = xIn.get(); + const float* y_in = yIn.get(); + const float* resp_in = respIn.get(); + const uint* resp_idx = respIdx.get(); + + // Keep only the first n_corners corners + for (unsigned f = 0; f < n_corners; f++) { + x_out[f] = x_in[resp_idx[f]]; + y_out[f] = y_in[resp_idx[f]]; + resp_out[f] = resp_in[f]; + } +} + +} +} diff --git a/src/backend/cpu/kernel/histogram.hpp b/src/backend/cpu/kernel/histogram.hpp new file mode 100644 index 0000000000..e26965aa04 --- /dev/null +++ b/src/backend/cpu/kernel/histogram.hpp @@ -0,0 +1,47 @@ +/******************************************************* + * 
Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + +template +void histogram(Array out, Array const in, + unsigned const nbins, double const minval, double const maxval) +{ + dim4 const outDims = out.dims(); + float const step = (maxval - minval)/(float)nbins; + dim4 const inDims = in.dims(); + dim4 const iStrides = in.strides(); + dim4 const oStrides = out.strides(); + dim_t const nElems = inDims[0]*inDims[1]; + + outType *outData = out.get(); + const inType* inData= in.get(); + + for(dim_t b3 = 0; b3 < outDims[3]; b3++) { + for(dim_t b2 = 0; b2 < outDims[2]; b2++) { + for(dim_t i=0; i Date: Sat, 19 Dec 2015 11:21:36 -0500 Subject: [PATCH 082/288] moved rgb_hsv & identity fns to kernel namespace --- src/backend/cpu/hsv_rgb.cpp | 104 +---------------------- src/backend/cpu/identity.cpp | 19 +---- src/backend/cpu/kernel/hsv_rgb.hpp | 124 ++++++++++++++++++++++++++++ src/backend/cpu/kernel/identity.hpp | 37 +++++++++ 4 files changed, 166 insertions(+), 118 deletions(-) create mode 100644 src/backend/cpu/kernel/hsv_rgb.hpp create mode 100644 src/backend/cpu/kernel/identity.hpp diff --git a/src/backend/cpu/hsv_rgb.cpp b/src/backend/cpu/hsv_rgb.cpp index d20416f3c9..c0f19db773 100644 --- a/src/backend/cpu/hsv_rgb.cpp +++ b/src/backend/cpu/hsv_rgb.cpp @@ -11,10 +11,9 @@ #include #include #include -#include -#include #include #include +#include using af::dim4; @@ -28,56 +27,7 @@ Array hsv2rgb(const Array& in) Array out = createEmptyArray(in.dims()); - auto func = [=](Array out, const Array in) { - const dim4 dims = in.dims(); - const dim4 strides = in.strides(); - dim_t obStride = out.strides()[3]; - dim_t coff = strides[2]; - dim_t bCount = dims[3]; - - for(dim_t b=0; b, out, in); return out; } 
@@ -89,55 +39,7 @@ Array rgb2hsv(const Array& in) Array out = createEmptyArray(in.dims()); - auto func = [=](Array out, const Array in) { - const dim4 dims = in.dims(); - const dim4 strides = in.strides(); - dim4 oStrides = out.strides(); - dim_t bCount = dims[3]; - - for(dim_t b=0; b, out, in); return out; } diff --git a/src/backend/cpu/identity.cpp b/src/backend/cpu/identity.cpp index 55c441755c..949fceda81 100644 --- a/src/backend/cpu/identity.cpp +++ b/src/backend/cpu/identity.cpp @@ -7,14 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include #include -#include #include #include -#include #include #include +#include namespace cpu { @@ -24,20 +22,7 @@ Array identity(const dim4& dims) { Array out = createEmptyArray(dims); - auto func = [=] (Array out) { - T *ptr = out.get(); - const dim_t *out_dims = out.dims().get(); - - for (dim_t k = 0; k < out_dims[2] * out_dims[3]; k++) { - for (dim_t j = 0; j < out_dims[1]; j++) { - for (dim_t i = 0; i < out_dims[0]; i++) { - ptr[j * out_dims[0] + i] = (i == j) ? scalar(1) : scalar(0); - } - } - ptr += out_dims[0] * out_dims[1]; - } - }; - getQueue().enqueue(func, out); + getQueue().enqueue(kernel::identity, out); return out; } diff --git a/src/backend/cpu/kernel/hsv_rgb.hpp b/src/backend/cpu/kernel/hsv_rgb.hpp new file mode 100644 index 0000000000..d8aa954df7 --- /dev/null +++ b/src/backend/cpu/kernel/hsv_rgb.hpp @@ -0,0 +1,124 @@ +/******************************************************* +* Copyright (c) 2015, ArrayFire +* All rights reserved. +* +* This file is distributed under 3-clause BSD license. 
+* The complete license agreement can be obtained at: +* http://arrayfire.com/licenses/BSD-3-Clause +********************************************************/ + +#include +#include + +namespace cpu +{ +namespace kernel +{ + +using af::dim4; + +template +void hsv2rgb(Array out, Array const in) +{ + const dim4 dims = in.dims(); + const dim4 strides = in.strides(); + dim_t obStride = out.strides()[3]; + dim_t coff = strides[2]; + dim_t bCount = dims[3]; + + for(dim_t b=0; b +void rgb2hsv(Array out, Array const in) +{ + const dim4 dims = in.dims(); + const dim4 strides = in.strides(); + dim4 oStrides = out.strides(); + dim_t bCount = dims[3]; + + for(dim_t b=0; b +#include + +namespace cpu +{ +namespace kernel +{ + +using af::dim4; + +template +void identity(Array out) +{ + T *ptr = out.get(); + const dim4 out_dims = out.dims(); + + for (dim_t k = 0; k < out_dims[2] * out_dims[3]; k++) { + for (dim_t j = 0; j < out_dims[1]; j++) { + for (dim_t i = 0; i < out_dims[0]; i++) { + ptr[j * out_dims[0] + i] = (i == j) ? scalar(1) : scalar(0); + } + } + ptr += out_dims[0] * out_dims[1]; + } +} + +} +} From 696657cb3cde7d660b0558269c85fa24cdda2f6d Mon Sep 17 00:00:00 2001 From: pradeep Date: Sat, 19 Dec 2015 11:47:51 -0500 Subject: [PATCH 083/288] moved indexing & assignment fns to kernel namespace Also moved the common utility function trimIndex to a common location. 
--- src/backend/cpu/iir.cpp | 44 +------------------ src/backend/cpu/index.cpp | 69 ++---------------------------- src/backend/cpu/kernel/assign.hpp | 21 ++------- src/backend/cpu/kernel/iir.hpp | 61 ++++++++++++++++++++++++++ src/backend/cpu/kernel/index.hpp | 71 +++++++++++++++++++++++++++++++ src/backend/cpu/kernel/lookup.hpp | 62 +++++++++++++++++++++++++++ src/backend/cpu/lookup.cpp | 54 +---------------------- src/backend/cpu/utility.hpp | 30 +++++++++++++ 8 files changed, 236 insertions(+), 176 deletions(-) create mode 100644 src/backend/cpu/kernel/iir.hpp create mode 100644 src/backend/cpu/kernel/index.hpp create mode 100644 src/backend/cpu/kernel/lookup.hpp create mode 100644 src/backend/cpu/utility.hpp diff --git a/src/backend/cpu/iir.cpp b/src/backend/cpu/iir.cpp index 3c06275f5a..225f39b859 100644 --- a/src/backend/cpu/iir.cpp +++ b/src/backend/cpu/iir.cpp @@ -12,12 +12,10 @@ #include #include #include -#include -#include -#include #include #include #include +#include using af::dim4; @@ -44,45 +42,7 @@ Array iir(const Array &b, const Array &a, const Array &x) Array y = createEmptyArray(c.dims()); - auto func = [=] (Array y, Array c, const Array a) { - dim4 ydims = c.dims(); - int num_a = a.dims()[0]; - - for (int l = 0; l < (int)ydims[3]; l++) { - dim_t yidx3 = l * y.strides()[3]; - dim_t cidx3 = l * c.strides()[3]; - dim_t aidx3 = l * a.strides()[3]; - - for (int k = 0; k < (int)ydims[2]; k++) { - - dim_t yidx2 = k * y.strides()[2] + yidx3; - dim_t cidx2 = k * c.strides()[2] + cidx3; - dim_t aidx2 = k * a.strides()[2] + aidx3; - - for (int j = 0; j < (int)ydims[1]; j++) { - - dim_t yidx1 = j * y.strides()[1] + yidx2; - dim_t cidx1 = j * c.strides()[1] + cidx2; - dim_t aidx1 = j * a.strides()[1] + aidx2; - - std::vector h_z(num_a); - - const T *h_a = a.get() + (a.ndims() > 1 ? 
aidx1 : 0); - T *h_c = c.get() + cidx1; - T *h_y = y.get() + yidx1; - - for (int i = 0; i < (int)ydims[0]; i++) { - - T y = h_y[i] = (h_c[i] + h_z[0]) / h_a[0]; - for (int ii = 1; ii < num_a; ii++) { - h_z[ii - 1] = h_z[ii] - h_a[ii] * y; - } - } - } - } - } - }; - getQueue().enqueue(func, y, c, a); + getQueue().enqueue(kernel::iir, y, c, a); return y; } diff --git a/src/backend/cpu/index.cpp b/src/backend/cpu/index.cpp index 68c2f16a23..bd569de44a 100644 --- a/src/backend/cpu/index.cpp +++ b/src/backend/cpu/index.cpp @@ -13,11 +13,11 @@ #include #include #include -#include #include #include #include #include +#include using std::vector; using af::dim4; @@ -25,19 +25,6 @@ using af::dim4; namespace cpu { -static inline -dim_t trimIndex(dim_t idx, const dim_t &len) -{ - dim_t ret_val = idx; - dim_t offset = abs(ret_val)%len; - if (ret_val<0) { - ret_val = offset-1; - } else if (ret_val>=len) { - ret_val = len-offset-1; - } - return ret_val; -} - template Array index(const Array& in, const af_index_t idxrs[]) { @@ -47,7 +34,7 @@ Array index(const Array& in, const af_index_t idxrs[]) vector seqs(4, af_span); // create seq vector to retrieve output // dimensions, offsets & offsets - for (dim_t x=0; x index(const Array& in, const af_index_t idxrs[]) vector< Array > idxArrs(4, createEmptyArray(dim4())); // look through indexs to read af_array indexs - for (dim_t x=0; x(idxrs[x].idx.arr); idxArrs[x].eval(); @@ -70,56 +57,8 @@ Array index(const Array& in, const af_index_t idxrs[]) Array out = createEmptyArray(oDims); - auto func = [=] (Array out, const Array in, - const vector isSeq, - const vector seqs, - const vector< Array > idxArrs) { - - const dim4 iDims = in.dims(); - const dim4 dDims = in.getDataDims(); - const dim4 iOffs = toOffset(seqs, dDims); - const dim4 iStrds = toStride(seqs, dDims); - const dim4 oDims = out.dims(); - const dim4 oStrides = out.strides(); - const T *src = in.get(); - T *dst = out.get(); - const uint* ptr0 = idxArrs[0].get(); - const uint* ptr1 
= idxArrs[1].get(); - const uint* ptr2 = idxArrs[2].get(); - const uint* ptr3 = idxArrs[3].get(); - - for (dim_t l=0; l, out, in, std::move(isSeq), std::move(seqs), std::move(idxArrs)); return out; } diff --git a/src/backend/cpu/kernel/assign.hpp b/src/backend/cpu/kernel/assign.hpp index 2621ba741f..83f48e9f75 100644 --- a/src/backend/cpu/kernel/assign.hpp +++ b/src/backend/cpu/kernel/assign.hpp @@ -7,8 +7,9 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include #include +#include +#include namespace cpu { @@ -16,24 +17,10 @@ namespace kernel { using af::dim4; -using std::vector; - -inline -dim_t trimIndex(int idx, const dim_t &len) -{ - int ret_val = idx; - int offset = abs(ret_val)%len; - if (ret_val<0) { - ret_val = offset-1; - } else if (ret_val>=(int)len) { - ret_val = len-offset-1; - } - return ret_val; -} template -void assign(Array out, const Array rhs, const vector isSeq, - const vector seqs, const vector< Array > idxArrs) +void assign(Array out, const Array rhs, const std::vector isSeq, + const std::vector seqs, const std::vector< Array > idxArrs) { dim4 dDims = out.getDataDims(); dim4 pDims = out.dims(); diff --git a/src/backend/cpu/kernel/iir.hpp b/src/backend/cpu/kernel/iir.hpp new file mode 100644 index 0000000000..d1ca464365 --- /dev/null +++ b/src/backend/cpu/kernel/iir.hpp @@ -0,0 +1,61 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + +using af::dim4; + +template +void iir(Array y, Array c, Array const a) +{ + dim4 ydims = c.dims(); + int num_a = a.dims()[0]; + + for (int l = 0; l < (int)ydims[3]; l++) { + dim_t yidx3 = l * y.strides()[3]; + dim_t cidx3 = l * c.strides()[3]; + dim_t aidx3 = l * a.strides()[3]; + + for (int k = 0; k < (int)ydims[2]; k++) { + + dim_t yidx2 = k * y.strides()[2] + yidx3; + dim_t cidx2 = k * c.strides()[2] + cidx3; + dim_t aidx2 = k * a.strides()[2] + aidx3; + + for (int j = 0; j < (int)ydims[1]; j++) { + + dim_t yidx1 = j * y.strides()[1] + yidx2; + dim_t cidx1 = j * c.strides()[1] + cidx2; + dim_t aidx1 = j * a.strides()[1] + aidx2; + + std::vector h_z(num_a); + + const T *h_a = a.get() + (a.ndims() > 1 ? aidx1 : 0); + T *h_c = c.get() + cidx1; + T *h_y = y.get() + yidx1; + + for (int i = 0; i < (int)ydims[0]; i++) { + + T y = h_y[i] = (h_c[i] + h_z[0]) / h_a[0]; + for (int ii = 1; ii < num_a; ii++) { + h_z[ii - 1] = h_z[ii] - h_a[ii] * y; + } + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/index.hpp b/src/backend/cpu/kernel/index.hpp new file mode 100644 index 0000000000..ee20c24d44 --- /dev/null +++ b/src/backend/cpu/kernel/index.hpp @@ -0,0 +1,71 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +using af::dim4; + +template +void index(Array out, Array const in, + std::vector const isSeq, std::vector const seqs, + std::vector< Array > const idxArrs) +{ + const dim4 iDims = in.dims(); + const dim4 dDims = in.getDataDims(); + const dim4 iOffs = toOffset(seqs, dDims); + const dim4 iStrds = toStride(seqs, dDims); + const dim4 oDims = out.dims(); + const dim4 oStrides = out.strides(); + const T *src = in.get(); + T *dst = out.get(); + const uint* ptr0 = idxArrs[0].get(); + const uint* ptr1 = idxArrs[1].get(); + const uint* ptr2 = idxArrs[2].get(); + const uint* ptr3 = idxArrs[3].get(); + + for (dim_t l=0; l +#include +#include + +namespace cpu +{ +namespace kernel +{ + +using af::dim4; + +template +void lookup(Array out, Array const input, + Array const indices, unsigned const dim) +{ + const dim4 iDims = input.dims(); + const dim4 oDims = out.dims(); + const dim4 iStrides = input.strides(); + const dim4 oStrides = out.strides(); + const in_t *inPtr = input.get(); + const idx_t *idxPtr = indices.get(); + + in_t *outPtr = out.get(); + + for (dim_t l=0; l -#include #include #include #include +#include namespace cpu { -static inline -dim_t trimIndex(int idx, const dim_t &len) -{ - int ret_val = idx; - int offset = abs(ret_val)%len; - if (ret_val<0) { - ret_val = offset-1; - } else if (ret_val>=len) { - ret_val = len-offset-1; - } - return ret_val; -} - template Array lookup(const Array &input, const Array &indices, const unsigned dim) { @@ -43,44 +30,7 @@ Array lookup(const Array &input, const Array &indices, const Array out = createEmptyArray(oDims); - auto func = [=] (Array out, const Array input, - const Array indices, const unsigned dim) { - const dim4 iDims = input.dims(); - const dim4 oDims = out.dims(); - const dim4 iStrides 
= input.strides(); - const dim4 oStrides = out.strides(); - const in_t *inPtr = input.get(); - const idx_t *idxPtr = indices.get(); - - in_t *outPtr = out.get(); - - for (dim_t l=0; l, out, input, indices, dim); return out; } diff --git a/src/backend/cpu/utility.hpp b/src/backend/cpu/utility.hpp new file mode 100644 index 0000000000..18a38f3149 --- /dev/null +++ b/src/backend/cpu/utility.hpp @@ -0,0 +1,30 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include "backend.hpp" + +namespace cpu +{ + +static inline +dim_t trimIndex(const int &idx, const dim_t &len) +{ + int ret_val = idx; + int offset = abs(ret_val)%len; + if (ret_val<0) { + ret_val = offset-1; + } else if (ret_val>=(int)len) { + ret_val = len-offset-1; + } + return ret_val; +} + +} From edda52acad309e5ff672f2eb62289a1d311e367f Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Sat, 19 Dec 2015 13:07:50 -0500 Subject: [PATCH 084/288] Update README.md with updated status badges --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 695adbed03..f43b9fd098 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,8 @@ ArrayFire binary installers can be downloaded at the [ArrayFire Downloads](http: ### Build Status | | Linux x86 | Linux armv7l | Linux aarch64 | Windows | OSX | |:-------:|:---------:|:------------:|:-------------:|:-------:|:---:| -| Build | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux/devel)](http://ci.arrayfire.org/job/arrayfire-linux/branch/devel/) | [![Build 
Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows/devel)](http://ci.arrayfire.org/job/arrayfire-windows/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx/devel)](http://ci.arrayfire.org/job/arrayfire-osx/branch/devel/) | -| Test | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux-test/devel)](http://ci.arrayfire.org/job/arrayfire-linux-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1-test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1-test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows-test/devel)](http://ci.arrayfire.org/job/arrayfire-windows-test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx-test/devel)](http://ci.arrayfire.org/job/arrayfire-osx-test/branch/devel/) | +| Build | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux/build/devel)](http://ci.arrayfire.org/job/arrayfire-linux/job/build/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1/build/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1/job/build/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1/build/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1/job/build/branch/devel/) | [![Build 
Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows/build/devel)](http://ci.arrayfire.org/job/arrayfire-windows/job/build/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx/build/devel)](http://ci.arrayfire.org/job/arrayfire-osx/job/build/branch/devel/) | +| Test | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-linux/test/devel)](http://ci.arrayfire.org/job/arrayfire-linux/job/test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrak1/test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrak1/job/test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-tegrax1/test/devel)](http://ci.arrayfire.org/job/arrayfire-tegrax1/job/test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-windows/test/devel)](http://ci.arrayfire.org/job/arrayfire-windows/job/test/branch/devel/) | [![Build Status](http://ci.arrayfire.org/buildStatus/icon?job=arrayfire-osx/test/devel)](http://ci.arrayfire.org/job/arrayfire-osx/job/test/branch/devel/) | Test coverage: [![Coverage Status](https://coveralls.io/repos/arrayfire/arrayfire/badge.svg?branch=HEAD)](https://coveralls.io/r/arrayfire/arrayfire?branch=HEAD) From f2b84dd3ac65aea385bd0ac1a69aa0255e9b7169 Mon Sep 17 00:00:00 2001 From: pradeep Date: Sat, 19 Dec 2015 13:19:32 -0500 Subject: [PATCH 085/288] template parameters style fixes in cpu kernel namespace fns --- src/backend/cpu/bilateral.cpp | 2 +- src/backend/cpu/copy.cpp | 4 +- src/backend/cpu/harris.cpp | 2 +- src/backend/cpu/histogram.cpp | 2 +- src/backend/cpu/kernel/Array.hpp | 8 +- src/backend/cpu/kernel/approx1.hpp | 110 +++++++++--------- src/backend/cpu/kernel/approx2.hpp | 130 ++++++++++----------- src/backend/cpu/kernel/assign.hpp | 28 ++--- src/backend/cpu/kernel/bilateral.hpp | 55 ++++----- src/backend/cpu/kernel/convolve.hpp | 154 ++++++++++++------------- 
src/backend/cpu/kernel/copy.hpp | 28 ++--- src/backend/cpu/kernel/diagonal.hpp | 14 +-- src/backend/cpu/kernel/diff.hpp | 33 +++--- src/backend/cpu/kernel/fast.hpp | 38 +++--- src/backend/cpu/kernel/fftconvolve.hpp | 16 +-- src/backend/cpu/kernel/gradient.hpp | 2 + src/backend/cpu/kernel/harris.hpp | 21 +--- src/backend/cpu/kernel/histogram.hpp | 12 +- src/backend/cpu/kernel/hsv_rgb.hpp | 14 +-- src/backend/cpu/kernel/identity.hpp | 6 +- src/backend/cpu/kernel/iir.hpp | 4 +- src/backend/cpu/kernel/index.hpp | 16 +-- src/backend/cpu/kernel/lookup.hpp | 26 ++--- src/backend/cpu/utility.hpp | 35 +++++- 24 files changed, 383 insertions(+), 377 deletions(-) diff --git a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/bilateral.cpp index c751f992d9..bc3ad6e14b 100644 --- a/src/backend/cpu/bilateral.cpp +++ b/src/backend/cpu/bilateral.cpp @@ -29,7 +29,7 @@ Array bilateral(const Array &in, const float &s_sigma, const fl in.eval(); const dim4 dims = in.dims(); Array out = createEmptyArray(dims); - getQueue().enqueue(kernel::bilateral, out, in, s_sigma, c_sigma); + getQueue().enqueue(kernel::bilateral, out, in, s_sigma, c_sigma); return out; } diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index 84cb0d1a54..9f6068dd65 100644 --- a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp @@ -63,7 +63,7 @@ Array padArray(Array const &in, dim4 const &dims, in.eval(); // FIXME: getQueue().sync(); - getQueue().enqueue(kernel::copy, ret, in, outType(default_value), factor); + getQueue().enqueue(kernel::copy, ret, in, outType(default_value), factor); return ret; } @@ -72,7 +72,7 @@ void copyArray(Array &out, Array const &in) { out.eval(); in.eval(); - getQueue().enqueue(kernel::copy, out, in, scalar(0), 1.0); + getQueue().enqueue(kernel::copy, out, in, scalar(0), 1.0); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/harris.cpp b/src/backend/cpu/harris.cpp index e5ff906dd6..905b0467c7 100644 --- a/src/backend/cpu/harris.cpp +++ b/src/backend/cpu/harris.cpp 
@@ -43,7 +43,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out for (unsigned i = 0; i < filter_len; i++) h_filter[i] = (T)1.f / (filter_len); } else { - kernel::gaussian1D(h_filter, (int)filter_len, sigma); + gaussian1D(h_filter, (int)filter_len, sigma); } Array filter = createDeviceDataArray(dim4(filter_len), (const void*)h_filter); diff --git a/src/backend/cpu/histogram.cpp b/src/backend/cpu/histogram.cpp index 7e20247231..19314e052a 100644 --- a/src/backend/cpu/histogram.cpp +++ b/src/backend/cpu/histogram.cpp @@ -32,7 +32,7 @@ Array histogram(const Array &in, Array out = createValueArray(outDims, outType(0)); out.eval(); - getQueue().enqueue(kernel::histogram, + getQueue().enqueue(kernel::histogram, out, in, nbins, minval, maxval); return out; diff --git a/src/backend/cpu/kernel/Array.hpp b/src/backend/cpu/kernel/Array.hpp index e492b92ff0..08ade502e5 100644 --- a/src/backend/cpu/kernel/Array.hpp +++ b/src/backend/cpu/kernel/Array.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include #include @@ -15,16 +17,14 @@ namespace cpu namespace kernel { -using af::dim4; - template void evalArray(Array in) { in.setId(cpu::getActiveDeviceId()); T *ptr = in.data.get(); - dim4 odims = in.dims(); - dim4 ostrs = in.strides(); + af::dim4 odims = in.dims(); + af::dim4 ostrs = in.strides(); bool is_linear = in.node->isLinear(odims.get()); diff --git a/src/backend/cpu/kernel/approx1.hpp b/src/backend/cpu/kernel/approx1.hpp index 51c48048c1..ab12ebc813 100644 --- a/src/backend/cpu/kernel/approx1.hpp +++ b/src/backend/cpu/kernel/approx1.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include #include @@ -15,115 +17,115 @@ namespace cpu namespace kernel { -using af::dim4; - -template +template struct approx1_op { - void operator()(Ty *out, const af::dim4 &odims, const 
dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) + void operator()(InT *out, af::dim4 const & odims, dim_t const oElems, + InT const * const in, af::dim4 const & idims, dim_t const iElems, + LocT const * const pos, af::dim4 const & pdims, + af::dim4 const & ostrides, af::dim4 const & istrides, af::dim4 const & pstrides, + float const offGrid, bool const pBatch, + dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw) { return; } }; -template -struct approx1_op +template +struct approx1_op { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) + void operator()(InT *out, af::dim4 const & odims, dim_t const oElems, + InT const * const in, af::dim4 const & idims, dim_t const iElems, + LocT const * const pos, af::dim4 const & pdims, + af::dim4 const & ostrides, af::dim4 const & istrides, af::dim4 const & pstrides, + float const offGrid, bool const pBatch, + dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw) { dim_t pmId = idx; if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1]; - const Tp x = pos[pmId]; + LocT const x = pos[pmId]; bool gFlag = false; if (x < 0 || idims[0] < x+1) { // No need to check y gFlag = true; } - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] + dim_t const omId = idw * ostrides[3] + idz * ostrides[2] + idy * ostrides[1] + idx; if(gFlag) { - out[omId] = scalar(offGrid); + out[omId] = scalar(offGrid); } else { dim_t ioff = idw * 
istrides[3] + idz * istrides[2] + idy * istrides[1]; - const dim_t iMem = round(x) + ioff; + dim_t const iMem = round(x) + ioff; out[omId] = in[iMem]; } } }; -template -struct approx1_op +template +struct approx1_op { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, - const af::dim4 &ostrides, const af::dim4 &istrides, const af::dim4 &pstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) + void operator()(InT *out, af::dim4 const & odims, dim_t const oElems, + InT const * const in, af::dim4 const & idims, dim_t const iElems, + LocT const * const pos, af::dim4 const & pdims, + af::dim4 const & ostrides, af::dim4 const & istrides, af::dim4 const & pstrides, + float const offGrid, bool const pBatch, + dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw) { dim_t pmId = idx; if(pBatch) pmId += idw * pstrides[3] + idz * pstrides[2] + idy * pstrides[1]; - const Tp x = pos[pmId]; + LocT const x = pos[pmId]; bool gFlag = false; if (x < 0 || idims[0] < x+1) { gFlag = true; } - const dim_t grid_x = floor(x); // nearest grid - const Tp off_x = x - grid_x; // fractional offset + dim_t const grid_x = floor(x); // nearest grid + LocT const off_x = x - grid_x; // fractional offset - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] + dim_t const omId = idw * ostrides[3] + idz * ostrides[2] + idy * ostrides[1] + idx; if(gFlag) { - out[omId] = scalar(offGrid); + out[omId] = scalar(offGrid); } else { dim_t ioff = idw * istrides[3] + idz * istrides[2] + idy * istrides[1] + grid_x; // Check if x and x + 1 are both valid indices bool cond = (x < idims[0] - 1); // Compute Left and Right Weighted Values - Ty yl = ((Tp)1.0 - off_x) * in[ioff]; - Ty yr = cond ? (off_x) * in[ioff + 1] : scalar(0); - Ty yo = yl + yr; + InT yl = ((LocT)1.0 - off_x) * in[ioff]; + InT yr = cond ? 
(off_x) * in[ioff + 1] : scalar(0); + InT yo = yl + yr; // Compute Weight used - Tp wt = cond ? (Tp)1.0 : (Tp)(1.0 - off_x); + LocT wt = cond ? (LocT)1.0 : (LocT)(1.0 - off_x); // Write final value out[omId] = (yo / wt); } } }; -template -void approx1(Array output, Array const input, - Array const position, float const offGrid) +template +void approx1(Array output, Array const input, + Array const position, float const offGrid) { - Ty * out = output.get(); - Ty const * const in = input.get(); - Tp const * const pos = position.get(); - dim4 const odims = output.dims(); - dim4 const idims = input.dims(); - dim4 const pdims = position.dims(); - dim4 const ostrides = output.strides(); - dim4 const istrides = input.strides(); - dim4 const pstrides = position.strides(); - dim_t const oElems = output.elements(); - dim_t const iElems = input.elements(); - - approx1_op op; + InT * out = output.get(); + InT const * const in = input.get(); + LocT const * const pos = position.get(); + + af::dim4 const odims = output.dims(); + af::dim4 const idims = input.dims(); + af::dim4 const pdims = position.dims(); + af::dim4 const ostrides = output.strides(); + af::dim4 const istrides = input.strides(); + af::dim4 const pstrides = position.strides(); + + dim_t const oElems = output.elements(); + dim_t const iElems = input.elements(); + + approx1_op op; bool pBatch = !(pdims[1] == 1 && pdims[2] == 1 && pdims[3] == 1); for(dim_t w = 0; w < odims[3]; w++) { diff --git a/src/backend/cpu/kernel/approx2.hpp b/src/backend/cpu/kernel/approx2.hpp index f80dae17bb..b5115e2e49 100644 --- a/src/backend/cpu/kernel/approx2.hpp +++ b/src/backend/cpu/kernel/approx2.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include #include @@ -15,33 +17,31 @@ namespace cpu namespace kernel { -using af::dim4; - -template +template struct approx2_op { - void operator()(Ty *out, const af::dim4 &odims, const dim_t 
oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 &qstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) + void operator()(InT *out, af::dim4 const & odims, dim_t const oElems, + InT const * const in, af::dim4 const & idims, dim_t const iElems, + LocT const * const pos, af::dim4 const & pdims, LocT const * const qos, af::dim4 const & qdims, + af::dim4 const & ostrides, af::dim4 const & istrides, + af::dim4 const & pstrides, af::dim4 const & qstrides, + float const offGrid, bool const pBatch, + dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw) { return; } }; -template -struct approx2_op +template +struct approx2_op { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 &qstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) + void operator()(InT *out, af::dim4 const & odims, dim_t const oElems, + InT const * const in, af::dim4 const & idims, dim_t const iElems, + LocT const * const pos, af::dim4 const & pdims, LocT const * const qos, af::dim4 const & qdims, + af::dim4 const & ostrides, af::dim4 const & istrides, + af::dim4 const & pstrides, af::dim4 const & qstrides, + float const offGrid, bool const pBatch, + dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw) { dim_t pmId = idy * pstrides[1] + idx; dim_t qmId = idy * qstrides[1] + idx; @@ -51,34 +51,34 @@ struct approx2_op } bool gFlag = false; - const Tp x = pos[pmId], y = qos[qmId]; + LocT const x = pos[pmId], y 
= qos[qmId]; if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { gFlag = true; } - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] + dim_t const omId = idw * ostrides[3] + idz * ostrides[2] + idy * ostrides[1] + idx; if(gFlag) { - out[omId] = scalar(offGrid); + out[omId] = scalar(offGrid); } else { - const dim_t grid_x = round(x), grid_y = round(y); // nearest grid - const dim_t imId = idw * istrides[3] + idz * istrides[2] + + dim_t const grid_x = round(x), grid_y = round(y); // nearest grid + dim_t const imId = idw * istrides[3] + idz * istrides[2] + grid_y * istrides[1] + grid_x; out[omId] = in[imId]; } } }; -template -struct approx2_op +template +struct approx2_op { - void operator()(Ty *out, const af::dim4 &odims, const dim_t oElems, - const Ty *in, const af::dim4 &idims, const dim_t iElems, - const Tp *pos, const af::dim4 &pdims, const Tp *qos, const af::dim4 &qdims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &pstrides, const af::dim4 &qstrides, - const float offGrid, const bool pBatch, - const dim_t idx, const dim_t idy, const dim_t idz, const dim_t idw) + void operator()(InT *out, af::dim4 const & odims, dim_t const oElems, + InT const * const in, af::dim4 const & idims, dim_t const iElems, + LocT const * const pos, af::dim4 const & pdims, LocT const * const qos, af::dim4 const & qdims, + af::dim4 const & ostrides, af::dim4 const & istrides, + af::dim4 const & pstrides, af::dim4 const & qstrides, + float const offGrid, bool const pBatch, + dim_t const idx, dim_t const idy, dim_t const idz, dim_t const idw) { dim_t pmId = idy * pstrides[1] + idx; dim_t qmId = idy * qstrides[1] + idx; @@ -88,42 +88,42 @@ struct approx2_op } bool gFlag = false; - const Tp x = pos[pmId], y = qos[qmId]; + LocT const x = pos[pmId], y = qos[qmId]; if (x < 0 || y < 0 || idims[0] < x+1 || idims[1] < y+1) { gFlag = true; } - const dim_t grid_x = floor(x), grid_y = floor(y); // nearest grid - const Tp off_x = x - grid_x, off_y = y - grid_y; // 
fractional offset + dim_t const grid_x = floor(x), grid_y = floor(y); // nearest grid + LocT const off_x = x - grid_x, off_y = y - grid_y; // fractional offset // Check if pVal and pVal + 1 are both valid indices bool condY = (y < idims[1] - 1); bool condX = (x < idims[0] - 1); // Compute wieghts used - Tp wt00 = ((Tp)1.0 - off_x) * ((Tp)1.0 - off_y); - Tp wt10 = (condY) ? ((Tp)1.0 - off_x) * (off_y) : 0; - Tp wt01 = (condX) ? (off_x) * ((Tp)1.0 - off_y) : 0; - Tp wt11 = (condX && condY) ? (off_x) * (off_y) : 0; + LocT wt00 = ((LocT)1.0 - off_x) * ((LocT)1.0 - off_y); + LocT wt10 = (condY) ? ((LocT)1.0 - off_x) * (off_y) : 0; + LocT wt01 = (condX) ? (off_x) * ((LocT)1.0 - off_y) : 0; + LocT wt11 = (condX && condY) ? (off_x) * (off_y) : 0; - Tp wt = wt00 + wt10 + wt01 + wt11; - Ty zero = scalar(0); + LocT wt = wt00 + wt10 + wt01 + wt11; + InT zero = scalar(0); - const dim_t omId = idw * ostrides[3] + idz * ostrides[2] + dim_t const omId = idw * ostrides[3] + idz * ostrides[2] + idy * ostrides[1] + idx; if(gFlag) { - out[omId] = scalar(offGrid); + out[omId] = scalar(offGrid); } else { dim_t ioff = idw * istrides[3] + idz * istrides[2] + grid_y * istrides[1] + grid_x; // Compute Weighted Values - Ty y00 = wt00 * in[ioff]; - Ty y10 = (condY) ? wt10 * in[ioff + istrides[1]] : zero; - Ty y01 = (condX) ? wt01 * in[ioff + 1] : zero; - Ty y11 = (condX && condY) ? wt11 * in[ioff + istrides[1] + 1] : zero; + InT y00 = wt00 * in[ioff]; + InT y10 = (condY) ? wt10 * in[ioff + istrides[1]] : zero; + InT y01 = (condX) ? wt01 * in[ioff + 1] : zero; + InT y11 = (condX && condY) ? 
wt11 * in[ioff + istrides[1] + 1] : zero; - Ty yo = y00 + y10 + y01 + y11; + InT yo = y00 + y10 + y01 + y11; // Write Final Value out[omId] = (yo / wt); @@ -131,27 +131,27 @@ struct approx2_op } }; -template -void approx2(Array output, Array const input, - Array const position, Array const qosition, +template +void approx2(Array output, Array const input, + Array const position, Array const qosition, float const offGrid) { - Ty * out = output.get(); - Ty const * const in = input.get(); - Tp const * const pos = position.get(); - Tp const * const qos = qosition.get(); - dim4 const odims = output.dims(); - dim4 const idims = input.dims(); - dim4 const pdims = position.dims(); - dim4 const qdims = qosition.dims(); - dim4 const ostrides = output.strides(); - dim4 const istrides = input.strides(); - dim4 const pstrides = position.strides(); - dim4 const qstrides = qosition.strides(); + InT * out = output.get(); + InT const * const in = input.get(); + LocT const * const pos = position.get(); + LocT const * const qos = qosition.get(); + af::dim4 const odims = output.dims(); + af::dim4 const idims = input.dims(); + af::dim4 const pdims = position.dims(); + af::dim4 const qdims = qosition.dims(); + af::dim4 const ostrides = output.strides(); + af::dim4 const istrides = input.strides(); + af::dim4 const pstrides = position.strides(); + af::dim4 const qstrides = qosition.strides(); dim_t const oElems = output.elements(); dim_t const iElems = input.elements(); - approx2_op op; + approx2_op op; bool pBatch = !(pdims[2] == 1 && pdims[3] == 1); for(dim_t w = 0; w < odims[3]; w++) { diff --git a/src/backend/cpu/kernel/assign.hpp b/src/backend/cpu/kernel/assign.hpp index 83f48e9f75..86befaf74e 100644 --- a/src/backend/cpu/kernel/assign.hpp +++ b/src/backend/cpu/kernel/assign.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include #include #include @@ -16,25 +18,23 @@ namespace cpu 
namespace kernel { -using af::dim4; - template -void assign(Array out, const Array rhs, const std::vector isSeq, - const std::vector seqs, const std::vector< Array > idxArrs) +void assign(Array out, Array const rhs, std::vector const isSeq, + std::vector const seqs, std::vector< Array > const idxArrs) { - dim4 dDims = out.getDataDims(); - dim4 pDims = out.dims(); + af::dim4 dDims = out.getDataDims(); + af::dim4 pDims = out.dims(); // retrieve dimensions & strides for array to which rhs is being copied to - dim4 dst_offsets = toOffset(seqs, dDims); - dim4 dst_strides = toStride(seqs, dDims); + af::dim4 dst_offsets = toOffset(seqs, dDims); + af::dim4 dst_strides = toStride(seqs, dDims); // retrieve rhs array dimenesions & strides - dim4 src_dims = rhs.dims(); - dim4 src_strides = rhs.strides(); + af::dim4 src_dims = rhs.dims(); + af::dim4 src_strides = rhs.strides(); // declare pointers to af_array index data - const uint* ptr0 = idxArrs[0].get(); - const uint* ptr1 = idxArrs[1].get(); - const uint* ptr2 = idxArrs[2].get(); - const uint* ptr3 = idxArrs[3].get(); + uint const * const ptr0 = idxArrs[0].get(); + uint const * const ptr1 = idxArrs[1].get(); + uint const * const ptr2 = idxArrs[2].get(); + uint const * const ptr3 = idxArrs[3].get(); const T * src= rhs.get(); T * dst = out.get(); diff --git a/src/backend/cpu/kernel/bilateral.hpp b/src/backend/cpu/kernel/bilateral.hpp index 2b5764fd37..c950bbd084 100644 --- a/src/backend/cpu/kernel/bilateral.hpp +++ b/src/backend/cpu/kernel/bilateral.hpp @@ -7,42 +7,33 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include +#include +#include namespace cpu { namespace kernel { -inline -dim_t clamp(int a, dim_t mn, dim_t mx) +template +void bilateral(Array out, Array const in, float const s_sigma, float const c_sigma) { - return (a < (int)mn ? mn : (a > (int)mx ? 
mx : a)); -} - -inline -unsigned getIdx(const dim4 &strides, int i, int j = 0, int k = 0, int l = 0) -{ - return (l * strides[3] + k * strides[2] + j * strides[1] + i * strides[0]); -} - -template -void bilateral(Array out, const Array in, float s_sigma, float c_sigma) -{ - const dim4 dims = in.dims(); - const dim4 istrides = in.strides(); - - const dim4 ostrides = out.strides(); + af::dim4 const dims = in.dims(); + af::dim4 const istrides = in.strides(); + af::dim4 const ostrides = out.strides(); - outType *outData = out.get(); - const inType *inData = in.get(); + OutT *outData = out.get(); + InT const * inData = in.get(); // clamp spatical and chromatic sigma's - float space_ = std::min(11.5f, std::max(s_sigma, 0.f)); - float color_ = std::max(c_sigma, 0.f); - const dim_t radius = std::max((dim_t)(space_ * 1.5f), (dim_t)1); - const float svar = space_*space_; - const float cvar = color_*color_; + float space_ = std::min(11.5f, std::max(s_sigma, 0.f)); + float color_ = std::max(c_sigma, 0.f); + dim_t const radius = std::max((dim_t)(space_ * 1.5f), (dim_t)1); + float const svar = space_*space_; + float const cvar = color_*color_; for(dim_t b3=0; b3 out, const Array in, float s_sigma, float // j steps along 2nd dimension for(dim_t i=0; i out, const Array in, float s_sigma, float // clamps offsets dim_t ti = clamp(i+wi, 0, dims[0]-1); // proceed - const outType val= (outType)inData[getIdx(istrides, ti, tj)]; - const outType gauss_space = (wi*wi+wj*wj)/(-2.0*svar); - const outType gauss_range = ((center-val)*(center-val))/(-2.0*cvar); - const outType weight = std::exp(gauss_space+gauss_range); + OutT const val= (OutT)inData[getIdx(istrides, ti, tj)]; + OutT const gauss_space = (wi*wi+wj*wj)/(-2.0*svar); + OutT const gauss_range = ((center-val)*(center-val))/(-2.0*cvar); + OutT const weight = std::exp(gauss_space+gauss_range); norm += weight; res += val*weight; } diff --git a/src/backend/cpu/kernel/convolve.hpp b/src/backend/cpu/kernel/convolve.hpp index 
d39acb65c9..79d684dd64 100644 --- a/src/backend/cpu/kernel/convolve.hpp +++ b/src/backend/cpu/kernel/convolve.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include namespace cpu @@ -14,41 +16,39 @@ namespace cpu namespace kernel { -using af::dim4; - -template -void one2one_1d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims, - dim4 const &sDims, dim4 const &fDims, dim4 const &sStrides) +template +void one2one_1d(InT *optr, InT const * const iptr, AccT const * const fptr, af::dim4 const & oDims, + af::dim4 const & sDims, af::dim4 const & fDims, af::dim4 const & sStrides) { - dim_t start = (expand ? 0 : fDims[0]/2); - dim_t end = (expand ? oDims[0] : start + sDims[0]); + dim_t start = (Expand ? 0 : fDims[0]/2); + dim_t end = (Expand ? oDims[0] : start + sDims[0]); for(dim_t i=start; i=0 &&iIdx=0 &&iIdx -void one2one_2d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims, - dim4 const &sDims, dim4 const &fDims, dim4 const &oStrides, - dim4 const &sStrides, dim4 const &fStrides) +template +void one2one_2d(InT *optr, InT const * const iptr, AccT const * const fptr, af::dim4 const & oDims, + af::dim4 const & sDims, af::dim4 const & fDims, af::dim4 const & oStrides, + af::dim4 const & sStrides, af::dim4 const & fStrides) { - dim_t jStart = (expand ? 0 : fDims[1]/2); - dim_t jEnd = (expand ? oDims[1] : jStart + sDims[1]); - dim_t iStart = (expand ? 0 : fDims[0]/2); - dim_t iEnd = (expand ? oDims[0] : iStart + sDims[0]); + dim_t jStart = (Expand ? 0 : fDims[1]/2); + dim_t jEnd = (Expand ? oDims[1] : jStart + sDims[1]); + dim_t iStart = (Expand ? 0 : fDims[0]/2); + dim_t iEnd = (Expand ? 
oDims[0] : iStart + sDims[0]); for(dim_t j=jStart; j=0 && iIdx -void one2one_3d(T *optr, T const *iptr, accT const *fptr, dim4 const &oDims, - dim4 const &sDims, dim4 const &fDims, dim4 const &oStrides, - dim4 const &sStrides, dim4 const &fStrides) +template +void one2one_3d(InT *optr, InT const * const iptr, AccT const * const fptr, af::dim4 const & oDims, + af::dim4 const & sDims, af::dim4 const & fDims, af::dim4 const & oStrides, + af::dim4 const & sStrides, af::dim4 const & fStrides) { - dim_t kStart = (expand ? 0 : fDims[2]/2); - dim_t kEnd = (expand ? oDims[2] : kStart + sDims[2]); - dim_t jStart = (expand ? 0 : fDims[1]/2); - dim_t jEnd = (expand ? oDims[1] : jStart + sDims[1]); - dim_t iStart = (expand ? 0 : fDims[0]/2); - dim_t iEnd = (expand ? oDims[0] : iStart + sDims[0]); + dim_t kStart = (Expand ? 0 : fDims[2]/2); + dim_t kEnd = (Expand ? oDims[2] : kStart + sDims[2]); + dim_t jStart = (Expand ? 0 : fDims[1]/2); + dim_t jEnd = (Expand ? oDims[1] : jStart + sDims[1]); + dim_t iStart = (Expand ? 0 : fDims[0]/2); + dim_t iEnd = (Expand ? 
oDims[0] : iStart + sDims[0]); for(dim_t k=kStart; k=0 && iIdx -void convolve_nd(Array out, Array const signal, Array const filter, ConvolveBatchKind kind) +template +void convolve_nd(Array out, Array const signal, Array const filter, ConvolveBatchKind kind) { - T * optr = out.get(); - T const * const iptr = signal.get(); - accT const * const fptr = filter.get(); + InT * optr = out.get(); + InT const * const iptr = signal.get(); + AccT const * const fptr = filter.get(); - dim4 const oDims = out.dims(); - dim4 const sDims = signal.dims(); - dim4 const fDims = filter.dims(); + af::dim4 const oDims = out.dims(); + af::dim4 const sDims = signal.dims(); + af::dim4 const fDims = filter.dims(); - dim4 const oStrides = out.strides(); - dim4 const sStrides = signal.strides(); - dim4 const fStrides = filter.strides(); + af::dim4 const oStrides = out.strides(); + af::dim4 const sStrides = signal.strides(); + af::dim4 const fStrides = filter.strides(); dim_t out_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ dim_t in_step[4] = {0, 0, 0, 0}; /* first value is never used, and declared for code simplicity */ @@ -169,66 +169,66 @@ void convolve_nd(Array out, Array const signal, Array const filter, for (dim_t b2=0; b2(out, in, filt, oDims, sDims, fDims, sStrides); break; - case 2: one2one_2d(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break; - case 3: one2one_3d(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break; + case 1: one2one_1d(out, in, filt, oDims, sDims, fDims, sStrides); break; + case 2: one2one_2d(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break; + case 3: one2one_3d(out, in, filt, oDims, sDims, fDims, oStrides, sStrides, fStrides); break; } } } } } -template -void convolve2_separable(T *optr, T const *iptr, accT const *fptr, - dim4 const &oDims, dim4 const &sDims, dim4 const &orgDims, dim_t fDim, - dim4 const &oStrides, dim4 const &sStrides, dim_t fStride) 
+template +void convolve2_separable(InT *optr, InT const * const iptr, AccT const * const fptr, + af::dim4 const & oDims, af::dim4 const & sDims, af::dim4 const & orgDims, dim_t fDim, + af::dim4 const & oStrides, af::dim4 const & sStrides, dim_t fStride) { for(dim_t j=0; j>1); + dim_t cj = j + (conv_dim==1)*(Expand ? 0: fDim>>1); for(dim_t i=0; i>1); + dim_t ci = i + (conv_dim==0)*(Expand ? 0 : fDim>>1); - accT accum = scalar(0); + AccT accum = scalar(0); for(dim_t f=0; f=0 && offi=0 && cj(0)); + s_val = (isCJValid && isCIValid ? iptr[cj*sDims[0]+offi] : scalar(0)); } else { dim_t offj = cj - f; bool isCIValid = ci>=0 && ci=0 && offj(0)); + s_val = (isCJValid && isCIValid ? iptr[offj*sDims[0]+ci] : scalar(0)); } - accum += accT(s_val * f_val); + accum += AccT(s_val * f_val); } - optr[iOff+jOff] = T(accum); + optr[iOff+jOff] = InT(accum); } } } -template -void convolve2(Array out, Array const signal, - Array const c_filter, Array const r_filter, - dim4 const tDims) +template +void convolve2(Array out, Array const signal, + Array const c_filter, Array const r_filter, + af::dim4 const tDims) { - Array temp = createEmptyArray(tDims); + Array temp = createEmptyArray(tDims); dim_t cflen = (dim_t)c_filter.elements(); dim_t rflen = (dim_t)r_filter.elements(); @@ -248,15 +248,15 @@ void convolve2(Array out, Array const signal, for (dim_t b2=0; b2(tptr, iptr, c_filter.get(), + convolve2_separable(tptr, iptr, c_filter.get(), tDims, sDims, sDims, cflen, tStrides, sStrides, c_filter.strides()[0]); - convolve2_separable(optr, tptr, r_filter.get(), + convolve2_separable(optr, tptr, r_filter.get(), oDims, tDims, sDims, rflen, oStrides, tStrides, r_filter.strides()[0]); } diff --git a/src/backend/cpu/kernel/copy.hpp b/src/backend/cpu/kernel/copy.hpp index 063fb29f0c..70d6705ec2 100644 --- a/src/backend/cpu/kernel/copy.hpp +++ b/src/backend/cpu/kernel/copy.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ 
+#pragma once +#include #include namespace cpu @@ -14,11 +16,9 @@ namespace cpu namespace kernel { -using af::dim4; - template -void stridedCopy(T* dst, const dim4& ostrides, const T* src, - const dim4 &dims, const dim4 &strides, unsigned dim) +void stridedCopy(T* dst, af::dim4 const & ostrides, T const * src, + af::dim4 const & dims, af::dim4 const & strides, unsigned dim) { if(dim == 0) { if(strides[dim] == 1) { @@ -38,16 +38,16 @@ void stridedCopy(T* dst, const dim4& ostrides, const T* src, } } -template -void copy(Array dst, const Array src, outType default_value, double factor) +template +void copy(Array dst, Array const src, OutT default_value, double factor) { - dim4 src_dims = src.dims(); - dim4 dst_dims = dst.dims(); - dim4 src_strides = src.strides(); - dim4 dst_strides = dst.strides(); + af::dim4 src_dims = src.dims(); + af::dim4 dst_dims = dst.dims(); + af::dim4 src_strides = src.strides(); + af::dim4 dst_strides = dst.strides(); - const inType * src_ptr = src.get(); - outType * dst_ptr = dst.get(); + InT const * const src_ptr = src.get(); + OutT * dst_ptr = dst.get(); dim_t trgt_l = std::min(dst_dims[3], src_dims[3]); dim_t trgt_k = std::min(dst_dims[2], src_dims[2]); @@ -73,10 +73,10 @@ void copy(Array dst, const Array src, outType default_value, do bool isJvalid = j #include namespace cpu @@ -14,16 +16,14 @@ namespace cpu namespace kernel { -using af::dim4; - template void diagCreate(Array out, Array const in, int const num) { int batch = in.dims()[1]; int size = out.dims()[0]; - const T *iptr = in.get(); - T *optr = out.get(); + T const * iptr = in.get(); + T * optr = out.get(); for (int k = 0; k < batch; k++) { for (int j = 0; j < size; j++) { @@ -43,10 +43,10 @@ void diagCreate(Array out, Array const in, int const num) template void diagExtract(Array out, Array const in, int const num) { - const dim4 odims = out.dims(); - const dim4 idims = in.dims(); + dim4 const odims = out.dims(); + dim4 const idims = in.dims(); - const int i_off = (num > 0) ? 
(num * in.strides()[1]) : (-num); + int const i_off = (num > 0) ? (num * in.strides()[1]) : (-num); for (int l = 0; l < (int)odims[3]; l++) { diff --git a/src/backend/cpu/kernel/diff.hpp b/src/backend/cpu/kernel/diff.hpp index e0693b1349..1a3d7ba110 100644 --- a/src/backend/cpu/kernel/diff.hpp +++ b/src/backend/cpu/kernel/diff.hpp @@ -7,19 +7,16 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include +#include namespace cpu { namespace kernel { -unsigned getIdx(af::dim4 strides, af::dim4 offs, int i, int j = 0, int k = 0, int l = 0) -{ - return (l * strides[3] + k * strides[2] + j * strides[1] + i); -} - - template void diff1(Array out, Array const in, int const dim) { @@ -30,9 +27,8 @@ void diff1(Array out, Array const in, int const dim) bool is_dim2 = dim == 2; bool is_dim3 = dim == 3; - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = out.get(); + T const * const inPtr = in.get(); + T * outPtr = out.get(); // TODO: Improve this for(dim_t l = 0; l < dims[3]; l++) { @@ -40,11 +36,11 @@ void diff1(Array out, Array const in, int const dim) for(dim_t j = 0; j < dims[1]; j++) { for(dim_t i = 0; i < dims[0]; i++) { // Operation: out[index] = in[index + 1 * dim_size] - in[index] - int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); - int jdx = getIdx(in.strides(), in.offsets(), + int idx = getIdx(in.strides(), i, j, k, l); + int jdx = getIdx(in.strides(), i + is_dim0, j + is_dim1, k + is_dim2, l + is_dim3); - int odx = getIdx(out.strides(), out.offsets(), i, j, k, l); + int odx = getIdx(out.strides(), i, j, k, l); outPtr[odx] = inPtr[jdx] - inPtr[idx]; } } @@ -62,9 +58,8 @@ void diff2(Array out, Array const in, int const dim) bool is_dim2 = dim == 2; bool is_dim3 = dim == 3; - // Get pointers to raw data - const T *inPtr = in.get(); - T *outPtr = out.get(); + T const * const inPtr = in.get(); + T * outPtr = out.get(); // TODO: Improve this for(dim_t l = 0; l < 
dims[3]; l++) { @@ -72,14 +67,14 @@ void diff2(Array out, Array const in, int const dim) for(dim_t j = 0; j < dims[1]; j++) { for(dim_t i = 0; i < dims[0]; i++) { // Operation: out[index] = in[index + 1 * dim_size] - in[index] - int idx = getIdx(in.strides(), in.offsets(), i, j, k, l); - int jdx = getIdx(in.strides(), in.offsets(), + int idx = getIdx(in.strides(), i, j, k, l); + int jdx = getIdx(in.strides(), i + is_dim0, j + is_dim1, k + is_dim2, l + is_dim3); - int kdx = getIdx(in.strides(), in.offsets(), + int kdx = getIdx(in.strides(), i + 2 * is_dim0, j + 2 * is_dim1, k + 2 * is_dim2, l + 2 * is_dim3); - int odx = getIdx(out.strides(), out.offsets(), i, j, k, l); + int odx = getIdx(out.strides(), i, j, k, l); outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx]; } } diff --git a/src/backend/cpu/kernel/fast.hpp b/src/backend/cpu/kernel/fast.hpp index a3971dd136..02da3e4d33 100644 --- a/src/backend/cpu/kernel/fast.hpp +++ b/src/backend/cpu/kernel/fast.hpp @@ -7,20 +7,16 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include +#include namespace cpu { namespace kernel { -using af::dim4; - -inline int clamp(int f, int a, int b) -{ - return std::max(a, std::min(f, b)); -} - inline int idx_y(int i) { if (i >= 8) @@ -86,14 +82,14 @@ inline double abs_diff(double x, double y) } template -void locate_features(const Array &in, Array &score, - Array &x_out, Array &y_out, - Array &score_out, unsigned* count, const float thr, - const unsigned arc_length, const unsigned nonmax, - const unsigned max_feat, const unsigned edge) +void locate_features(Array const & in, Array & score, + Array & x_out, Array & y_out, + Array & score_out, unsigned* count, float const thr, + unsigned const arc_length, unsigned const nonmax, + unsigned const max_feat, unsigned const edge) { - dim4 in_dims = in.dims(); - const T* in_ptr = in.get(); + af::dim4 in_dims = in.dims(); + T const * in_ptr = 
in.get(); for (int y = edge; y < (int)(in_dims[0] - edge); y++) { for (int x = edge; x < (int)(in_dims[1] - edge); x++) { @@ -179,15 +175,15 @@ void locate_features(const Array &in, Array &score, } } -void non_maximal(const Array &score, const Array &x_in, const Array &y_in, - Array &x_out, Array &y_out, Array &score_out, - unsigned* count, const unsigned total_feat, const unsigned edge) +void non_maximal(Array const & score, const Array & x_in, const Array & y_in, + Array & x_out, Array & y_out, Array & score_out, + unsigned* count, unsigned const total_feat, unsigned const edge) { - const float *score_ptr = score.get(); - const float *x_in_ptr = x_in.get(); - const float *y_in_ptr = y_in.get(); + float const * score_ptr = score.get(); + float const * x_in_ptr = x_in.get(); + float const * y_in_ptr = y_in.get(); - dim4 score_dims = score.dims(); + af::dim4 score_dims = score.dims(); for (unsigned k = 0; k < total_feat; k++) { unsigned x = static_cast(round(x_in_ptr[k])); diff --git a/src/backend/cpu/kernel/fftconvolve.hpp b/src/backend/cpu/kernel/fftconvolve.hpp index 30bac668f1..6213cb2730 100644 --- a/src/backend/cpu/kernel/fftconvolve.hpp +++ b/src/backend/cpu/kernel/fftconvolve.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include #include @@ -15,8 +17,6 @@ namespace cpu namespace kernel { -using af::dim4; - template void packData(Array out, const af::dim4 od, const af::dim4 os, Array const in) { @@ -95,12 +95,12 @@ void complexMultiply(Array packed, const af::dim4 sig_dims, const af::dim4 si T* in1_ptr = packed.get(); T* in2_ptr = packed.get() + offset; - const dim4& od = (kind==CONVOLVE_BATCH_KERNEL ? fit_dims : sig_dims); - const dim4& os = (kind==CONVOLVE_BATCH_KERNEL ? 
fit_strides : sig_strides); - const dim4& i1d = sig_dims; - const dim4& i2d = fit_dims; - const dim4& i1s = sig_strides; - const dim4& i2s = fit_strides; + const af::dim4& od = (kind==CONVOLVE_BATCH_KERNEL ? fit_dims : sig_dims); + const af::dim4& os = (kind==CONVOLVE_BATCH_KERNEL ? fit_strides : sig_strides); + const af::dim4& i1d = sig_dims; + const af::dim4& i2d = fit_dims; + const af::dim4& i1s = sig_strides; + const af::dim4& i2s = fit_strides; for (int d3 = 0; d3 < (int)od[3]; d3++) { for (int d2 = 0; d2 < (int)od[2]; d2++) { diff --git a/src/backend/cpu/kernel/gradient.hpp b/src/backend/cpu/kernel/gradient.hpp index c152fb343a..1ab01abb0f 100644 --- a/src/backend/cpu/kernel/gradient.hpp +++ b/src/backend/cpu/kernel/gradient.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include namespace cpu diff --git a/src/backend/cpu/kernel/harris.hpp b/src/backend/cpu/kernel/harris.hpp index db6551bbde..183cf37e77 100644 --- a/src/backend/cpu/kernel/harris.hpp +++ b/src/backend/cpu/kernel/harris.hpp @@ -7,31 +7,16 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include +#include namespace cpu { namespace kernel { -template -void gaussian1D(T* out, const int dim, double sigma=0.0) -{ - if(!(sigma>0)) sigma = 0.25*dim; - - T sum = (T)0; - for(int i=0;i void second_order_deriv(Array ixx, Array ixy, Array iyy, const unsigned in_len, const Array ix, const Array iy) diff --git a/src/backend/cpu/kernel/histogram.hpp b/src/backend/cpu/kernel/histogram.hpp index e26965aa04..9b9b897c02 100644 --- a/src/backend/cpu/kernel/histogram.hpp +++ b/src/backend/cpu/kernel/histogram.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include namespace cpu @@ -14,8 +16,8 @@ namespace cpu namespace kernel { -template 
-void histogram(Array out, Array const in, +template +void histogram(Array out, Array const in, unsigned const nbins, double const minval, double const maxval) { dim4 const outDims = out.dims(); @@ -25,13 +27,13 @@ void histogram(Array out, Array const in, dim4 const oStrides = out.strides(); dim_t const nElems = inDims[0]*inDims[1]; - outType *outData = out.get(); - const inType* inData= in.get(); + OutT *outData = out.get(); + const InT* inData= in.get(); for(dim_t b3 = 0; b3 < outDims[3]; b3++) { for(dim_t b2 = 0; b2 < outDims[2]; b2++) { for(dim_t i=0; i #include #include @@ -15,13 +17,11 @@ namespace cpu namespace kernel { -using af::dim4; - template void hsv2rgb(Array out, Array const in) { - const dim4 dims = in.dims(); - const dim4 strides = in.strides(); + const af::dim4 dims = in.dims(); + const af::dim4 strides = in.strides(); dim_t obStride = out.strides()[3]; dim_t coff = strides[2]; dim_t bCount = dims[3]; @@ -72,9 +72,9 @@ void hsv2rgb(Array out, Array const in) template void rgb2hsv(Array out, Array const in) { - const dim4 dims = in.dims(); - const dim4 strides = in.strides(); - dim4 oStrides = out.strides(); + const af::dim4 dims = in.dims(); + const af::dim4 strides = in.strides(); + af::dim4 oStrides = out.strides(); dim_t bCount = dims[3]; for(dim_t b=0; b #include #include @@ -15,13 +17,11 @@ namespace cpu namespace kernel { -using af::dim4; - template void identity(Array out) { T *ptr = out.get(); - const dim4 out_dims = out.dims(); + const af::dim4 out_dims = out.dims(); for (dim_t k = 0; k < out_dims[2] * out_dims[3]; k++) { for (dim_t j = 0; j < out_dims[1]; j++) { diff --git a/src/backend/cpu/kernel/iir.hpp b/src/backend/cpu/kernel/iir.hpp index d1ca464365..5182094fc2 100644 --- a/src/backend/cpu/kernel/iir.hpp +++ b/src/backend/cpu/kernel/iir.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include namespace cpu @@ -14,8 +16,6 @@ namespace 
cpu namespace kernel { -using af::dim4; - template void iir(Array y, Array c, Array const a) { diff --git a/src/backend/cpu/kernel/index.hpp b/src/backend/cpu/kernel/index.hpp index ee20c24d44..343d7ae4e7 100644 --- a/src/backend/cpu/kernel/index.hpp +++ b/src/backend/cpu/kernel/index.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include #include #include @@ -16,19 +18,17 @@ namespace cpu namespace kernel { -using af::dim4; - template void index(Array out, Array const in, std::vector const isSeq, std::vector const seqs, std::vector< Array > const idxArrs) { - const dim4 iDims = in.dims(); - const dim4 dDims = in.getDataDims(); - const dim4 iOffs = toOffset(seqs, dDims); - const dim4 iStrds = toStride(seqs, dDims); - const dim4 oDims = out.dims(); - const dim4 oStrides = out.strides(); + const af::dim4 iDims = in.dims(); + const af::dim4 dDims = in.getDataDims(); + const af::dim4 iOffs = toOffset(seqs, dDims); + const af::dim4 iStrds = toStride(seqs, dDims); + const af::dim4 oDims = out.dims(); + const af::dim4 oStrides = out.strides(); const T *src = in.get(); T *dst = out.get(); const uint* ptr0 = idxArrs[0].get(); diff --git a/src/backend/cpu/kernel/lookup.hpp b/src/backend/cpu/kernel/lookup.hpp index 551cd2fd03..a290ef2fca 100644 --- a/src/backend/cpu/kernel/lookup.hpp +++ b/src/backend/cpu/kernel/lookup.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include #include #include @@ -16,20 +18,18 @@ namespace cpu namespace kernel { -using af::dim4; - -template -void lookup(Array out, Array const input, - Array const indices, unsigned const dim) +template +void lookup(Array out, Array const input, + Array const indices, unsigned const dim) { - const dim4 iDims = input.dims(); - const dim4 oDims = out.dims(); - const dim4 iStrides = input.strides(); - const dim4 
oStrides = out.strides(); - const in_t *inPtr = input.get(); - const idx_t *idxPtr = indices.get(); - - in_t *outPtr = out.get(); + const af::dim4 iDims = input.dims(); + const af::dim4 oDims = out.dims(); + const af::dim4 iStrides = input.strides(); + const af::dim4 oStrides = out.strides(); + const InT *inPtr = input.get(); + const IndexT *idxPtr = indices.get(); + + InT *outPtr = out.get(); for (dim_t l=0; l +#include +#include +#include #include "backend.hpp" namespace cpu { static inline -dim_t trimIndex(const int &idx, const dim_t &len) +dim_t trimIndex(int const & idx, dim_t const & len) { int ret_val = idx; int offset = abs(ret_val)%len; @@ -27,4 +30,34 @@ dim_t trimIndex(const int &idx, const dim_t &len) return ret_val; } +static inline +dim_t clamp(int a, dim_t mn, dim_t mx) +{ + return (a < (int)mn ? mn : (a > (int)mx ? mx : a)); +} + +static inline +unsigned getIdx(af::dim4 const & strides, int i, int j = 0, int k = 0, int l = 0) +{ + return (l * strides[3] + k * strides[2] + j * strides[1] + i * strides[0]); +} + +template +void gaussian1D(T* out, int const dim, double sigma=0.0) +{ + if(!(sigma>0)) sigma = 0.25*dim; + + T sum = (T)0; + for(int i=0;i Date: Sat, 19 Dec 2015 14:06:37 -0500 Subject: [PATCH 086/288] Moved more cpu fns implementations to kernel namespace Below given is the list of functions that have undergone this change: * iota * ireduce * join * lu decomposition * template matching * mean shift * median filter * morphological operations --- src/backend/cpu/iota.cpp | 38 +----- src/backend/cpu/ireduce.cpp | 100 +-------------- src/backend/cpu/join.cpp | 148 ++-------------------- src/backend/cpu/kernel/iota.hpp | 45 +++++++ src/backend/cpu/kernel/ireduce.hpp | 108 ++++++++++++++++ src/backend/cpu/kernel/join.hpp | 144 +++++++++++++++++++++ src/backend/cpu/kernel/lu.hpp | 80 ++++++++++++ src/backend/cpu/kernel/match_template.hpp | 141 +++++++++++++++++++++ src/backend/cpu/kernel/meanshift.hpp | 138 ++++++++++++++++++++ 
src/backend/cpu/kernel/medfilt.hpp | 135 ++++++++++++++++++++ src/backend/cpu/kernel/morph.hpp | 140 ++++++++++++++++++++ src/backend/cpu/lu.cpp | 66 +--------- src/backend/cpu/match_template.cpp | 127 +------------------ src/backend/cpu/meanshift.cpp | 121 +----------------- src/backend/cpu/medfilt.cpp | 117 +---------------- src/backend/cpu/morph.cpp | 126 +----------------- src/backend/cpu/utility.hpp | 4 +- 17 files changed, 971 insertions(+), 807 deletions(-) create mode 100644 src/backend/cpu/kernel/iota.hpp create mode 100644 src/backend/cpu/kernel/ireduce.hpp create mode 100644 src/backend/cpu/kernel/join.hpp create mode 100644 src/backend/cpu/kernel/lu.hpp create mode 100644 src/backend/cpu/kernel/match_template.hpp create mode 100644 src/backend/cpu/kernel/meanshift.hpp create mode 100644 src/backend/cpu/kernel/medfilt.hpp create mode 100644 src/backend/cpu/kernel/morph.hpp diff --git a/src/backend/cpu/iota.cpp b/src/backend/cpu/iota.cpp index dcb85fa787..41f0c9c518 100644 --- a/src/backend/cpu/iota.cpp +++ b/src/backend/cpu/iota.cpp @@ -10,49 +10,15 @@ #include #include #include -#include -#include -#include -#include #include #include +#include using namespace std; namespace cpu { -/////////////////////////////////////////////////////////////////////////// -// Kernel Functions -/////////////////////////////////////////////////////////////////////////// -template -void iota_(Array output, const dim4 &sdims, const dim4 &tdims) -{ - const dim4 dims = output.dims(); - T* out = output.get(); - const dim4 strides = output.strides(); - - for(dim_t w = 0; w < dims[3]; w++) { - dim_t offW = w * strides[3]; - T valW = (w % sdims[3]) * sdims[0] * sdims[1] * sdims[2]; - for(dim_t z = 0; z < dims[2]; z++) { - dim_t offWZ = offW + z * strides[2]; - T valZ = valW + (z % sdims[2]) * sdims[0] * sdims[1]; - for(dim_t y = 0; y < dims[1]; y++) { - dim_t offWZY = offWZ + y * strides[1]; - T valY = valZ + (y % sdims[1]) * sdims[0]; - for(dim_t x = 0; x < dims[0]; x++) { - 
dim_t id = offWZY + x; - out[id] = valY + (x % sdims[0]); - } - } - } - } -} - -/////////////////////////////////////////////////////////////////////////// -// Wrapper Functions -/////////////////////////////////////////////////////////////////////////// template Array iota(const dim4 &dims, const dim4 &tile_dims) { @@ -60,7 +26,7 @@ Array iota(const dim4 &dims, const dim4 &tile_dims) Array out = createEmptyArray(outdims); - getQueue().enqueue(iota_, out, dims, tile_dims); + getQueue().enqueue(kernel::iota, out, dims, tile_dims); return out; } diff --git a/src/backend/cpu/ireduce.cpp b/src/backend/cpu/ireduce.cpp index 9858cba665..f1efcf646a 100644 --- a/src/backend/cpu/ireduce.cpp +++ b/src/backend/cpu/ireduce.cpp @@ -13,103 +13,15 @@ #include #include #include - #include #include +#include using af::dim4; namespace cpu { -template double cabs(const T in) { return (double)in; } -static double cabs(const char in) { return (double)(in > 0); } -static double cabs(const cfloat &in) { return (double)abs(in); } -static double cabs(const cdouble &in) { return (double)abs(in); } - -template -struct MinMaxOp -{ - T m_val; - uint m_idx; - MinMaxOp(T val, uint idx) : - m_val(val), m_idx(idx) - { - } - - void operator()(T val, uint idx) - { - if (cabs(val) < cabs(m_val) || - (cabs(val) == cabs(m_val) && - idx > m_idx)) { - m_val = val; - m_idx = idx; - } - } -}; - -template -struct MinMaxOp -{ - T m_val; - uint m_idx; - MinMaxOp(T val, uint idx) : - m_val(val), m_idx(idx) - { - } - - void operator()(T val, uint idx) - { - if (cabs(val) > cabs(m_val) || - (cabs(val) == cabs(m_val) && - idx <= m_idx)) { - m_val = val; - m_idx = idx; - } - } -}; - -template -struct ireduce_dim -{ - void operator()(Array output, Array locArray, const dim_t outOffset, - const Array input, const dim_t inOffset, const int dim) - { - const dim4 odims = output.dims(); - const dim4 ostrides = output.strides(); - const dim4 istrides = input.strides(); - const int D1 = D - 1; - for (dim_t i = 0; i < 
odims[D1]; i++) { - ireduce_dim()(output, locArray, outOffset + i * ostrides[D1], - input, inOffset + i * istrides[D1], dim); - } - } -}; - -template -struct ireduce_dim -{ - void operator()(Array output, Array locArray, const dim_t outOffset, - const Array input, const dim_t inOffset, const int dim) - { - const dim4 idims = input.dims(); - const dim4 istrides = input.strides(); - - T const * const in = input.get(); - T * out = output.get(); - uint * loc = locArray.get(); - - dim_t stride = istrides[dim]; - MinMaxOp Op(in[0], 0); - for (dim_t i = 0; i < idims[dim]; i++) { - Op(in[inOffset + i * stride], i); - } - - *(out+outOffset) = Op.m_val; - *(loc+outOffset) = Op.m_idx; - } -}; - template using ireduce_dim_func = std::function, Array, const dim_t, const Array, const dim_t, const int)>; @@ -123,10 +35,10 @@ void ireduce(Array &out, Array &loc, const Array &in, const int dim) dim4 odims = in.dims(); odims[dim] = 1; - static const ireduce_dim_func ireduce_funcs[] = { ireduce_dim() - , ireduce_dim() - , ireduce_dim() - , ireduce_dim()}; + static const ireduce_dim_func ireduce_funcs[] = { kernel::ireduce_dim() + , kernel::ireduce_dim() + , kernel::ireduce_dim() + , kernel::ireduce_dim()}; getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim); } @@ -141,7 +53,7 @@ T ireduce_all(unsigned *loc, const Array &in) af::dim4 strides = in.strides(); const T *inPtr = in.get(); - MinMaxOp Op(inPtr[0], 0); + kernel::MinMaxOp Op(inPtr[0], 0); for(dim_t l = 0; l < dims[3]; l++) { dim_t off3 = l * strides[3]; diff --git a/src/backend/cpu/join.cpp b/src/backend/cpu/join.cpp index 8af9c24f8d..e39280c943 100644 --- a/src/backend/cpu/join.cpp +++ b/src/backend/cpu/join.cpp @@ -9,50 +9,12 @@ #include #include -#include -#include #include #include +#include namespace cpu { -template -void join_append(To *out, const Tx *X, const af::dim4 &offset, - const af::dim4 &odims, const af::dim4 &xdims, - const af::dim4 &ost, const af::dim4 &xst) -{ - for(dim_t ow = 0; ow < 
xdims[3]; ow++) { - const dim_t xW = ow * xst[3]; - const dim_t oW = (ow + offset[3]) * ost[3]; - - for(dim_t oz = 0; oz < xdims[2]; oz++) { - const dim_t xZW = xW + oz * xst[2]; - const dim_t oZW = oW + (oz + offset[2]) * ost[2]; - - for(dim_t oy = 0; oy < xdims[1]; oy++) { - const dim_t xYZW = xZW + oy * xst[1]; - const dim_t oYZW = oZW + (oy + offset[1]) * ost[1]; - - for(dim_t ox = 0; ox < xdims[0]; ox++) { - const dim_t iMem = xYZW + ox; - const dim_t oMem = oYZW + (ox + offset[0]); - out[oMem] = X[iMem]; - } - } - } - } -} - -template -af::dim4 calcOffset(const af::dim4 dims) -{ - af::dim4 offset; - offset[0] = (dim == 0) ? dims[0] : 0; - offset[1] = (dim == 1) ? dims[1] : 0; - offset[2] = (dim == 2) ? dims[2] : 0; - offset[3] = (dim == 3) ? dims[3] : 0; - return offset; -} template Array join(const int dim, const Array &first, const Array &second) @@ -76,97 +38,15 @@ Array join(const int dim, const Array &first, const Array &second) Array out = createEmptyArray(odims); - auto func = [=] (Array out, const Array first, const Array second) { - Tx* outPtr = out.get(); - const Tx* fptr = first.get(); - const Ty* sptr = second.get(); - - af::dim4 zero(0,0,0,0); - const af::dim4 odims = out.dims(); - const af::dim4 fdims = first.dims(); - const af::dim4 sdims = second.dims(); - - switch(dim) { - case 0: - join_append(outPtr, fptr, zero, - odims, fdims, out.strides(), first.strides()); - join_append(outPtr, sptr, calcOffset<0>(fdims), - odims, sdims, out.strides(), second.strides()); - break; - case 1: - join_append(outPtr, fptr, zero, - odims, fdims, out.strides(), first.strides()); - join_append(outPtr, sptr, calcOffset<1>(fdims), - odims, sdims, out.strides(), second.strides()); - break; - case 2: - join_append(outPtr, fptr, zero, - odims, fdims, out.strides(), first.strides()); - join_append(outPtr, sptr, calcOffset<2>(fdims), - odims, sdims, out.strides(), second.strides()); - break; - case 3: - join_append(outPtr, fptr, zero, - odims, fdims, out.strides(), 
first.strides()); - join_append(outPtr, sptr, calcOffset<3>(fdims), - odims, sdims, out.strides(), second.strides()); - break; - } - }; - getQueue().enqueue(func, out, first, second); + getQueue().enqueue(kernel::join, out, dim, first, second); return out; } -template -void join_wrapper(const int dim, Array out, const std::vector> inputs) -{ - af::dim4 zero(0,0,0,0); - af::dim4 d = zero; - switch(dim) { - case 0: - join_append(out.get(), inputs[0].get(), zero, - out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); - for(int i = 1; i < n_arrays; i++) { - d += inputs[i - 1].dims(); - join_append(out.get(), inputs[i].get(), calcOffset<0>(d), - out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); - } - break; - case 1: - join_append(out.get(), inputs[0].get(), zero, - out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); - for(int i = 1; i < n_arrays; i++) { - d += inputs[i - 1].dims(); - join_append(out.get(), inputs[i].get(), calcOffset<1>(d), - out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); - } - break; - case 2: - join_append(out.get(), inputs[0].get(), zero, - out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); - for(int i = 1; i < n_arrays; i++) { - d += inputs[i - 1].dims(); - join_append(out.get(), inputs[i].get(), calcOffset<2>(d), - out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); - } - break; - case 3: - join_append(out.get(), inputs[0].get(), zero, - out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); - for(int i = 1; i < n_arrays; i++) { - d += inputs[i - 1].dims(); - join_append(out.get(), inputs[i].get(), calcOffset<3>(d), - out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); - } - break; - } -} - template Array join(const int dim, const std::vector> &inputs) { - for (int i=0; i join(const int dim, const std::vector> &inputs) std::vector idims(n_arrays); dim_t dim_size = 0; - for(int i = 0; i < (int)idims.size(); i++) { + 
for(unsigned i = 0; i < idims.size(); i++) { idims[i] = inputs[i].dims(); dim_size += idims[i][dim]; } @@ -192,34 +72,34 @@ Array join(const int dim, const std::vector> &inputs) switch(n_arrays) { case 1: - getQueue().enqueue(join_wrapper, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 2: - getQueue().enqueue(join_wrapper, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 3: - getQueue().enqueue(join_wrapper, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 4: - getQueue().enqueue(join_wrapper, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 5: - getQueue().enqueue(join_wrapper, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 6: - getQueue().enqueue(join_wrapper, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 7: - getQueue().enqueue(join_wrapper, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 8: - getQueue().enqueue(join_wrapper, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 9: - getQueue().enqueue(join_wrapper, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 10: - getQueue().enqueue(join_wrapper, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; } diff --git a/src/backend/cpu/kernel/iota.hpp b/src/backend/cpu/kernel/iota.hpp new file mode 100644 index 0000000000..0f824295a4 --- /dev/null +++ b/src/backend/cpu/kernel/iota.hpp @@ -0,0 +1,45 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void iota(Array output, const af::dim4 &sdims, const af::dim4 &tdims) +{ + const af::dim4 dims = output.dims(); + T* out = output.get(); + const af::dim4 strides = output.strides(); + + for(dim_t w = 0; w < dims[3]; w++) { + dim_t offW = w * strides[3]; + T valW = (w % sdims[3]) * sdims[0] * sdims[1] * sdims[2]; + for(dim_t z = 0; z < dims[2]; z++) { + dim_t offWZ = offW + z * strides[2]; + T valZ = valW + (z % sdims[2]) * sdims[0] * sdims[1]; + for(dim_t y = 0; y < dims[1]; y++) { + dim_t offWZY = offWZ + y * strides[1]; + T valY = valZ + (y % sdims[1]) * sdims[0]; + for(dim_t x = 0; x < dims[0]; x++) { + dim_t id = offWZY + x; + out[id] = valY + (x % sdims[0]); + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/ireduce.hpp b/src/backend/cpu/kernel/ireduce.hpp new file mode 100644 index 0000000000..1f5a51da62 --- /dev/null +++ b/src/backend/cpu/kernel/ireduce.hpp @@ -0,0 +1,108 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template double cabs(const T in) { return (double)in; } +static double cabs(const char in) { return (double)(in > 0); } +static double cabs(const cfloat &in) { return (double)abs(in); } +static double cabs(const cdouble &in) { return (double)abs(in); } + +template +struct MinMaxOp +{ + T m_val; + uint m_idx; + MinMaxOp(T val, uint idx) : + m_val(val), m_idx(idx) + { + } + + void operator()(T val, uint idx) + { + if (cabs(val) < cabs(m_val) || + (cabs(val) == cabs(m_val) && + idx > m_idx)) { + m_val = val; + m_idx = idx; + } + } +}; + +template +struct MinMaxOp +{ + T m_val; + uint m_idx; + MinMaxOp(T val, uint idx) : + m_val(val), m_idx(idx) + { + } + + void operator()(T val, uint idx) + { + if (cabs(val) > cabs(m_val) || + (cabs(val) == cabs(m_val) && + idx <= m_idx)) { + m_val = val; + m_idx = idx; + } + } +}; + +template +struct ireduce_dim +{ + void operator()(Array output, Array locArray, const dim_t outOffset, + const Array input, const dim_t inOffset, const int dim) + { + const af::dim4 odims = output.dims(); + const af::dim4 ostrides = output.strides(); + const af::dim4 istrides = input.strides(); + const int D1 = D - 1; + for (dim_t i = 0; i < odims[D1]; i++) { + ireduce_dim()(output, locArray, outOffset + i * ostrides[D1], + input, inOffset + i * istrides[D1], dim); + } + } +}; + +template +struct ireduce_dim +{ + void operator()(Array output, Array locArray, const dim_t outOffset, + const Array input, const dim_t inOffset, const int dim) + { + const af::dim4 idims = input.dims(); + const af::dim4 istrides = input.strides(); + + T const * const in = input.get(); + T * out = output.get(); + uint * loc = locArray.get(); + + dim_t stride = istrides[dim]; + MinMaxOp Op(in[0], 0); + for (dim_t i = 0; i < idims[dim]; i++) { 
+ Op(in[inOffset + i * stride], i); + } + + *(out+outOffset) = Op.m_val; + *(loc+outOffset) = Op.m_idx; + } +}; + +} +} diff --git a/src/backend/cpu/kernel/join.hpp b/src/backend/cpu/kernel/join.hpp new file mode 100644 index 0000000000..b0d92c9978 --- /dev/null +++ b/src/backend/cpu/kernel/join.hpp @@ -0,0 +1,144 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +af::dim4 calcOffset(const af::dim4 dims) +{ + af::dim4 offset; + offset[0] = (dim == 0) ? dims[0] : 0; + offset[1] = (dim == 1) ? dims[1] : 0; + offset[2] = (dim == 2) ? dims[2] : 0; + offset[3] = (dim == 3) ? dims[3] : 0; + return offset; +} + +template +void join_append(To *out, const Tx *X, const af::dim4 &offset, + const af::dim4 &odims, const af::dim4 &xdims, + const af::dim4 &ost, const af::dim4 &xst) +{ + for(dim_t ow = 0; ow < xdims[3]; ow++) { + const dim_t xW = ow * xst[3]; + const dim_t oW = (ow + offset[3]) * ost[3]; + + for(dim_t oz = 0; oz < xdims[2]; oz++) { + const dim_t xZW = xW + oz * xst[2]; + const dim_t oZW = oW + (oz + offset[2]) * ost[2]; + + for(dim_t oy = 0; oy < xdims[1]; oy++) { + const dim_t xYZW = xZW + oy * xst[1]; + const dim_t oYZW = oZW + (oy + offset[1]) * ost[1]; + + for(dim_t ox = 0; ox < xdims[0]; ox++) { + const dim_t iMem = xYZW + ox; + const dim_t oMem = oYZW + (ox + offset[0]); + out[oMem] = X[iMem]; + } + } + } + } +} + +template +void join(Array out, const int dim, const Array first, const Array second) +{ + Tx* outPtr = out.get(); + const Tx* fptr = first.get(); + const Ty* sptr = second.get(); + + af::dim4 zero(0,0,0,0); + const af::dim4 odims = out.dims(); + const af::dim4 fdims = 
first.dims(); + const af::dim4 sdims = second.dims(); + + switch(dim) { + case 0: + join_append(outPtr, fptr, zero, + odims, fdims, out.strides(), first.strides()); + join_append(outPtr, sptr, calcOffset<0>(fdims), + odims, sdims, out.strides(), second.strides()); + break; + case 1: + join_append(outPtr, fptr, zero, + odims, fdims, out.strides(), first.strides()); + join_append(outPtr, sptr, calcOffset<1>(fdims), + odims, sdims, out.strides(), second.strides()); + break; + case 2: + join_append(outPtr, fptr, zero, + odims, fdims, out.strides(), first.strides()); + join_append(outPtr, sptr, calcOffset<2>(fdims), + odims, sdims, out.strides(), second.strides()); + break; + case 3: + join_append(outPtr, fptr, zero, + odims, fdims, out.strides(), first.strides()); + join_append(outPtr, sptr, calcOffset<3>(fdims), + odims, sdims, out.strides(), second.strides()); + break; + } +} + +template +void join(const int dim, Array out, const std::vector> inputs) +{ + af::dim4 zero(0,0,0,0); + af::dim4 d = zero; + switch(dim) { + case 0: + join_append(out.get(), inputs[0].get(), zero, + out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); + for(int i = 1; i < n_arrays; i++) { + d += inputs[i - 1].dims(); + join_append(out.get(), inputs[i].get(), calcOffset<0>(d), + out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); + } + break; + case 1: + join_append(out.get(), inputs[0].get(), zero, + out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); + for(int i = 1; i < n_arrays; i++) { + d += inputs[i - 1].dims(); + join_append(out.get(), inputs[i].get(), calcOffset<1>(d), + out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); + } + break; + case 2: + join_append(out.get(), inputs[0].get(), zero, + out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); + for(int i = 1; i < n_arrays; i++) { + d += inputs[i - 1].dims(); + join_append(out.get(), inputs[i].get(), calcOffset<2>(d), + out.dims(), inputs[i].dims(), 
out.strides(), inputs[i].strides()); + } + break; + case 3: + join_append(out.get(), inputs[0].get(), zero, + out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); + for(int i = 1; i < n_arrays; i++) { + d += inputs[i - 1].dims(); + join_append(out.get(), inputs[i].get(), calcOffset<3>(d), + out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); + } + break; + } +} + +} +} + diff --git a/src/backend/cpu/kernel/lu.hpp b/src/backend/cpu/kernel/lu.hpp new file mode 100644 index 0000000000..35b0c19b84 --- /dev/null +++ b/src/backend/cpu/kernel/lu.hpp @@ -0,0 +1,80 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void lu_split(Array lower, Array upper, const Array in) +{ + T *l = lower.get(); + T *u = upper.get(); + const T *i = in.get(); + + af::dim4 ldm = lower.dims(); + af::dim4 udm = upper.dims(); + af::dim4 idm = in.dims(); + af::dim4 lst = lower.strides(); + af::dim4 ust = upper.strides(); + af::dim4 ist = in.strides(); + + for(dim_t ow = 0; ow < idm[3]; ow++) { + const dim_t lW = ow * lst[3]; + const dim_t uW = ow * ust[3]; + const dim_t iW = ow * ist[3]; + + for(dim_t oz = 0; oz < idm[2]; oz++) { + const dim_t lZW = lW + oz * lst[2]; + const dim_t uZW = uW + oz * ust[2]; + const dim_t iZW = iW + oz * ist[2]; + + for(dim_t oy = 0; oy < idm[1]; oy++) { + const dim_t lYZW = lZW + oy * lst[1]; + const dim_t uYZW = uZW + oy * ust[1]; + const dim_t iYZW = iZW + oy * ist[1]; + + for(dim_t ox = 0; ox < idm[0]; ox++) { + const dim_t lMem = lYZW + ox; + const dim_t uMem = uYZW + ox; + const dim_t iMem = iYZW + ox; + if(ox > oy) { + if(oy < ldm[1]) l[lMem] = i[iMem]; + 
if(ox < udm[0]) u[uMem] = scalar(0); + } else if (oy > ox) { + if(oy < ldm[1]) l[lMem] = scalar(0); + if(ox < udm[0]) u[uMem] = i[iMem]; + } else if(ox == oy) { + if(oy < ldm[1]) l[lMem] = scalar(1.0); + if(ox < udm[0]) u[uMem] = i[iMem]; + } + } + } + } + } +} + +void convertPivot(Array p, Array pivot) +{ + int *d_pi = pivot.get(); + int *d_po = p.get(); + dim_t d0 = pivot.dims()[0]; + for(int j = 0; j < (int)d0; j++) { + // 1 indexed in pivot + std::swap(d_po[j], d_po[d_pi[j] - 1]); + } +} + +} +} diff --git a/src/backend/cpu/kernel/match_template.hpp b/src/backend/cpu/kernel/match_template.hpp new file mode 100644 index 0000000000..ae41364018 --- /dev/null +++ b/src/backend/cpu/kernel/match_template.hpp @@ -0,0 +1,141 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void matchTemplate(Array out, const Array sImg, const Array tImg) +{ + const af::dim4 sDims = sImg.dims(); + const af::dim4 tDims = tImg.dims(); + const af::dim4 sStrides = sImg.strides(); + const af::dim4 tStrides = tImg.strides(); + + const dim_t tDim0 = tDims[0]; + const dim_t tDim1 = tDims[1]; + const dim_t sDim0 = sDims[0]; + const dim_t sDim1 = sDims[1]; + + const af::dim4 oStrides = out.strides(); + + OutT tImgMean = OutT(0); + dim_t winNumElements = tImg.elements(); + bool needMean = MatchT==AF_ZSAD || MatchT==AF_LSAD || + MatchT==AF_ZSSD || MatchT==AF_LSSD || + MatchT==AF_ZNCC; + const InT * tpl = tImg.get(); + + if (needMean) { + for(dim_t tj=0; tj +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void meanShift(Array out, const Array in, const float s_sigma, + const float 
c_sigma, const unsigned iter) +{ + const af::dim4 dims = in.dims(); + const af::dim4 istrides = in.strides(); + const af::dim4 ostrides = out.strides(); + + const dim_t bCount = (IsColor ? 1 : dims[2]); + const dim_t channels = (IsColor ? dims[2] : 1); + + // clamp spatical and chromatic sigma's + float space_ = std::min(11.5f, s_sigma); + const dim_t radius = std::max((int)(space_ * 1.5f), 1); + const float cvar = c_sigma*c_sigma; + + std::vector means; + std::vector centers; + std::vector tmpclrs; + means.reserve(channels); + centers.reserve(channels); + tmpclrs.reserve(channels); + + T *outData = out.get(); + const T * inData = in.get(); + + for(dim_t b3=0; b31 + // i.e for color images where batch is along fourth dimension + centers[ch] = inData[j_in_off + i_in_off + ch*istrides[2]]; + } + + // scope of meanshift iterationd begin + for(unsigned it=0; it +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void medfilt(Array out, const Array in, dim_t w_len, dim_t w_wid) +{ + const af::dim4 dims = in.dims(); + const af::dim4 istrides = in.strides(); + const af::dim4 ostrides = out.strides(); + + std::vector wind_vals; + wind_vals.reserve(w_len*w_wid); + + T const * in_ptr = in.get(); + T * out_ptr = out.get(); + + for(int b3=0; b3<(int)dims[3]; b3++) { + + for(int b2=0; b2<(int)dims[2]; b2++) { + + for(int col=0; col<(int)dims[1]; col++) { + + int ocol_off = col*ostrides[1]; + + for(int row=0; row<(int)dims[0]; row++) { + + wind_vals.clear(); + + for(int wj=0; wj<(int)w_wid; ++wj) { + + bool isColOff = false; + + int im_col = col + wj-w_wid/2; + int im_coff; + switch(Pad) { + case AF_PAD_ZERO: + im_coff = im_col * istrides[1]; + if (im_col < 0 || im_col>=(int)dims[1]) + isColOff = true; + break; + case AF_PAD_SYM: + { + if (im_col < 0) { + im_col *= -1; + isColOff = true; + } + + if (im_col>=(int)dims[1]) { + im_col = 2*((int)dims[1]-1) - im_col; + isColOff = true; + } + + im_coff = im_col * istrides[1]; + } + break; + } + + 
for(int wi=0; wi<(int)w_len; ++wi) { + + bool isRowOff = false; + + int im_row = row + wi-w_len/2; + int im_roff; + switch(Pad) { + case AF_PAD_ZERO: + im_roff = im_row * istrides[0]; + if (im_row < 0 || im_row>=(int)dims[0]) + isRowOff = true; + break; + case AF_PAD_SYM: + { + if (im_row < 0) { + im_row *= -1; + isRowOff = true; + } + + if (im_row>=(int)dims[0]) { + im_row = 2*((int)dims[0]-1) - im_row; + isRowOff = true; + } + + im_roff = im_row * istrides[0]; + } + break; + } + + if(isRowOff || isColOff) { + switch(Pad) { + case AF_PAD_ZERO: + wind_vals.push_back(0); + break; + case AF_PAD_SYM: + wind_vals.push_back(in_ptr[im_coff+im_roff]); + break; + } + } else + wind_vals.push_back(in_ptr[im_coff+im_roff]); + } + } + + std::stable_sort(wind_vals.begin(),wind_vals.end()); + int off = wind_vals.size()/2; + if (wind_vals.size()%2==0) + out_ptr[ocol_off+row*ostrides[0]] = (wind_vals[off]+wind_vals[off-1])/2; + else { + out_ptr[ocol_off+row*ostrides[0]] = wind_vals[off]; + } + } + } + in_ptr += istrides[2]; + out_ptr += ostrides[2]; + } + } +} + + +} +} diff --git a/src/backend/cpu/kernel/morph.hpp b/src/backend/cpu/kernel/morph.hpp new file mode 100644 index 0000000000..af9b7e9373 --- /dev/null +++ b/src/backend/cpu/kernel/morph.hpp @@ -0,0 +1,140 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void morph(Array out, Array const in, Array const mask) +{ + const af::dim4 ostrides = out.strides(); + const af::dim4 istrides = in.strides(); + const af::dim4 fstrides = mask.strides(); + const af::dim4 dims = in.dims(); + const af::dim4 window = mask.dims(); + T* outData = out.get(); + const T* inData = in.get(); + const T* filter = mask.get(); + const dim_t R0 = window[0]/2; + const dim_t R1 = window[1]/2; + + for(dim_t b3=0; b3 (T)0) && offi>=0 && offj>=0 && offi +void morph3d(Array out, Array const in, Array const mask) +{ + const af::dim4 dims = in.dims(); + const af::dim4 window = mask.dims(); + const dim_t R0 = window[0]/2; + const dim_t R1 = window[1]/2; + const dim_t R2 = window[2]/2; + const af::dim4 istrides = in.strides(); + const af::dim4 fstrides = mask.strides(); + const dim_t bCount = dims[3]; + const af::dim4 ostrides = out.strides(); + T* outData = out.get(); + const T* inData = in.get(); + const T* filter = mask.get(); + + for(dim_t batchId=0; batchId (T)0) && offi>=0 && offj>=0 && offk>=0 && + offi #include #include -#include #include #include #include #include +#include namespace cpu { @@ -41,66 +41,6 @@ LU_FUNC(getrf , double , d) LU_FUNC(getrf , cfloat , c) LU_FUNC(getrf , cdouble, z) -template -void lu_split(Array lower, Array upper, const Array in) -{ - T *l = lower.get(); - T *u = upper.get(); - const T *i = in.get(); - - dim4 ldm = lower.dims(); - dim4 udm = upper.dims(); - dim4 idm = in.dims(); - dim4 lst = lower.strides(); - dim4 ust = upper.strides(); - dim4 ist = in.strides(); - - for(dim_t ow = 0; ow < idm[3]; ow++) { - const dim_t lW = ow * lst[3]; - const dim_t uW = ow * ust[3]; - const dim_t iW = ow * ist[3]; - - for(dim_t oz = 0; oz < idm[2]; oz++) { - const dim_t lZW = 
lW + oz * lst[2]; - const dim_t uZW = uW + oz * ust[2]; - const dim_t iZW = iW + oz * ist[2]; - - for(dim_t oy = 0; oy < idm[1]; oy++) { - const dim_t lYZW = lZW + oy * lst[1]; - const dim_t uYZW = uZW + oy * ust[1]; - const dim_t iYZW = iZW + oy * ist[1]; - - for(dim_t ox = 0; ox < idm[0]; ox++) { - const dim_t lMem = lYZW + ox; - const dim_t uMem = uYZW + ox; - const dim_t iMem = iYZW + ox; - if(ox > oy) { - if(oy < ldm[1]) l[lMem] = i[iMem]; - if(ox < udm[0]) u[uMem] = scalar(0); - } else if (oy > ox) { - if(oy < ldm[1]) l[lMem] = scalar(0); - if(ox < udm[0]) u[uMem] = i[iMem]; - } else if(ox == oy) { - if(oy < ldm[1]) l[lMem] = scalar(1.0); - if(ox < udm[0]) u[uMem] = i[iMem]; - } - } - } - } - } -} - -void convertPivot(Array p, Array pivot) -{ - int *d_pi = pivot.get(); - int *d_po = p.get(); - dim_t d0 = pivot.dims()[0]; - for(int j = 0; j < (int)d0; j++) { - // 1 indexed in pivot - std::swap(d_po[j], d_po[d_pi[j] - 1]); - } -} - template void lu(Array &lower, Array &upper, Array &pivot, const Array &in) { @@ -119,7 +59,7 @@ void lu(Array &lower, Array &upper, Array &pivot, const Array &in) lower = createEmptyArray(ldims); upper = createEmptyArray(udims); - getQueue().enqueue(lu_split, lower, upper, in_copy); + getQueue().enqueue(kernel::lu_split, lower, upper, in_copy); } template @@ -138,7 +78,7 @@ Array lu_inplace(Array &in, const bool convert_pivot) if(convert_pivot) { Array p = range(dim4(iDims[0]), 0); - getQueue().enqueue(convertPivot, p, pivot); + getQueue().enqueue(kernel::convertPivot, p, pivot); return p; } else { return pivot; diff --git a/src/backend/cpu/match_template.cpp b/src/backend/cpu/match_template.cpp index d4ce95a691..e5b030be64 100644 --- a/src/backend/cpu/match_template.cpp +++ b/src/backend/cpu/match_template.cpp @@ -12,141 +12,24 @@ #include #include #include -#include #include #include +#include using af::dim4; namespace cpu { -template -Array match_template(const Array &sImg, const Array &tImg) +template +Array match_template(const 
Array &sImg, const Array &tImg) { sImg.eval(); tImg.eval(); - Array out = createEmptyArray(sImg.dims()); + Array out = createEmptyArray(sImg.dims()); - auto func = [=](Array out, const Array sImg, const Array tImg) { - const dim4 sDims = sImg.dims(); - const dim4 tDims = tImg.dims(); - const dim4 sStrides = sImg.strides(); - const dim4 tStrides = tImg.strides(); - - const dim_t tDim0 = tDims[0]; - const dim_t tDim1 = tDims[1]; - const dim_t sDim0 = sDims[0]; - const dim_t sDim1 = sDims[1]; - - const dim4 oStrides = out.strides(); - - outType tImgMean = outType(0); - dim_t winNumElements = tImg.elements(); - bool needMean = mType==AF_ZSAD || mType==AF_LSAD || - mType==AF_ZSSD || mType==AF_LSSD || - mType==AF_ZNCC; - const inType * tpl = tImg.get(); - - if (needMean) { - for(dim_t tj=0; tj, out, sImg, tImg); return out; } diff --git a/src/backend/cpu/meanshift.cpp b/src/backend/cpu/meanshift.cpp index 62b80e010e..6c3417a62e 100644 --- a/src/backend/cpu/meanshift.cpp +++ b/src/backend/cpu/meanshift.cpp @@ -18,6 +18,7 @@ #include #include #include +#include using af::dim4; using std::vector; @@ -25,11 +26,6 @@ using std::vector; namespace cpu { -inline dim_t clamp(dim_t a, dim_t mn, dim_t mx) -{ - return (amx ? mx : a)); -} - template Array meanshift(const Array &in, const float &s_sigma, const float &c_sigma, const unsigned iter) { @@ -37,120 +33,7 @@ Array meanshift(const Array &in, const float &s_sigma, const float &c_sig Array out = createEmptyArray(in.dims()); - auto func = [=] (Array out, const Array in, const float s_sigma, - const float c_sigma, const unsigned iter) { - const dim4 dims = in.dims(); - const dim4 istrides = in.strides(); - const dim4 ostrides = out.strides(); - - const dim_t bCount = (is_color ? 1 : dims[2]); - const dim_t channels = (is_color ? 
dims[2] : 1); - - // clamp spatical and chromatic sigma's - float space_ = std::min(11.5f, s_sigma); - const dim_t radius = std::max((int)(space_ * 1.5f), 1); - const float cvar = c_sigma*c_sigma; - - vector means; - vector centers; - vector tmpclrs; - means.reserve(channels); - centers.reserve(channels); - tmpclrs.reserve(channels); - - T *outData = out.get(); - const T * inData = in.get(); - - for(dim_t b3=0; b31 - // i.e for color images where batch is along fourth dimension - centers[ch] = inData[j_in_off + i_in_off + ch*istrides[2]]; - } - - // scope of meanshift iterationd begin - for(unsigned it=0; it, out, in, s_sigma, c_sigma, iter); return out; } diff --git a/src/backend/cpu/medfilt.cpp b/src/backend/cpu/medfilt.cpp index 4e74a55fd2..06cc0dff44 100644 --- a/src/backend/cpu/medfilt.cpp +++ b/src/backend/cpu/medfilt.cpp @@ -12,10 +12,9 @@ #include #include #include -#include -#include #include #include +#include using af::dim4; @@ -27,119 +26,9 @@ Array medfilt(const Array &in, dim_t w_len, dim_t w_wid) { in.eval(); - Array out = createEmptyArray(in.dims()); + Array out = createEmptyArray(in.dims()); - auto func = [=] (Array out, const Array in, - dim_t w_len, dim_t w_wid) { - const dim4 dims = in.dims(); - const dim4 istrides = in.strides(); - const dim4 ostrides = out.strides(); - - std::vector wind_vals; - wind_vals.reserve(w_len*w_wid); - - T const * in_ptr = in.get(); - T * out_ptr = out.get(); - - for(int b3=0; b3<(int)dims[3]; b3++) { - - for(int b2=0; b2<(int)dims[2]; b2++) { - - for(int col=0; col<(int)dims[1]; col++) { - - int ocol_off = col*ostrides[1]; - - for(int row=0; row<(int)dims[0]; row++) { - - wind_vals.clear(); - - for(int wj=0; wj<(int)w_wid; ++wj) { - - bool isColOff = false; - - int im_col = col + wj-w_wid/2; - int im_coff; - switch(pad) { - case AF_PAD_ZERO: - im_coff = im_col * istrides[1]; - if (im_col < 0 || im_col>=(int)dims[1]) - isColOff = true; - break; - case AF_PAD_SYM: - { - if (im_col < 0) { - im_col *= -1; - isColOff = 
true; - } - - if (im_col>=(int)dims[1]) { - im_col = 2*((int)dims[1]-1) - im_col; - isColOff = true; - } - - im_coff = im_col * istrides[1]; - } - break; - } - - for(int wi=0; wi<(int)w_len; ++wi) { - - bool isRowOff = false; - - int im_row = row + wi-w_len/2; - int im_roff; - switch(pad) { - case AF_PAD_ZERO: - im_roff = im_row * istrides[0]; - if (im_row < 0 || im_row>=(int)dims[0]) - isRowOff = true; - break; - case AF_PAD_SYM: - { - if (im_row < 0) { - im_row *= -1; - isRowOff = true; - } - - if (im_row>=(int)dims[0]) { - im_row = 2*((int)dims[0]-1) - im_row; - isRowOff = true; - } - - im_roff = im_row * istrides[0]; - } - break; - } - - if(isRowOff || isColOff) { - switch(pad) { - case AF_PAD_ZERO: - wind_vals.push_back(0); - break; - case AF_PAD_SYM: - wind_vals.push_back(in_ptr[im_coff+im_roff]); - break; - } - } else - wind_vals.push_back(in_ptr[im_coff+im_roff]); - } - } - - std::stable_sort(wind_vals.begin(),wind_vals.end()); - int off = wind_vals.size()/2; - if (wind_vals.size()%2==0) - out_ptr[ocol_off+row*ostrides[0]] = (wind_vals[off]+wind_vals[off-1])/2; - else { - out_ptr[ocol_off+row*ostrides[0]] = wind_vals[off]; - } - } - } - in_ptr += istrides[2]; - out_ptr += ostrides[2]; - } - } - }; - getQueue().enqueue(func, out, in, w_len, w_wid); + getQueue().enqueue(kernel::medfilt, out, in, w_len, w_wid); return out; } diff --git a/src/backend/cpu/morph.cpp b/src/backend/cpu/morph.cpp index 945c32b310..462319d0af 100644 --- a/src/backend/cpu/morph.cpp +++ b/src/backend/cpu/morph.cpp @@ -15,21 +15,13 @@ #include #include #include +#include using af::dim4; namespace cpu { -static inline unsigned getIdx(const dim4 &strides, - int i, int j = 0, int k = 0, int l = 0) -{ - return (l * strides[3] + - k * strides[2] + - j * strides[1] + - i * strides[0]); -} - template Array morph(const Array &in, const Array &mask) { @@ -38,60 +30,7 @@ Array morph(const Array &in, const Array &mask) Array out = createEmptyArray(in.dims()); - auto func = [=] (Array out, const 
Array in, const Array mask) { - const dim4 ostrides = out.strides(); - const dim4 istrides = in.strides(); - const dim4 fstrides = mask.strides(); - const dim4 dims = in.dims(); - const dim4 window = mask.dims(); - T* outData = out.get(); - const T* inData = in.get(); - const T* filter = mask.get(); - const dim_t R0 = window[0]/2; - const dim_t R1 = window[1]/2; - - for(dim_t b3=0; b3 (T)0) && offi>=0 && offj>=0 && offi, out, in, mask); return out; } @@ -104,66 +43,7 @@ Array morph3d(const Array &in, const Array &mask) Array out = createEmptyArray(in.dims()); - auto func = [=] (Array out, const Array in, const Array mask) { - const dim4 dims = in.dims(); - const dim4 window = mask.dims(); - const dim_t R0 = window[0]/2; - const dim_t R1 = window[1]/2; - const dim_t R2 = window[2]/2; - const dim4 istrides = in.strides(); - const dim4 fstrides = mask.strides(); - const dim_t bCount = dims[3]; - const dim4 ostrides = out.strides(); - T* outData = out.get(); - const T* inData = in.get(); - const T* filter = mask.get(); - - for(dim_t batchId=0; batchId (T)0) && offi>=0 && offj>=0 && offk>=0 && - offi, out, in, mask); return out; } diff --git a/src/backend/cpu/utility.hpp b/src/backend/cpu/utility.hpp index ed8bbd79f7..68cef5a440 100644 --- a/src/backend/cpu/utility.hpp +++ b/src/backend/cpu/utility.hpp @@ -31,9 +31,9 @@ dim_t trimIndex(int const & idx, dim_t const & len) } static inline -dim_t clamp(int a, dim_t mn, dim_t mx) +dim_t clamp(dim_t a, dim_t mn, dim_t mx) { - return (a < (int)mn ? mn : (a > (int)mx ? mx : a)); + return (amx ? 
mx : a)); } static inline From 7d7f32ffd165f952e85cfe8d711ba147afbbe65d Mon Sep 17 00:00:00 2001 From: pradeep Date: Sat, 19 Dec 2015 16:04:37 -0500 Subject: [PATCH 087/288] moved the left over fns to cpu kernel namespace --- src/backend/cpu/kernel/nearest_neighbour.hpp | 143 +++++ src/backend/cpu/kernel/orb.hpp | 509 +++++++++++++++++ src/backend/cpu/kernel/random.hpp | 143 +++++ src/backend/cpu/kernel/range.hpp | 52 ++ src/backend/cpu/kernel/reduce.hpp | 71 +++ src/backend/cpu/kernel/regions.hpp | 194 +++++++ src/backend/cpu/kernel/reorder.hpp | 55 ++ src/backend/cpu/kernel/resize.hpp | 177 ++++++ src/backend/cpu/kernel/rotate.hpp | 83 +++ src/backend/cpu/kernel/scan.hpp | 72 +++ src/backend/cpu/kernel/select.hpp | 124 +++++ src/backend/cpu/kernel/shift.hpp | 69 +++ src/backend/cpu/{ => kernel}/sift_nonfree.hpp | 0 src/backend/cpu/kernel/sobel.hpp | 86 +++ src/backend/cpu/kernel/sort.hpp | 51 ++ src/backend/cpu/kernel/sort_by_key.hpp | 85 +++ src/backend/cpu/kernel/sort_index.hpp | 70 +++ src/backend/cpu/kernel/susan.hpp | 99 ++++ src/backend/cpu/kernel/tile.hpp | 55 ++ src/backend/cpu/kernel/transform.hpp | 105 ++++ src/backend/cpu/kernel/transpose.hpp | 122 ++++ src/backend/cpu/kernel/triangle.hpp | 61 ++ src/backend/cpu/kernel/unwrap.hpp | 81 +++ src/backend/cpu/kernel/wrap.hpp | 80 +++ src/backend/cpu/nearest_neighbour.cpp | 131 +---- src/backend/cpu/orb.cpp | 520 +----------------- src/backend/cpu/random.cpp | 176 +----- src/backend/cpu/range.cpp | 46 +- src/backend/cpu/reduce.cpp | 60 +- src/backend/cpu/regions.cpp | 175 +----- src/backend/cpu/reorder.cpp | 39 +- src/backend/cpu/resize.cpp | 166 +----- src/backend/cpu/rotate.cpp | 71 +-- src/backend/cpu/scan.cpp | 61 +- src/backend/cpu/select.cpp | 103 +--- src/backend/cpu/shift.cpp | 52 +- src/backend/cpu/sift.cpp | 2 +- src/backend/cpu/sobel.cpp | 71 +-- src/backend/cpu/sort.cpp | 44 +- src/backend/cpu/sort_by_key.cpp | 83 +-- src/backend/cpu/sort_index.cpp | 61 +- src/backend/cpu/susan.cpp | 84 +-- 
src/backend/cpu/tile.cpp | 38 +- src/backend/cpu/transform.cpp | 93 +--- src/backend/cpu/transform_interp.hpp | 2 + src/backend/cpu/transpose.cpp | 115 +--- src/backend/cpu/triangle.cpp | 42 +- src/backend/cpu/unwrap.cpp | 67 +-- src/backend/cpu/wrap.cpp | 66 +-- 49 files changed, 2691 insertions(+), 2264 deletions(-) create mode 100644 src/backend/cpu/kernel/nearest_neighbour.hpp create mode 100644 src/backend/cpu/kernel/orb.hpp create mode 100644 src/backend/cpu/kernel/random.hpp create mode 100644 src/backend/cpu/kernel/range.hpp create mode 100644 src/backend/cpu/kernel/reduce.hpp create mode 100644 src/backend/cpu/kernel/regions.hpp create mode 100644 src/backend/cpu/kernel/reorder.hpp create mode 100644 src/backend/cpu/kernel/resize.hpp create mode 100644 src/backend/cpu/kernel/rotate.hpp create mode 100644 src/backend/cpu/kernel/scan.hpp create mode 100644 src/backend/cpu/kernel/select.hpp create mode 100644 src/backend/cpu/kernel/shift.hpp rename src/backend/cpu/{ => kernel}/sift_nonfree.hpp (100%) create mode 100644 src/backend/cpu/kernel/sobel.hpp create mode 100644 src/backend/cpu/kernel/sort.hpp create mode 100644 src/backend/cpu/kernel/sort_by_key.hpp create mode 100644 src/backend/cpu/kernel/sort_index.hpp create mode 100644 src/backend/cpu/kernel/susan.hpp create mode 100644 src/backend/cpu/kernel/tile.hpp create mode 100644 src/backend/cpu/kernel/transform.hpp create mode 100644 src/backend/cpu/kernel/transpose.hpp create mode 100644 src/backend/cpu/kernel/triangle.hpp create mode 100644 src/backend/cpu/kernel/unwrap.hpp create mode 100644 src/backend/cpu/kernel/wrap.hpp diff --git a/src/backend/cpu/kernel/nearest_neighbour.hpp b/src/backend/cpu/kernel/nearest_neighbour.hpp new file mode 100644 index 0000000000..4916463aed --- /dev/null +++ b/src/backend/cpu/kernel/nearest_neighbour.hpp @@ -0,0 +1,143 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. 
+ * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +#if defined(_WIN32) || defined(_MSC_VER) + +#include +#define __builtin_popcount __popcnt + +#endif + +template +struct dist_op +{ + To operator()(T v1, T v2) + { + return v1 - v2; // Garbage distance + } +}; + +template +struct dist_op +{ + To operator()(T v1, T v2) + { + return std::abs((double)v1 - (double)v2); + } +}; + +template +struct dist_op +{ + To operator()(T v1, T v2) + { + return (v1 - v2) * (v1 - v2); + } +}; + +template +struct dist_op +{ + To operator()(uint v1, uint v2) + { + return __builtin_popcount(v1 ^ v2); + } +}; + +template +struct dist_op +{ + To operator()(uintl v1, uintl v2) + { + return __builtin_popcount(v1 ^ v2); + } +}; + +template +struct dist_op +{ + To operator()(uchar v1, uchar v2) + { + return __builtin_popcount(v1 ^ v2); + } +}; + +template +struct dist_op +{ + To operator()(ushort v1, ushort v2) + { + return __builtin_popcount(v1 ^ v2); + } +}; + +template +void nearest_neighbour(Array idx, Array dist, + const Array query, const Array train, + const uint dist_dim, const uint n_dist) +{ + uint sample_dim = (dist_dim == 0) ? 
1 : 0; + const dim4 qDims = query.dims(); + const dim4 tDims = train.dims(); + + const unsigned distLength = qDims[dist_dim]; + const unsigned nQuery = qDims[sample_dim]; + const unsigned nTrain = tDims[sample_dim]; + + const T* qPtr = query.get(); + const T* tPtr = train.get(); + uint* iPtr = idx.get(); + To* dPtr = dist.get(); + + dist_op op; + + for (unsigned i = 0; i < nQuery; i++) { + To best_dist = limit_max(); + unsigned best_idx = 0; + + for (unsigned j = 0; j < nTrain; j++) { + To local_dist = 0; + for (unsigned k = 0; k < distLength; k++) { + size_t qIdx, tIdx; + if (sample_dim == 0) { + qIdx = k * qDims[0] + i; + tIdx = k * tDims[0] + j; + } + else { + qIdx = i * qDims[0] + k; + tIdx = j * tDims[0] + k; + } + + local_dist += op(qPtr[qIdx], tPtr[tIdx]); + } + + if (local_dist < best_dist) { + best_dist = local_dist; + best_idx = j; + } + } + + size_t oIdx; + oIdx = i; + iPtr[oIdx] = best_idx; + dPtr[oIdx] = best_dist; + } +} + +} +} diff --git a/src/backend/cpu/kernel/orb.hpp b/src/backend/cpu/kernel/orb.hpp new file mode 100644 index 0000000000..acd508cb70 --- /dev/null +++ b/src/backend/cpu/kernel/orb.hpp @@ -0,0 +1,509 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +// Reference pattern, generated for a patch size of 31x31, as suggested by +// original ORB paper +#define REF_PAT_SIZE 31 +#define REF_PAT_SAMPLES 256 +#define REF_PAT_COORDS 4 +#define REF_PAT_LENGTH (REF_PAT_SAMPLES*REF_PAT_COORDS) + +// Current reference pattern was borrowed from OpenCV, to build a pattern with +// similar quality, a training process must be applied, as described in +// sections 4.2 and 4.3 of the original ORB paper. +const int ref_pat[REF_PAT_LENGTH] = { + 8,-3, 9,5, + 4,2, 7,-12, + -11,9, -8,2, + 7,-12, 12,-13, + 2,-13, 2,12, + 1,-7, 1,6, + -2,-10, -2,-4, + -13,-13, -11,-8, + -13,-3, -12,-9, + 10,4, 11,9, + -13,-8, -8,-9, + -11,7, -9,12, + 7,7, 12,6, + -4,-5, -3,0, + -13,2, -12,-3, + -9,0, -7,5, + 12,-6, 12,-1, + -3,6, -2,12, + -6,-13, -4,-8, + 11,-13, 12,-8, + 4,7, 5,1, + 5,-3, 10,-3, + 3,-7, 6,12, + -8,-7, -6,-2, + -2,11, -1,-10, + -13,12, -8,10, + -7,3, -5,-3, + -4,2, -3,7, + -10,-12, -6,11, + 5,-12, 6,-7, + 5,-6, 7,-1, + 1,0, 4,-5, + 9,11, 11,-13, + 4,7, 4,12, + 2,-1, 4,4, + -4,-12, -2,7, + -8,-5, -7,-10, + 4,11, 9,12, + 0,-8, 1,-13, + -13,-2, -8,2, + -3,-2, -2,3, + -6,9, -4,-9, + 8,12, 10,7, + 0,9, 1,3, + 7,-5, 11,-10, + -13,-6, -11,0, + 10,7, 12,1, + -6,-3, -6,12, + 10,-9, 12,-4, + -13,8, -8,-12, + -13,0, -8,-4, + 3,3, 7,8, + 5,7, 10,-7, + -1,7, 1,-12, + 3,-10, 5,6, + 2,-4, 3,-10, + -13,0, -13,5, + -13,-7, -12,12, + -13,3, -11,8, + -7,12, -4,7, + 6,-10, 12,8, + -9,-1, -7,-6, + -2,-5, 0,12, + -12,5, -7,5, + 3,-10, 8,-13, + -7,-7, -4,5, + -3,-2, -1,-7, + 2,9, 5,-11, + -11,-13, -5,-13, + -1,6, 0,-1, + 5,-3, 5,2, + -4,-13, -4,12, + -9,-6, -9,6, + -12,-10, -8,-4, + 10,2, 12,-3, + 7,12, 12,12, + -7,-13, -6,5, + -4,9, -3,4, + 7,-1, 12,2, + -7,6, -5,1, + -13,11, -12,5, + -3,7, -2,-6, + 7,-8, 
12,-7, + -13,-7, -11,-12, + 1,-3, 12,12, + 2,-6, 3,0, + -4,3, -2,-13, + -1,-13, 1,9, + 7,1, 8,-6, + 1,-1, 3,12, + 9,1, 12,6, + -1,-9, -1,3, + -13,-13, -10,5, + 7,7, 10,12, + 12,-5, 12,9, + 6,3, 7,11, + 5,-13, 6,10, + 2,-12, 2,3, + 3,8, 4,-6, + 2,6, 12,-13, + 9,-12, 10,3, + -8,4, -7,9, + -11,12, -4,-6, + 1,12, 2,-8, + 6,-9, 7,-4, + 2,3, 3,-2, + 6,3, 11,0, + 3,-3, 8,-8, + 7,8, 9,3, + -11,-5, -6,-4, + -10,11, -5,10, + -5,-8, -3,12, + -10,5, -9,0, + 8,-1, 12,-6, + 4,-6, 6,-11, + -10,12, -8,7, + 4,-2, 6,7, + -2,0, -2,12, + -5,-8, -5,2, + 7,-6, 10,12, + -9,-13, -8,-8, + -5,-13, -5,-2, + 8,-8, 9,-13, + -9,-11, -9,0, + 1,-8, 1,-2, + 7,-4, 9,1, + -2,1, -1,-4, + 11,-6, 12,-11, + -12,-9, -6,4, + 3,7, 7,12, + 5,5, 10,8, + 0,-4, 2,8, + -9,12, -5,-13, + 0,7, 2,12, + -1,2, 1,7, + 5,11, 7,-9, + 3,5, 6,-8, + -13,-4, -8,9, + -5,9, -3,-3, + -4,-7, -3,-12, + 6,5, 8,0, + -7,6, -6,12, + -13,6, -5,-2, + 1,-10, 3,10, + 4,1, 8,-4, + -2,-2, 2,-13, + 2,-12, 12,12, + -2,-13, 0,-6, + 4,1, 9,3, + -6,-10, -3,-5, + -3,-13, -1,1, + 7,5, 12,-11, + 4,-2, 5,-7, + -13,9, -9,-5, + 7,1, 8,6, + 7,-8, 7,6, + -7,-4, -7,1, + -8,11, -7,-8, + -13,6, -12,-8, + 2,4, 3,9, + 10,-5, 12,3, + -6,-5, -6,7, + 8,-3, 9,-8, + 2,-12, 2,8, + -11,-2, -10,3, + -12,-13, -7,-9, + -11,0, -10,-5, + 5,-3, 11,8, + -2,-13, -1,12, + -1,-8, 0,9, + -13,-11, -12,-5, + -10,-2, -10,11, + -3,9, -2,-13, + 2,-3, 3,2, + -9,-13, -4,0, + -4,6, -3,-10, + -4,12, -2,-7, + -6,-11, -4,9, + 6,-3, 6,11, + -13,11, -5,5, + 11,11, 12,6, + 7,-5, 12,-2, + -1,12, 0,7, + -4,-8, -3,-2, + -7,1, -6,7, + -13,-12, -8,-13, + -7,-2, -6,-8, + -8,5, -6,-9, + -5,-1, -4,5, + -13,7, -8,10, + 1,5, 5,-13, + 1,0, 10,-13, + 9,12, 10,-1, + 5,-8, 10,-9, + -1,11, 1,-13, + -9,-3, -6,2, + -1,-10, 1,12, + -13,1, -8,-10, + 8,-11, 10,-6, + 2,-13, 3,-6, + 7,-13, 12,-9, + -10,-10, -5,-7, + -10,-8, -8,-13, + 4,-6, 8,5, + 3,12, 8,-13, + -4,2, -3,-3, + 5,-13, 10,-12, + 4,-13, 5,-1, + -9,9, -4,3, + 0,3, 3,-9, + -12,1, -6,1, + 3,2, 4,-8, + -10,-10, -10,9, + 8,-13, 12,12, + -8,-12, -6,-5, 
+ 2,2, 3,7, + 10,6, 11,-8, + 6,8, 8,-12, + -7,10, -6,5, + -3,-9, -3,9, + -1,-13, -1,5, + -3,-7, -3,4, + -8,-2, -8,3, + 4,2, 12,12, + 2,-5, 3,11, + 6,-9, 11,-13, + 3,-1, 7,12, + 11,-1, 12,4, + -3,0, -3,6, + 4,-11, 4,12, + 2,-4, 2,1, + -10,-6, -8,1, + -13,7, -11,1, + -13,12, -11,-13, + 6,0, 11,-13, + 0,-1, 1,4, + -13,3, -9,-2, + -9,8, -6,-3, + -13,-6, -8,-2, + 5,-9, 8,10, + 2,7, 3,-9, + -1,-6, -1,-1, + 9,5, 11,-2, + 11,-3, 12,-8, + 3,0, 3,5, + -1,4, 0,10, + 3,-6, 4,5, + -13,0, -10,5, + 5,8, 12,11, + 8,9, 9,-6, + 7,-4, 8,-12, + -10,4, -10,9, + 7,3, 12,4, + 9,-7, 10,-2, + 7,0, 12,-2, + -1,-6, 0,-11, +}; + +template +void keep_features( + float* x_out, + float* y_out, + float* score_out, + float* size_out, + const float* x_in, + const float* y_in, + const float* score_in, + const unsigned* score_idx, + const float* size_in, + const unsigned n_feat) +{ + // Keep only the first n_feat features + for (unsigned f = 0; f < n_feat; f++) { + x_out[f] = x_in[score_idx[f]]; + y_out[f] = y_in[score_idx[f]]; + score_out[f] = score_in[f]; + if (size_in != nullptr && size_out != nullptr) + size_out[f] = size_in[score_idx[f]]; + } +} + +template +void harris_response( + float* x_out, + float* y_out, + float* score_out, + float* size_out, + const float* x_in, + const float* y_in, + const float* scl_in, + const unsigned total_feat, + unsigned* usable_feat, + const Array& image, + const unsigned block_size, + const float k_thr, + const unsigned patch_size) +{ + const af::dim4 idims = image.dims(); + const T* image_ptr = image.get(); + for (unsigned f = 0; f < total_feat; f++) { + unsigned x, y; + float scl = 1.f; + if (use_scl) { + // Update x and y coordinates according to scale + scl = scl_in[f]; + x = (unsigned)round(x_in[f] * scl); + y = (unsigned)round(y_in[f] * scl); + } + else { + x = (unsigned)round(x_in[f]); + y = (unsigned)round(y_in[f]); + } + + // Round feature size to nearest odd integer + float size = 2.f * floor((patch_size * scl) / 2.f) + 1.f; + + // Avoid keeping 
features that might be too wide and might not fit on + // the image, sqrt(2.f) is the radius when angle is 45 degrees and + // represents widest case possible + unsigned patch_r = ceil(size * sqrt(2.f) / 2.f); + if (x < patch_r || y < patch_r || x >= idims[1] - patch_r || y >= idims[0] - patch_r) + continue; + + unsigned r = block_size / 2; + + float ixx = 0.f, iyy = 0.f, ixy = 0.f; + unsigned block_size_sq = block_size * block_size; + for (unsigned k = 0; k < block_size_sq; k++) { + int i = k / block_size - r; + int j = k % block_size - r; + + // Calculate local x and y derivatives + float ix = image_ptr[(x+i+1) * idims[0] + y+j] - image_ptr[(x+i-1) * idims[0] + y+j]; + float iy = image_ptr[(x+i) * idims[0] + y+j+1] - image_ptr[(x+i) * idims[0] + y+j-1]; + + // Accumulate second order derivatives + ixx += ix*ix; + iyy += iy*iy; + ixy += ix*iy; + } + + unsigned idx = *usable_feat; + *usable_feat += 1; + float tr = ixx + iyy; + float det = ixx*iyy - ixy*ixy; + + // Calculate Harris responses + float resp = det - k_thr * (tr*tr); + + // Scale factor + // TODO: improve response scaling + float rscale = 0.001f; + rscale = rscale * rscale * rscale * rscale; + + x_out[idx] = x; + y_out[idx] = y; + score_out[idx] = resp * rscale; + if (use_scl) + size_out[idx] = size; + } +} + +template +void centroid_angle( + const float* x_in, + const float* y_in, + float* orientation_out, + const unsigned total_feat, + const Array& image, + const unsigned patch_size) +{ + const af::dim4 idims = image.dims(); + const T* image_ptr = image.get(); + for (unsigned f = 0; f < total_feat; f++) { + unsigned x = (unsigned)round(x_in[f]); + unsigned y = (unsigned)round(y_in[f]); + + unsigned r = patch_size / 2; + if (x < r || y < r || x > idims[1] - r || y > idims[0] - r) + continue; + + T m01 = (T)0, m10 = (T)0; + unsigned patch_size_sq = patch_size * patch_size; + for (unsigned k = 0; k < patch_size_sq; k++) { + int i = k / patch_size - r; + int j = k % patch_size - r; + + // Calculate first 
order moments + T p = image_ptr[(x+i) * idims[0] + y+j]; + m01 += j * p; + m10 += i * p; + } + + float angle = atan2(m01, m10); + orientation_out[f] = angle; + } +} + +template +inline T get_pixel( + unsigned x, + unsigned y, + const float ori, + const unsigned size, + const int dist_x, + const int dist_y, + const Array& image, + const unsigned patch_size) +{ + const af::dim4 idims = image.dims(); + const T* image_ptr = image.get(); + float ori_sin = sin(ori); + float ori_cos = cos(ori); + float patch_scl = (float)size / (float)patch_size; + + // Calculate point coordinates based on orientation and size + x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin); + y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos); + + return image_ptr[x * idims[0] + y]; +} + +template +void extract_orb( + unsigned* desc_out, + const unsigned n_feat, + float* x_in_out, + float* y_in_out, + const float* ori_in, + float* size_out, + const Array& image, + const float scl, + const unsigned patch_size) +{ + const af::dim4 idims = image.dims(); + for (unsigned f = 0; f < n_feat; f++) { + unsigned x = (unsigned)round(x_in_out[f]); + unsigned y = (unsigned)round(y_in_out[f]); + float ori = ori_in[f]; + unsigned size = patch_size; + + unsigned r = ceil(patch_size * sqrt(2.f) / 2.f); + if (x < r || y < r || x >= idims[1] - r || y >= idims[0] - r) + continue; + + // Descriptor fixed at 256 bits for now + // Storing descriptor as a vector of 8 x 32-bit unsigned numbers + for (unsigned i = 0; i < 8; i++) { + unsigned v = 0; + + // j < 32 for 256 bits descriptor + for (unsigned j = 0; j < 32; j++) { + // Get position from distribution pattern and values of points p1 and p2 + int dist_x = ref_pat[i*32*4 + j*4]; + int dist_y = ref_pat[i*32*4 + j*4+1]; + T p1 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size); + + dist_x = ref_pat[i*32*4 + j*4+2]; + dist_y = ref_pat[i*32*4 + j*4+3]; + T p2 = get_pixel(x, y, ori, size, dist_x, dist_y, image, 
patch_size); + + // Calculate bit based on p1 and p2 and shifts it to correct position + v |= (p1 < p2) << j; + } + + // Store 32 bits of descriptor + desc_out[f * 8 + i] += v; + } + + x_in_out[f] = round(x * scl); + y_in_out[f] = round(y * scl); + size_out[f] = patch_size * scl; + } +} + + + +} +} diff --git a/src/backend/cpu/kernel/random.hpp b/src/backend/cpu/kernel/random.hpp new file mode 100644 index 0000000000..357cbd210d --- /dev/null +++ b/src/backend/cpu/kernel/random.hpp @@ -0,0 +1,143 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +using namespace std; + +template +using is_arithmetic_t = typename enable_if< is_arithmetic::value, function>::type; +template +using is_complex_t = typename enable_if< is_complex::value, function>::type; +template +using is_floating_point_t = typename enable_if< is_floating_point::value, function>::type; + +template +is_arithmetic_t +urand(GenType &generator) +{ + typedef typename conditional< is_floating_point::value, + uniform_real_distribution, +#if OS_WIN + uniform_int_distribution>::type dist; +#else + uniform_int_distribution> ::type dist; +#endif + return bind(dist(), generator); +} + +template +is_complex_t +urand(GenType &generator) +{ + auto func = urand(generator); + return [func] () { return T(func(), func());}; +} + +template +is_floating_point_t +nrand(GenType &generator) +{ + return bind(normal_distribution(), generator); +} + +template +is_complex_t +nrand(GenType &generator) +{ + auto func = nrand(generator); + return [func] () { return T(func(), func());}; +} + +static 
default_random_engine generator; +static unsigned long long gen_seed = 0; +static bool is_first = true; +#define GLOBAL 1 + +template +void randn(Array out) +{ + static unsigned long long my_seed = 0; + if (is_first) { + setSeed(gen_seed); + my_seed = gen_seed; + } + + static auto gen = nrand(generator); + + if (my_seed != gen_seed) { + gen = nrand(generator); + my_seed = gen_seed; + } + + T *outPtr = out.get(); + for (int i = 0; i < (int)out.elements(); i++) { + outPtr[i] = gen(); + } +} + +template +void randu(Array out) +{ + static unsigned long long my_seed = 0; + if (is_first) { + setSeed(gen_seed); + my_seed = gen_seed; + } + + static auto gen = urand(generator); + + if (my_seed != gen_seed) { + gen = urand(generator); + my_seed = gen_seed; + } + + T *outPtr = out.get(); + for (int i = 0; i < (int)out.elements(); i++) { + outPtr[i] = gen(); + } +} + +template<> +void randu(Array out) +{ + static unsigned long long my_seed = 0; + if (is_first) { + setSeed(gen_seed); + my_seed = gen_seed; + } + + static auto gen = urand(generator); + + if (my_seed != gen_seed) { + gen = urand(generator); + my_seed = gen_seed; + } + + char *outPtr = out.get(); + for (int i = 0; i < (int)out.elements(); i++) { + outPtr[i] = gen() > 0.5; + } +} + +} +} diff --git a/src/backend/cpu/kernel/range.hpp b/src/backend/cpu/kernel/range.hpp new file mode 100644 index 0000000000..b244a19c85 --- /dev/null +++ b/src/backend/cpu/kernel/range.hpp @@ -0,0 +1,52 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void range(Array output) +{ + T* out = output.get(); + + const dim4 dims = output.dims(); + const dim4 strides = output.strides(); + + for(dim_t w = 0; w < dims[3]; w++) { + dim_t offW = w * strides[3]; + for(dim_t z = 0; z < dims[2]; z++) { + dim_t offWZ = offW + z * strides[2]; + for(dim_t y = 0; y < dims[1]; y++) { + dim_t offWZY = offWZ + y * strides[1]; + for(dim_t x = 0; x < dims[0]; x++) { + dim_t id = offWZY + x; + if(dim == 0) { + out[id] = x; + } else if(dim == 1) { + out[id] = y; + } else if(dim == 2) { + out[id] = z; + } else if(dim == 3) { + out[id] = w; + } + } + } + } + } +} + +} +} + diff --git a/src/backend/cpu/kernel/reduce.hpp b/src/backend/cpu/kernel/reduce.hpp new file mode 100644 index 0000000000..85119dcee7 --- /dev/null +++ b/src/backend/cpu/kernel/reduce.hpp @@ -0,0 +1,71 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +struct reduce_dim +{ + void operator()(Array out, const dim_t outOffset, + const Array in, const dim_t inOffset, + const int dim, bool change_nan, double nanval) + { + static const int D1 = D - 1; + static reduce_dim reduce_dim_next; + + const af::dim4 ostrides = out.strides(); + const af::dim4 istrides = in.strides(); + const af::dim4 odims = out.dims(); + + for (dim_t i = 0; i < odims[D1]; i++) { + reduce_dim_next(out, outOffset + i * ostrides[D1], + in, inOffset + i * istrides[D1], + dim, change_nan, nanval); + } + } +}; + +template +struct reduce_dim +{ + + Transform transform; + Binary reduce; + void operator()(Array out, const dim_t outOffset, + const Array in, const dim_t inOffset, + const int dim, bool change_nan, double nanval) + { + const af::dim4 istrides = in.strides(); + const af::dim4 idims = in.dims(); + + To * const outPtr = out.get() + outOffset; + Ti const * const inPtr = in.get() + inOffset; + dim_t stride = istrides[dim]; + + To out_val = reduce.init(); + for (dim_t i = 0; i < idims[dim]; i++) { + To in_val = transform(inPtr[i * stride]); + if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val; + out_val = reduce(in_val, out_val); + } + + *outPtr = out_val; + } +}; + + +} +} diff --git a/src/backend/cpu/kernel/regions.hpp b/src/backend/cpu/kernel/regions.hpp new file mode 100644 index 0000000000..863ebc5f48 --- /dev/null +++ b/src/backend/cpu/kernel/regions.hpp @@ -0,0 +1,194 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +class LabelNode +{ +private: + T label; + T minLabel; + unsigned rank; + LabelNode* parent; + +public: + LabelNode() : label(0), minLabel(0), rank(0), parent(this) { } + LabelNode(T label) : label(label), minLabel(label), rank(0), parent(this) { } + + T getLabel() + { + return label; + } + + T getMinLabel() + { + return minLabel; + } + + LabelNode* getParent() + { + return parent; + } + + unsigned getRank() + { + return rank; + } + + void setMinLabel(T l) + { + minLabel = l; + } + + void setParent(LabelNode* p) + { + parent = p; + } + + void setRank(unsigned r) + { + rank = r; + } +}; + +template +static LabelNode* find(LabelNode* x) +{ + if (x->getParent() != x) + x->setParent(find(x->getParent())); + return x->getParent(); +} + +template +static void setUnion(LabelNode* x, LabelNode* y) +{ + LabelNode* xRoot = find(x); + LabelNode* yRoot = find(y); + if (xRoot == yRoot) + return; + + T xMinLabel = xRoot->getMinLabel(); + T yMinLabel = yRoot->getMinLabel(); + xRoot->setMinLabel(min(xMinLabel, yMinLabel)); + yRoot->setMinLabel(min(xMinLabel, yMinLabel)); + + if (xRoot->getRank() < yRoot->getRank()) + xRoot->setParent(yRoot); + else if (xRoot->getRank() > yRoot->getRank()) + yRoot->setParent(xRoot); + else { + yRoot->setParent(xRoot); + xRoot->setRank(xRoot->getRank() + 1); + } +} + +template +void regions(Array out, const Array in, af_connectivity connectivity) +{ + const af::dim4 in_dims = in.dims(); + const char *in_ptr = in.get(); + T *out_ptr = out.get(); + + // Map labels + typedef typename std::map* > label_map_t; + typedef typename label_map_t::iterator label_map_iterator_t; + + label_map_t lmap; + + // Initial label + T label = (T)1; + + for (int j = 0; j < (int)in_dims[1]; j++) { + for (int i = 0; i < 
(int)in_dims[0]; i++) { + int idx = j * in_dims[0] + i; + if (in_ptr[idx] != 0) { + std::vector l; + + // Test neighbors + if (i > 0 && out_ptr[j * (int)in_dims[0] + i-1] > 0) + l.push_back(out_ptr[j * in_dims[0] + i-1]); + if (j > 0 && out_ptr[(j-1) * (int)in_dims[0] + i] > 0) + l.push_back(out_ptr[(j-1) * in_dims[0] + i]); + if (connectivity == AF_CONNECTIVITY_8 && i > 0 && + j > 0 && out_ptr[(j-1) * in_dims[0] + i-1] > 0) + l.push_back(out_ptr[(j-1) * in_dims[0] + i-1]); + if (connectivity == AF_CONNECTIVITY_8 && + i < (int)in_dims[0] - 1 && j > 0 && out_ptr[(j-1) * in_dims[0] + i+1] != 0) + l.push_back(out_ptr[(j-1) * in_dims[0] + i+1]); + + if (!l.empty()) { + T minl = l[0]; + for (size_t k = 0; k < l.size(); k++) { + minl = min(l[k], minl); + label_map_iterator_t cur_map = lmap.find(l[k]); + LabelNode *node = cur_map->second; + // Group labels of the same region under a disjoint set + for (size_t m = k+1; m < l.size(); m++) + setUnion(node, lmap.find(l[m])->second); + } + // Set label to smallest neighbor label + out_ptr[idx] = minl; + } + else { + // Insert new label in map + LabelNode *node = new LabelNode(label); + lmap.insert(std::pair* >(label, node)); + out_ptr[idx] = label++; + } + } + } + } + + std::set removed; + + for (int j = 0; j < (int)in_dims[1]; j++) { + for (int i = 0; i < (int)in_dims[0]; i++) { + int idx = j * (int)in_dims[0] + i; + if (in_ptr[idx] != 0) { + T l = out_ptr[idx]; + label_map_iterator_t cur_map = lmap.find(l); + + if (cur_map != lmap.end()) { + LabelNode* node = cur_map->second; + + LabelNode* node_root = find(node); + out_ptr[idx] = node_root->getMinLabel(); + + // Mark removed labels (those that are part of a region + // that contains a smaller label) + if (node->getMinLabel() < l || node_root->getMinLabel() < l) + removed.insert(l); + if (node->getLabel() > node->getMinLabel()) + removed.insert(node->getLabel()); + } + } + } + } + + // Calculate final neighbors (ensure final labels are sequential) + for (int j = 0; j < 
(int)in_dims[1]; j++) { + for (int i = 0; i < (int)in_dims[0]; i++) { + int idx = j * (int)in_dims[0] + i; + if (out_ptr[idx] > 0) { + out_ptr[idx] -= distance(removed.begin(), removed.lower_bound(out_ptr[idx])); + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/reorder.hpp b/src/backend/cpu/kernel/reorder.hpp new file mode 100644 index 0000000000..c10c96ef36 --- /dev/null +++ b/src/backend/cpu/kernel/reorder.hpp @@ -0,0 +1,55 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void reorder(Array out, const Array in, const af::dim4 oDims, const af::dim4 rdims) +{ + T* outPtr = out.get(); + const T* inPtr = in.get(); + + const af::dim4 ist = in.strides(); + const af::dim4 ost = out.strides(); + + + dim_t ids[4] = {0}; + for(dim_t ow = 0; ow < oDims[3]; ow++) { + const dim_t oW = ow * ost[3]; + ids[rdims[3]] = ow; + for(dim_t oz = 0; oz < oDims[2]; oz++) { + const dim_t oZW = oW + oz * ost[2]; + ids[rdims[2]] = oz; + for(dim_t oy = 0; oy < oDims[1]; oy++) { + const dim_t oYZW = oZW + oy * ost[1]; + ids[rdims[1]] = oy; + for(dim_t ox = 0; ox < oDims[0]; ox++) { + const dim_t oIdx = oYZW + ox; + + ids[rdims[0]] = ox; + const dim_t iIdx = ids[3] * ist[3] + ids[2] * ist[2] + + ids[1] * ist[1] + ids[0]; + + outPtr[oIdx] = inPtr[iIdx]; + } + } + } + } +} + +} +} + diff --git a/src/backend/cpu/kernel/resize.hpp b/src/backend/cpu/kernel/resize.hpp new file mode 100644 index 0000000000..19d7ec7cf1 --- /dev/null +++ b/src/backend/cpu/kernel/resize.hpp @@ -0,0 +1,177 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. 
+ * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +/** + * noop function for round to avoid compilation + * issues due to lack of this function in C90 based + * compilers, it is only present in C99 and C++11 + * + * This is not a full fledged implementation, this function + * is to be used only for positive numbers, i m using it here + * for calculating dimensions of arrays + */ +dim_t round2int(float value) +{ + return (dim_t)(value+0.5f); +} + +using std::conditional; +using std::is_same; + +template +using wtype_t = typename conditional::value, double, float>::type; + +template +using vtype_t = typename conditional::value, + T, wtype_t + >::type; + +template +struct resize_op +{ + void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const dim_t x, const dim_t y) + { + return; + } +}; + +template +struct resize_op +{ + void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const dim_t x, const dim_t y) + { + // Compute Indices + dim_t i_x = round2int((float)x / (odims[0] / (float)idims[0])); + dim_t i_y = round2int((float)y / (odims[1] / (float)idims[1])); + + if (i_x >= idims[0]) i_x = idims[0] - 1; + if (i_y >= idims[1]) i_y = idims[1] - 1; + + dim_t i_off = i_y * istrides[1] + i_x; + dim_t o_off = y * ostrides[1] + x; + // Copy values from all channels + for(dim_t w = 0; w < odims[3]; w++) { + dim_t wost = w * ostrides[3]; + dim_t wist = w * istrides[3]; + for(dim_t z = 0; z < odims[2]; z++) { + outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist]; + } + } + } +}; + +template +struct resize_op +{ + void 
operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const dim_t x, const dim_t y) + { + // Compute Indices + float f_x = (float)x / (odims[0] / (float)idims[0]); + float f_y = (float)y / (odims[1] / (float)idims[1]); + + dim_t i1_x = floor(f_x); + dim_t i1_y = floor(f_y); + + if (i1_x >= idims[0]) i1_x = idims[0] - 1; + if (i1_y >= idims[1]) i1_y = idims[1] - 1; + + float b = f_x - i1_x; + float a = f_y - i1_y; + + dim_t i2_x = (i1_x + 1 >= idims[0] ? idims[0] - 1 : i1_x + 1); + dim_t i2_y = (i1_y + 1 >= idims[1] ? idims[1] - 1 : i1_y + 1); + + typedef typename dtype_traits::base_type BT; + typedef wtype_t WT; + typedef vtype_t VT; + + dim_t o_off = y * ostrides[1] + x; + // Copy values from all channels + for(dim_t w = 0; w < odims[3]; w++) { + dim_t wst = w * istrides[3]; + for(dim_t z = 0; z < odims[2]; z++) { + dim_t zst = z * istrides[2]; + dim_t channel_off = zst + wst; + VT p1 = inPtr[i1_y * istrides[1] + i1_x + channel_off]; + VT p2 = inPtr[i2_y * istrides[1] + i1_x + channel_off]; + VT p3 = inPtr[i1_y * istrides[1] + i2_x + channel_off]; + VT p4 = inPtr[i2_y * istrides[1] + i2_x + channel_off]; + + outPtr[o_off + z * ostrides[2] + w * ostrides[3]] = + scalar((1.0f - a) * (1.0f - b)) * p1 + + scalar(( a ) * (1.0f - b)) * p2 + + scalar((1.0f - a) * ( b )) * p3 + + scalar(( a ) * ( b )) * p4; + } + } + } +}; + +template +struct resize_op +{ + void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, + const af::dim4 &ostrides, const af::dim4 &istrides, + const dim_t x, const dim_t y) + { + // Compute Indices + dim_t i_x = floor((float)x / (odims[0] / (float)idims[0])); + dim_t i_y = floor((float)y / (odims[1] / (float)idims[1])); + + if (i_x >= idims[0]) i_x = idims[0] - 1; + if (i_y >= idims[1]) i_y = idims[1] - 1; + + dim_t i_off = i_y * istrides[1] + i_x; + dim_t o_off = y * ostrides[1] + x; + // Copy values from all channels + for(dim_t w 
= 0; w < odims[3]; w++) { + dim_t wost = w * ostrides[3]; + dim_t wist = w * istrides[3]; + for(dim_t z = 0; z < odims[2]; z++) { + outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist]; + } + } + } +}; + +template +void resize(Array out, const Array in) +{ + af::dim4 idims = in.dims(); + af::dim4 odims = out.dims(); + const T *inPtr = in.get(); + T *outPtr = out.get(); + af::dim4 ostrides = out.strides(); + af::dim4 istrides = in.strides(); + + resize_op op; + for(dim_t y = 0; y < odims[1]; y++) { + for(dim_t x = 0; x < odims[0]; x++) { + op(outPtr, inPtr, odims, idims, ostrides, istrides, x, y); + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/rotate.hpp b/src/backend/cpu/kernel/rotate.hpp new file mode 100644 index 0000000000..6e4f75863f --- /dev/null +++ b/src/backend/cpu/kernel/rotate.hpp @@ -0,0 +1,83 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void rotate(Array output, const Array input, const float theta) +{ + const af::dim4 odims = output.dims(); + const af::dim4 idims = input.dims(); + const af::dim4 ostrides = output.strides(); + const af::dim4 istrides = input.strides(); + + const T* in = input.get(); + T* out = output.get(); + dim_t nimages = idims[2]; + + void (*t_fn)(T *, const T *, const float *, const af::dim4 &, + const af::dim4 &, const af::dim4 &, + const dim_t, const dim_t, const dim_t, const dim_t); + + const float c = cos(-theta), s = sin(-theta); + float tx, ty; + { + const float nx = 0.5 * (idims[0] - 1); + const float ny = 0.5 * (idims[1] - 1); + const float mx = 0.5 * (odims[0] - 1); + const float my = 0.5 * (odims[1] - 1); + const float sx = (mx * c + my *-s); + const float sy = (mx * s + my * c); + tx = -(sx - nx); + ty = -(sy - ny); + } + + const float tmat[6] = {std::round( c * 1000) / 1000.0f, + std::round(-s * 1000) / 1000.0f, + std::round(tx * 1000) / 1000.0f, + std::round( s * 1000) / 1000.0f, + std::round( c * 1000) / 1000.0f, + std::round(ty * 1000) / 1000.0f, + }; + + switch(method) { + case AF_INTERP_NEAREST: + t_fn = &transform_n; + break; + case AF_INTERP_BILINEAR: + t_fn = &transform_b; + break; + case AF_INTERP_LOWER: + t_fn = &transform_l; + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; + } + + + // Do transform for image + for(int yy = 0; yy < (int)odims[1]; yy++) { + for(int xx = 0; xx < (int)odims[0]; xx++) { + t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy); + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/scan.hpp b/src/backend/cpu/kernel/scan.hpp new file mode 100644 index 0000000000..0bcfe7df17 --- /dev/null +++ 
b/src/backend/cpu/kernel/scan.hpp @@ -0,0 +1,72 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +struct scan_dim +{ + void operator()(Array out, dim_t outOffset, + const Array in, dim_t inOffset, + const int dim) const + { + const dim4 odims = out.dims(); + const dim4 ostrides = out.strides(); + const dim4 istrides = in.strides(); + + const int D1 = D - 1; + for (dim_t i = 0; i < odims[D1]; i++) { + scan_dim func; + getQueue().enqueue(func, + out, outOffset + i * ostrides[D1], + in, inOffset + i * istrides[D1], dim); + if (D1 == dim) break; + } + } +}; + +template +struct scan_dim +{ + void operator()(Array output, dim_t outOffset, + const Array input, dim_t inOffset, + const int dim) const + { + const Ti* in = input.get() + inOffset; + To* out= output.get()+ outOffset; + + const dim4 ostrides = output.strides(); + const dim4 istrides = input.strides(); + const dim4 idims = input.dims(); + + dim_t istride = istrides[dim]; + dim_t ostride = ostrides[dim]; + + Transform transform; + // FIXME: Change the name to something better + Binary scan; + + To out_val = scan.init(); + for (dim_t i = 0; i < idims[dim]; i++) { + To in_val = transform(in[i * istride]); + out_val = scan(in_val, out_val); + out[i * ostride] = out_val; + } + } +}; + +} +} diff --git a/src/backend/cpu/kernel/select.hpp b/src/backend/cpu/kernel/select.hpp new file mode 100644 index 0000000000..1099c7e437 --- /dev/null +++ b/src/backend/cpu/kernel/select.hpp @@ -0,0 +1,124 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. 
+ * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void select(Array out, const Array cond, const Array a, const Array b) +{ + af::dim4 adims = a.dims(); + af::dim4 astrides = a.strides(); + af::dim4 bdims = b.dims(); + af::dim4 bstrides = b.strides(); + + af::dim4 cdims = cond.dims(); + af::dim4 cstrides = cond.strides(); + + af::dim4 odims = out.dims(); + af::dim4 ostrides = out.strides(); + + bool is_a_same[] = {adims[0] == odims[0], adims[1] == odims[1], + adims[2] == odims[2], adims[3] == odims[3]}; + + bool is_b_same[] = {bdims[0] == odims[0], bdims[1] == odims[1], + bdims[2] == odims[2], bdims[3] == odims[3]}; + + bool is_c_same[] = {cdims[0] == odims[0], cdims[1] == odims[1], + cdims[2] == odims[2], cdims[3] == odims[3]}; + + const T *aptr = a.get(); + const T *bptr = b.get(); + T *optr = out.get(); + const char *cptr = cond.get(); + + for (int l = 0; l < odims[3]; l++) { + + int o_off3 = ostrides[3] * l; + int a_off3 = astrides[3] * is_a_same[3] * l; + int b_off3 = bstrides[3] * is_b_same[3] * l; + int c_off3 = cstrides[3] * is_c_same[3] * l; + + for (int k = 0; k < odims[2]; k++) { + + int o_off2 = ostrides[2] * k + o_off3; + int a_off2 = astrides[2] * is_a_same[2] * k + a_off3; + int b_off2 = bstrides[2] * is_b_same[2] * k + b_off3; + int c_off2 = cstrides[2] * is_c_same[2] * k + c_off3; + + for (int j = 0; j < odims[1]; j++) { + + int o_off1 = ostrides[1] * j + o_off2; + int a_off1 = astrides[1] * is_a_same[1] * j + a_off2; + int b_off1 = bstrides[1] * is_b_same[1] * j + b_off2; + int c_off1 = cstrides[1] * is_c_same[1] * j + c_off2; + + for (int i = 0; i < odims[0]; i++) { + + bool cval = is_c_same[0] ? cptr[c_off1 + i] : cptr[c_off1]; + T aval = is_a_same[0] ? 
aptr[a_off1 + i] : aptr[a_off1]; + T bval = is_b_same[0] ? bptr[b_off1 + i] : bptr[b_off1]; + T oval = cval ? aval : bval; + optr[o_off1 + i] = oval; + } + } + } + } +} + +template +void select_scalar(Array out, const Array cond, const Array a, const double b) +{ + af::dim4 astrides = a.strides(); + af::dim4 cstrides = cond.strides(); + + af::dim4 odims = out.dims(); + af::dim4 ostrides = out.strides(); + + const T *aptr = a.get(); + T *optr = out.get(); + const char *cptr = cond.get(); + + for (int l = 0; l < odims[3]; l++) { + + int o_off3 = ostrides[3] * l; + int a_off3 = astrides[3] * l; + int c_off3 = cstrides[3] * l; + + for (int k = 0; k < odims[2]; k++) { + + int o_off2 = ostrides[2] * k + o_off3; + int a_off2 = astrides[2] * k + a_off3; + int c_off2 = cstrides[2] * k + c_off3; + + for (int j = 0; j < odims[1]; j++) { + + int o_off1 = ostrides[1] * j + o_off2; + int a_off1 = astrides[1] * j + a_off2; + int c_off1 = cstrides[1] * j + c_off2; + + for (int i = 0; i < odims[0]; i++) { + + optr[o_off1 + i] = (flip ^ cptr[c_off1 + i]) ? aptr[a_off1 + i] : b; + } + } + } + } +} + + + +} +} diff --git a/src/backend/cpu/kernel/shift.hpp b/src/backend/cpu/kernel/shift.hpp new file mode 100644 index 0000000000..8beb975486 --- /dev/null +++ b/src/backend/cpu/kernel/shift.hpp @@ -0,0 +1,69 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +static inline dim_t simple_mod(const dim_t i, const dim_t dim) +{ + return (i < dim) ? 
i : (i - dim); +} + +template +void shift(Array out, const Array in, const af::dim4 sdims) +{ + T* outPtr = out.get(); + const T* inPtr = in.get(); + + const af::dim4 oDims = out.dims(); + const af::dim4 ist = in.strides(); + const af::dim4 ost = out.strides(); + + int sdims_[4]; + // Need to do this because we are mapping output to input in the kernel + for(int i = 0; i < 4; i++) { + // sdims_[i] will always be positive and always [0, oDims[i]]. + // Negative shifts are converted to position by going the other way round + sdims_[i] = -(sdims[i] % (int)oDims[i]) + oDims[i] * (sdims[i] > 0); + assert(sdims_[i] >= 0 && sdims_[i] <= oDims[i]); + } + + for(dim_t ow = 0; ow < oDims[3]; ow++) { + const int oW = ow * ost[3]; + const int iw = simple_mod((ow + sdims_[3]), oDims[3]); + const int iW = iw * ist[3]; + for(dim_t oz = 0; oz < oDims[2]; oz++) { + const int oZW = oW + oz * ost[2]; + const int iz = simple_mod((oz + sdims_[2]), oDims[2]); + const int iZW = iW + iz * ist[2]; + for(dim_t oy = 0; oy < oDims[1]; oy++) { + const int oYZW = oZW + oy * ost[1]; + const int iy = simple_mod((oy + sdims_[1]), oDims[1]); + const int iYZW = iZW + iy * ist[1]; + for(dim_t ox = 0; ox < oDims[0]; ox++) { + const int oIdx = oYZW + ox; + const int ix = simple_mod((ox + sdims_[0]), oDims[0]); + const int iIdx = iYZW + ix; + + outPtr[oIdx] = inPtr[iIdx]; + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/sift_nonfree.hpp b/src/backend/cpu/kernel/sift_nonfree.hpp similarity index 100% rename from src/backend/cpu/sift_nonfree.hpp rename to src/backend/cpu/kernel/sift_nonfree.hpp diff --git a/src/backend/cpu/kernel/sobel.hpp b/src/backend/cpu/kernel/sobel.hpp new file mode 100644 index 0000000000..49d33cdbb4 --- /dev/null +++ b/src/backend/cpu/kernel/sobel.hpp @@ -0,0 +1,86 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void derivative(Array output, const Array input) +{ + const af::dim4 dims = input.dims(); + const af::dim4 strides = input.strides(); + To* optr = output.get(); + const Ti* iptr = input.get(); + + for(dim_t b3=0; b3=0 && _joff>=0) ? + iptr[_joff*strides[1]+_ioff*strides[0]] : 0; + To SW = (ioff_<(int)dims[0] && _joff>=0) ? + iptr[_joff*strides[1]+ioff_*strides[0]] : 0; + To NE = (_ioff>=0 && joff_<(int)dims[1]) ? + iptr[joff_*strides[1]+_ioff*strides[0]] : 0; + To SE = (ioff_<(int)dims[0] && joff_<(int)dims[1]) ? + iptr[joff_*strides[1]+ioff_*strides[0]] : 0; + + if (isDX) { + To W = _joff>=0 ? + iptr[_joff*strides[1]+ioff*strides[0]] : 0; + + To E = joff_<(int)dims[1] ? + iptr[joff_*strides[1]+ioff*strides[0]] : 0; + + accum = NW+SW - (NE+SE) + 2*(W-E); + } else { + To N = _ioff>=0 ? + iptr[joff*strides[1]+_ioff*strides[0]] : 0; + + To S = ioff_<(int)dims[0] ? + iptr[joff*strides[1]+ioff_*strides[0]] : 0; + + accum = NW+NE - (SW+SE) + 2*(N-S); + } + + optr[joffset+i*strides[0]] = accum; + } + } + + optr += strides[2]; + iptr += strides[2]; + } + optr += strides[3]; + iptr += strides[3]; + } +} + +} +} diff --git a/src/backend/cpu/kernel/sort.hpp b/src/backend/cpu/kernel/sort.hpp new file mode 100644 index 0000000000..cba07fabdf --- /dev/null +++ b/src/backend/cpu/kernel/sort.hpp @@ -0,0 +1,51 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +// Based off of http://stackoverflow.com/a/12399290 +template +void sort0(Array val) +{ + // initialize original index locations + T *val_ptr = val.get(); + + function op = std::greater(); + if(isAscending) { op = std::less(); } + + T *comp_ptr = nullptr; + for(dim_t w = 0; w < val.dims()[3]; w++) { + dim_t valW = w * val.strides()[3]; + for(dim_t z = 0; z < val.dims()[2]; z++) { + dim_t valWZ = valW + z * val.strides()[2]; + for(dim_t y = 0; y < val.dims()[1]; y++) { + + dim_t valOffset = valWZ + y * val.strides()[1]; + + comp_ptr = val_ptr + valOffset; + std::sort(comp_ptr, comp_ptr + val.dims()[0], op); + } + } + } + return; +} + +} +} diff --git a/src/backend/cpu/kernel/sort_by_key.hpp b/src/backend/cpu/kernel/sort_by_key.hpp new file mode 100644 index 0000000000..77713a7240 --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key.hpp @@ -0,0 +1,85 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void sort0_by_key(Array okey, Array oval, Array oidx, + const Array ikey, const Array ival) +{ + function op = std::greater(); + if(isAscending) { op = std::less(); } + + // Get pointers and initialize original index locations + uint *oidx_ptr = oidx.get(); + Tk *okey_ptr = okey.get(); + Tv *oval_ptr = oval.get(); + const Tk *ikey_ptr = ikey.get(); + const Tv *ival_ptr = ival.get(); + + std::vector seq_vec(oidx.dims()[0]); + std::iota(seq_vec.begin(), seq_vec.end(), 0); + + const Tk *comp_ptr = nullptr; + auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);}; + + for(dim_t w = 0; w < ikey.dims()[3]; w++) { + dim_t okeyW = w * okey.strides()[3]; + dim_t ovalW = w * oval.strides()[3]; + dim_t oidxW = w * oidx.strides()[3]; + dim_t ikeyW = w * ikey.strides()[3]; + dim_t ivalW = w * ival.strides()[3]; + + for(dim_t z = 0; z < ikey.dims()[2]; z++) { + dim_t okeyWZ = okeyW + z * okey.strides()[2]; + dim_t ovalWZ = ovalW + z * oval.strides()[2]; + dim_t oidxWZ = oidxW + z * oidx.strides()[2]; + dim_t ikeyWZ = ikeyW + z * ikey.strides()[2]; + dim_t ivalWZ = ivalW + z * ival.strides()[2]; + + for(dim_t y = 0; y < ikey.dims()[1]; y++) { + + dim_t okeyOffset = okeyWZ + y * okey.strides()[1]; + dim_t ovalOffset = ovalWZ + y * oval.strides()[1]; + dim_t oidxOffset = oidxWZ + y * oidx.strides()[1]; + dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1]; + dim_t ivalOffset = ivalWZ + y * ival.strides()[1]; + + uint *ptr = oidx_ptr + oidxOffset; + std::copy(seq_vec.begin(), seq_vec.end(), ptr); + + comp_ptr = ikey_ptr + ikeyOffset; + std::stable_sort(ptr, ptr + ikey.dims()[0], comparator); + + for (dim_t i = 0; i < oval.dims()[0]; ++i){ + uint 
sortIdx = oidx_ptr[oidxOffset + i]; + okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx]; + oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx]; + } + } + } + } + + return; +} + +} +} diff --git a/src/backend/cpu/kernel/sort_index.hpp b/src/backend/cpu/kernel/sort_index.hpp new file mode 100644 index 0000000000..d2de05a559 --- /dev/null +++ b/src/backend/cpu/kernel/sort_index.hpp @@ -0,0 +1,70 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void sort0_index(Array val, Array idx, const Array in) +{ + // initialize original index locations + uint *idx_ptr = idx.get(); + T *val_ptr = val.get(); + const T *in_ptr = in.get(); + function op = std::greater(); + if(isAscending) { op = std::less(); } + + std::vector seq_vec(idx.dims()[0]); + std::iota(seq_vec.begin(), seq_vec.end(), 0); + + const T *comp_ptr = nullptr; + auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);}; + + for(dim_t w = 0; w < in.dims()[3]; w++) { + dim_t valW = w * val.strides()[3]; + dim_t idxW = w * idx.strides()[3]; + dim_t inW = w * in.strides()[3]; + for(dim_t z = 0; z < in.dims()[2]; z++) { + dim_t valWZ = valW + z * val.strides()[2]; + dim_t idxWZ = idxW + z * idx.strides()[2]; + dim_t inWZ = inW + z * in.strides()[2]; + for(dim_t y = 0; y < in.dims()[1]; y++) { + + dim_t valOffset = valWZ + y * val.strides()[1]; + dim_t idxOffset = idxWZ + y * idx.strides()[1]; + dim_t inOffset = inWZ + y * in.strides()[1]; + + uint *ptr = idx_ptr + idxOffset; + std::copy(seq_vec.begin(), seq_vec.end(), ptr); + + comp_ptr = in_ptr + 
inOffset; + std::stable_sort(ptr, ptr + in.dims()[0], comparator); + + for (dim_t i = 0; i < val.dims()[0]; ++i){ + val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]]; + } + } + } + } + + return; +} + +} +} diff --git a/src/backend/cpu/kernel/susan.hpp b/src/backend/cpu/kernel/susan.hpp new file mode 100644 index 0000000000..f543967799 --- /dev/null +++ b/src/backend/cpu/kernel/susan.hpp @@ -0,0 +1,99 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void susan_responses(Array output, const Array input, + const unsigned idim0, const unsigned idim1, + const int radius, const float t, const float g, + const unsigned border_len) +{ + T* resp_out = output.get(); + const T* in = input.get(); + + const unsigned r = border_len; + const int rSqrd = radius*radius; + + for (unsigned y = r; y < idim1 - r; ++y) { + for (unsigned x = r; x < idim0 - r; ++x) { + const unsigned idx = y * idim0 + x; + T m_0 = in[idx]; + float nM = 0.0f; + + for (int i=-radius; i<=radius; ++i) { + for (int j=-radius; j<=radius; ++j) { + if (i*i + j*j < rSqrd) { + int p = x + i; + int q = y + j; + T m = in[p + idim0 * q]; + float exp_pow = std::pow((m - m_0)/t, 6.0); + float cM = std::exp(-exp_pow); + nM += cM; + } + } + } + + resp_out[idx] = nM < g ? 
g - nM : T(0); + } + } +} + +template +void non_maximal(Array xcoords, Array ycoords, Array response, + shared_ptr counter, const unsigned idim0, const unsigned idim1, + const Array input, const unsigned border_len, const unsigned max_corners) +{ + float* x_out = xcoords.get(); + float* y_out = ycoords.get(); + float* resp_out = response.get(); + unsigned* count = counter.get(); + const T* resp_in= input.get(); + + // Responses on the border don't have 8-neighbors to compare, discard them + const unsigned r = border_len + 1; + + for (unsigned y = r; y < idim1 - r; y++) { + for (unsigned x = r; x < idim0 - r; x++) { + const T v = resp_in[y * idim0 + x]; + + // Find maximum neighborhood response + T max_v; + max_v = max(resp_in[(y-1) * idim0 + x-1], resp_in[y * idim0 + x-1]); + max_v = max(max_v, resp_in[(y+1) * idim0 + x-1]); + max_v = max(max_v, resp_in[(y-1) * idim0 + x ]); + max_v = max(max_v, resp_in[(y+1) * idim0 + x ]); + max_v = max(max_v, resp_in[(y-1) * idim0 + x+1]); + max_v = max(max_v, resp_in[(y) * idim0 + x+1]); + max_v = max(max_v, resp_in[(y+1) * idim0 + x+1]); + + // Stores corner to {x,y,resp}_out if its response is maximum compared + // to its 8-neighborhood and greater or equal minimum response + if (v > max_v) { + const unsigned idx = *count; + *count += 1; + if (idx < max_corners) { + x_out[idx] = (float)x; + y_out[idx] = (float)y; + resp_out[idx] = (float)v; + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/tile.hpp b/src/backend/cpu/kernel/tile.hpp new file mode 100644 index 0000000000..3ad3009041 --- /dev/null +++ b/src/backend/cpu/kernel/tile.hpp @@ -0,0 +1,55 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void tile(Array out, const Array in) +{ + + T* outPtr = out.get(); + const T* inPtr = in.get(); + + const af::dim4 iDims = in.dims(); + const af::dim4 oDims = out.dims(); + const af::dim4 ist = in.strides(); + const af::dim4 ost = out.strides(); + + for(dim_t ow = 0; ow < oDims[3]; ow++) { + const dim_t iw = ow % iDims[3]; + const dim_t iW = iw * ist[3]; + const dim_t oW = ow * ost[3]; + for(dim_t oz = 0; oz < oDims[2]; oz++) { + const dim_t iz = oz % iDims[2]; + const dim_t iZW = iW + iz * ist[2]; + const dim_t oZW = oW + oz * ost[2]; + for(dim_t oy = 0; oy < oDims[1]; oy++) { + const dim_t iy = oy % iDims[1]; + const dim_t iYZW = iZW + iy * ist[1]; + const dim_t oYZW = oZW + oy * ost[1]; + for(dim_t ox = 0; ox < oDims[0]; ox++) { + const dim_t ix = ox % iDims[0]; + const dim_t iMem = iYZW + ix; + const dim_t oMem = oYZW + ox; + outPtr[oMem] = inPtr[iMem]; + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/transform.hpp b/src/backend/cpu/kernel/transform.hpp new file mode 100644 index 0000000000..d97613a78c --- /dev/null +++ b/src/backend/cpu/kernel/transform.hpp @@ -0,0 +1,105 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void calc_affine_inverse(T *txo, const T *txi) +{ + T det = txi[0]*txi[4] - txi[1]*txi[3]; + + txo[0] = txi[4] / det; + txo[1] = txi[3] / det; + txo[3] = txi[1] / det; + txo[4] = txi[0] / det; + + txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; + txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; +} + +template +void calc_affine_inverse(T *tmat, const T *tmat_ptr, const bool inverse) +{ + // The way the kernel is structured, it expects an inverse + // transform matrix by default. + // If it is a forward transform, then we need its inverse + if(inverse) { + for(int i = 0; i < 6; i++) + tmat[i] = tmat_ptr[i]; + } else { + calc_affine_inverse(tmat, tmat_ptr); + } +} + +template +void transform(Array output, const Array input, + const Array transform, const bool inverse) +{ + const af::dim4 idims = input.dims(); + const af::dim4 odims = output.dims(); + const af::dim4 istrides = input.strides(); + const af::dim4 ostrides = output.strides(); + + T * out = output.get(); + const T * in = input.get(); + const float* tf = transform.get(); + + dim_t nimages = idims[2]; + // Multiplied in src/backend/transform.cpp + dim_t ntransforms = odims[2] / idims[2]; + + void (*t_fn)(T *, const T *, const float *, const af::dim4 &, + const af::dim4 &, const af::dim4 &, + const dim_t, const dim_t, const dim_t, const dim_t); + + switch(method) { + case AF_INTERP_NEAREST: + t_fn = &transform_n; + break; + case AF_INTERP_BILINEAR: + t_fn = &transform_b; + break; + case AF_INTERP_LOWER: + t_fn = &transform_l; + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; + } + + + // For each transform channel + for(int t_idx = 0; t_idx < (int)ntransforms; t_idx++) { + // Compute inverse if required + const float
*tmat_ptr = tf + t_idx * 6; + float tmat[6]; + calc_affine_inverse(tmat, tmat_ptr, inverse); + + // Offset for output pointer + dim_t o_offset = t_idx * nimages * ostrides[2]; + + // Do transform for image + for(int yy = 0; yy < (int)odims[1]; yy++) { + for(int xx = 0; xx < (int)odims[0]; xx++) { + t_fn(out, in, tmat, idims, ostrides, istrides, nimages, o_offset, xx, yy); + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/transpose.hpp b/src/backend/cpu/kernel/transpose.hpp new file mode 100644 index 0000000000..576de873ed --- /dev/null +++ b/src/backend/cpu/kernel/transpose.hpp @@ -0,0 +1,122 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +T getConjugate(const T &in) +{ + // For non-complex types return same + return in; +} + +template<> +cfloat getConjugate(const cfloat &in) +{ + return std::conj(in); +} + +template<> +cdouble getConjugate(const cdouble &in) +{ + return std::conj(in); +} + +template +void transpose(Array output, const Array input) +{ + const dim4 odims = output.dims(); + const dim4 ostrides = output.strides(); + const dim4 istrides = input.strides(); + + T * out = output.get(); + T const * const in = input.get(); + + for (dim_t l = 0; l < odims[3]; ++l) { + for (dim_t k = 0; k < odims[2]; ++k) { + // Outermost loop handles batch mode + // if input has no data along third dimension + // this loop runs only once + for (dim_t j = 0; j < odims[1]; ++j) { + for (dim_t i = 0; i < odims[0]; ++i) { + // calculate array indices based on offsets and strides + // the helper getIdx takes care of indices + const dim_t inIdx = getIdx(istrides,j,i,k,l); + 
const dim_t outIdx = getIdx(ostrides,i,j,k,l); + if(conjugate) + out[outIdx] = getConjugate(in[inIdx]); + else + out[outIdx] = in[inIdx]; + } + } + // outData and inData pointers don't need to be + // offset as the getIdx function is taking care + // of the batch parameter + } + } +} + +template +void transpose(Array out, const Array in, const bool conjugate) +{ + return (conjugate ? transpose(out, in) : transpose(out, in)); +} + +template +void transpose_inplace(Array input) +{ + const dim4 idims = input.dims(); + const dim4 istrides = input.strides(); + + T * in = input.get(); + + for (dim_t l = 0; l < idims[3]; ++l) { + for (dim_t k = 0; k < idims[2]; ++k) { + // Outermost loop handles batch mode + // if input has no data along third dimension + // this loop runs only once + // + // Run only bottom triangle. std::swap swaps with upper triangle + for (dim_t j = 0; j < idims[1]; ++j) { + for (dim_t i = j + 1; i < idims[0]; ++i) { + // calculate array indices based on offsets and strides + // the helper getIdx takes care of indices + const dim_t iIdx = getIdx(istrides,j,i,k,l); + const dim_t oIdx = getIdx(istrides,i,j,k,l); + if(conjugate) { + in[iIdx] = getConjugate(in[iIdx]); + in[oIdx] = getConjugate(in[oIdx]); + std::swap(in[iIdx], in[oIdx]); + } + else { + std::swap(in[iIdx], in[oIdx]); + } + } + } + } + } +} + +template +void transpose_inplace(Array in, const bool conjugate) +{ + return (conjugate ? transpose_inplace(in) : transpose_inplace(in)); +} + +} +} diff --git a/src/backend/cpu/kernel/triangle.hpp b/src/backend/cpu/kernel/triangle.hpp new file mode 100644 index 0000000000..7059de5981 --- /dev/null +++ b/src/backend/cpu/kernel/triangle.hpp @@ -0,0 +1,61 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void triangle(Array out, const Array in) +{ + T *o = out.get(); + const T *i = in.get(); + + af::dim4 odm = out.dims(); + + af::dim4 ost = out.strides(); + af::dim4 ist = in.strides(); + + for(dim_t ow = 0; ow < odm[3]; ow++) { + const dim_t oW = ow * ost[3]; + const dim_t iW = ow * ist[3]; + + for(dim_t oz = 0; oz < odm[2]; oz++) { + const dim_t oZW = oW + oz * ost[2]; + const dim_t iZW = iW + oz * ist[2]; + + for(dim_t oy = 0; oy < odm[1]; oy++) { + const dim_t oYZW = oZW + oy * ost[1]; + const dim_t iYZW = iZW + oy * ist[1]; + + for(dim_t ox = 0; ox < odm[0]; ox++) { + const dim_t oMem = oYZW + ox; + const dim_t iMem = iYZW + ox; + + bool cond = is_upper ? (oy >= ox) : (oy <= ox); + bool do_unit_diag = (is_unit_diag && ox == oy); + if(cond) { + o[oMem] = do_unit_diag ? scalar(1) : i[iMem]; + } else { + o[oMem] = scalar(0); + } + + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/unwrap.hpp b/src/backend/cpu/kernel/unwrap.hpp new file mode 100644 index 0000000000..1d996ff1f3 --- /dev/null +++ b/src/backend/cpu/kernel/unwrap.hpp @@ -0,0 +1,81 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void unwrap_dim(Array out, const Array in, const dim_t wx, const dim_t wy, + const dim_t sx, const dim_t sy, const dim_t px, const dim_t py) +{ + const T *inPtr = in.get(); + T *outPtr = out.get(); + + af::dim4 idims = in.dims(); + af::dim4 odims = out.dims(); + af::dim4 istrides = in.strides(); + af::dim4 ostrides = out.strides(); + + dim_t nx = (idims[0] + 2 * px - wx) / sx + 1; + + for(dim_t w = 0; w < odims[3]; w++) { + for(dim_t z = 0; z < odims[2]; z++) { + + dim_t cOut = w * ostrides[3] + z * ostrides[2]; + dim_t cIn = w * istrides[3] + z * istrides[2]; + const T* iptr = inPtr + cIn; + T* optr_= outPtr + cOut; + + for(dim_t col = 0; col < odims[d]; col++) { + // Offset output ptr + T* optr = optr_ + col * ostrides[d]; + + // Calculate input window index + dim_t winy = (col / nx); + dim_t winx = (col % nx); + + dim_t startx = winx * sx; + dim_t starty = winy * sy; + + dim_t spx = startx - px; + dim_t spy = starty - py; + + // Short cut condition ensuring all values within input dimensions + bool cond = (spx >= 0 && spx + wx < idims[0] && spy >= 0 && spy + wy < idims[1]); + + for(dim_t y = 0; y < wy; y++) { + for(dim_t x = 0; x < wx; x++) { + dim_t xpad = spx + x; + dim_t ypad = spy + y; + + dim_t oloc = (y * wx + x); + if (d == 0) oloc *= ostrides[1]; + + if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) { + dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]); + optr[oloc] = iptr[iloc]; + } else { + optr[oloc] = scalar(0.0); + } + } + } + } + } + } +} + +} +} diff --git a/src/backend/cpu/kernel/wrap.hpp b/src/backend/cpu/kernel/wrap.hpp new file mode 100644 index 0000000000..70be3ad652 --- /dev/null +++ b/src/backend/cpu/kernel/wrap.hpp @@ -0,0 +1,80 @@ 
+/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void wrap_dim(Array out, const Array in, const dim_t wx, const dim_t wy, + const dim_t sx, const dim_t sy, const dim_t px, const dim_t py) +{ + const T *inPtr = in.get(); + T *outPtr = out.get(); + + af::dim4 idims = in.dims(); + af::dim4 odims = out.dims(); + af::dim4 istrides = in.strides(); + af::dim4 ostrides = out.strides(); + + dim_t nx = (odims[0] + 2 * px - wx) / sx + 1; + + for(dim_t w = 0; w < idims[3]; w++) { + for(dim_t z = 0; z < idims[2]; z++) { + + dim_t cIn = w * istrides[3] + z * istrides[2]; + dim_t cOut = w * ostrides[3] + z * ostrides[2]; + const T* iptr_ = inPtr + cIn; + T* optr= outPtr + cOut; + + for(dim_t col = 0; col < idims[d]; col++) { + // Offset input ptr + const T* iptr = iptr_ + col * istrides[d]; + + // Calculate output window index + dim_t winy = (col / nx); + dim_t winx = (col % nx); + + dim_t startx = winx * sx; + dim_t starty = winy * sy; + + dim_t spx = startx - px; + dim_t spy = starty - py; + + // Short cut condition ensuring all values within output dimensions + bool cond = (spx >= 0 && spx + wx < odims[0] && spy >= 0 && spy + wy < odims[1]); + + for(dim_t y = 0; y < wy; y++) { + for(dim_t x = 0; x < wx; x++) { + dim_t xpad = spx + x; + dim_t ypad = spy + y; + + dim_t iloc = (y * wx + x); + if (d == 0) iloc *= istrides[1]; + + if(cond || (xpad >= 0 && xpad < odims[0] && ypad >= 0 && ypad < odims[1])) { + dim_t oloc = (ypad * ostrides[1] + xpad * ostrides[0]); + // FIXME: When using threads, atomize this + optr[oloc] += iptr[iloc]; + } + } + } + } + } + } +} + +} +} diff --git
a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp index b6f50c2e32..82925622ae 100644 --- a/src/backend/cpu/nearest_neighbour.cpp +++ b/src/backend/cpu/nearest_neighbour.cpp @@ -11,139 +11,16 @@ #include #include #include -#include #include #include #include +#include using af::dim4; namespace cpu { -#if defined(_WIN32) || defined(_MSC_VER) - -#include -#define __builtin_popcount __popcnt - -#endif - -template -struct dist_op -{ - To operator()(T v1, T v2) - { - return v1 - v2; // Garbage distance - } -}; - -template -struct dist_op -{ - To operator()(T v1, T v2) - { - return std::abs((double)v1 - (double)v2); - } -}; - -template -struct dist_op -{ - To operator()(T v1, T v2) - { - return (v1 - v2) * (v1 - v2); - } -}; - -template -struct dist_op -{ - To operator()(uint v1, uint v2) - { - return __builtin_popcount(v1 ^ v2); - } -}; - -template -struct dist_op -{ - To operator()(uintl v1, uintl v2) - { - return __builtin_popcount(v1 ^ v2); - } -}; - -template -struct dist_op -{ - To operator()(uchar v1, uchar v2) - { - return __builtin_popcount(v1 ^ v2); - } -}; - -template -struct dist_op -{ - To operator()(ushort v1, ushort v2) - { - return __builtin_popcount(v1 ^ v2); - } -}; - -template -void nearest_neighbour_(Array idx, Array dist, - const Array query, const Array train, - const uint dist_dim, const uint n_dist) -{ - uint sample_dim = (dist_dim == 0) ? 
1 : 0; - const dim4 qDims = query.dims(); - const dim4 tDims = train.dims(); - - const unsigned distLength = qDims[dist_dim]; - const unsigned nQuery = qDims[sample_dim]; - const unsigned nTrain = tDims[sample_dim]; - - const T* qPtr = query.get(); - const T* tPtr = train.get(); - uint* iPtr = idx.get(); - To* dPtr = dist.get(); - - dist_op op; - - for (unsigned i = 0; i < nQuery; i++) { - To best_dist = limit_max(); - unsigned best_idx = 0; - - for (unsigned j = 0; j < nTrain; j++) { - To local_dist = 0; - for (unsigned k = 0; k < distLength; k++) { - size_t qIdx, tIdx; - if (sample_dim == 0) { - qIdx = k * qDims[0] + i; - tIdx = k * tDims[0] + j; - } - else { - qIdx = i * qDims[0] + k; - tIdx = j * tDims[0] + k; - } - - local_dist += op(qPtr[qIdx], tPtr[tIdx]); - } - - if (local_dist < best_dist) { - best_dist = local_dist; - best_idx = j; - } - } - - size_t oIdx; - oIdx = i; - iPtr[oIdx] = best_idx; - dPtr[oIdx] = best_dist; - } -} - template void nearest_neighbour(Array& idx, Array& dist, const Array& query, const Array& train, @@ -166,13 +43,13 @@ void nearest_neighbour(Array& idx, Array& dist, switch(dist_type) { case AF_SAD: - getQueue().enqueue(nearest_neighbour_, idx, dist, query, train, dist_dim, n_dist); + getQueue().enqueue(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); break; case AF_SSD: - getQueue().enqueue(nearest_neighbour_, idx, dist, query, train, dist_dim, n_dist); + getQueue().enqueue(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); break; case AF_SHD: - getQueue().enqueue(nearest_neighbour_, idx, dist, query, train, dist_dim, n_dist); + getQueue().enqueue(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); break; default: AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED); diff --git a/src/backend/cpu/orb.cpp b/src/backend/cpu/orb.cpp index 4b6629cb3f..00fe8203d4 100644 --- a/src/backend/cpu/orb.cpp +++ b/src/backend/cpu/orb.cpp @@ -11,7 +11,6 @@ #include #include #include 
-#include #include #include #include @@ -21,520 +20,13 @@ #include #include #include +#include using af::dim4; namespace cpu { -static const float PI_VAL = 3.14159265358979323846f; - -// Reference pattern, generated for a patch size of 31x31, as suggested by -// original ORB paper -#define REF_PAT_SIZE 31 -#define REF_PAT_SAMPLES 256 -#define REF_PAT_COORDS 4 -#define REF_PAT_LENGTH (REF_PAT_SAMPLES*REF_PAT_COORDS) - -// Current reference pattern was borrowed from OpenCV, to build a pattern with -// similar quality, a training process must be applied, as described in -// sections 4.2 and 4.3 of the original ORB paper. -const int ref_pat[REF_PAT_LENGTH] = { - 8,-3, 9,5, - 4,2, 7,-12, - -11,9, -8,2, - 7,-12, 12,-13, - 2,-13, 2,12, - 1,-7, 1,6, - -2,-10, -2,-4, - -13,-13, -11,-8, - -13,-3, -12,-9, - 10,4, 11,9, - -13,-8, -8,-9, - -11,7, -9,12, - 7,7, 12,6, - -4,-5, -3,0, - -13,2, -12,-3, - -9,0, -7,5, - 12,-6, 12,-1, - -3,6, -2,12, - -6,-13, -4,-8, - 11,-13, 12,-8, - 4,7, 5,1, - 5,-3, 10,-3, - 3,-7, 6,12, - -8,-7, -6,-2, - -2,11, -1,-10, - -13,12, -8,10, - -7,3, -5,-3, - -4,2, -3,7, - -10,-12, -6,11, - 5,-12, 6,-7, - 5,-6, 7,-1, - 1,0, 4,-5, - 9,11, 11,-13, - 4,7, 4,12, - 2,-1, 4,4, - -4,-12, -2,7, - -8,-5, -7,-10, - 4,11, 9,12, - 0,-8, 1,-13, - -13,-2, -8,2, - -3,-2, -2,3, - -6,9, -4,-9, - 8,12, 10,7, - 0,9, 1,3, - 7,-5, 11,-10, - -13,-6, -11,0, - 10,7, 12,1, - -6,-3, -6,12, - 10,-9, 12,-4, - -13,8, -8,-12, - -13,0, -8,-4, - 3,3, 7,8, - 5,7, 10,-7, - -1,7, 1,-12, - 3,-10, 5,6, - 2,-4, 3,-10, - -13,0, -13,5, - -13,-7, -12,12, - -13,3, -11,8, - -7,12, -4,7, - 6,-10, 12,8, - -9,-1, -7,-6, - -2,-5, 0,12, - -12,5, -7,5, - 3,-10, 8,-13, - -7,-7, -4,5, - -3,-2, -1,-7, - 2,9, 5,-11, - -11,-13, -5,-13, - -1,6, 0,-1, - 5,-3, 5,2, - -4,-13, -4,12, - -9,-6, -9,6, - -12,-10, -8,-4, - 10,2, 12,-3, - 7,12, 12,12, - -7,-13, -6,5, - -4,9, -3,4, - 7,-1, 12,2, - -7,6, -5,1, - -13,11, -12,5, - -3,7, -2,-6, - 7,-8, 12,-7, - -13,-7, -11,-12, - 1,-3, 12,12, - 2,-6, 3,0, - -4,3, -2,-13, - 
-1,-13, 1,9, - 7,1, 8,-6, - 1,-1, 3,12, - 9,1, 12,6, - -1,-9, -1,3, - -13,-13, -10,5, - 7,7, 10,12, - 12,-5, 12,9, - 6,3, 7,11, - 5,-13, 6,10, - 2,-12, 2,3, - 3,8, 4,-6, - 2,6, 12,-13, - 9,-12, 10,3, - -8,4, -7,9, - -11,12, -4,-6, - 1,12, 2,-8, - 6,-9, 7,-4, - 2,3, 3,-2, - 6,3, 11,0, - 3,-3, 8,-8, - 7,8, 9,3, - -11,-5, -6,-4, - -10,11, -5,10, - -5,-8, -3,12, - -10,5, -9,0, - 8,-1, 12,-6, - 4,-6, 6,-11, - -10,12, -8,7, - 4,-2, 6,7, - -2,0, -2,12, - -5,-8, -5,2, - 7,-6, 10,12, - -9,-13, -8,-8, - -5,-13, -5,-2, - 8,-8, 9,-13, - -9,-11, -9,0, - 1,-8, 1,-2, - 7,-4, 9,1, - -2,1, -1,-4, - 11,-6, 12,-11, - -12,-9, -6,4, - 3,7, 7,12, - 5,5, 10,8, - 0,-4, 2,8, - -9,12, -5,-13, - 0,7, 2,12, - -1,2, 1,7, - 5,11, 7,-9, - 3,5, 6,-8, - -13,-4, -8,9, - -5,9, -3,-3, - -4,-7, -3,-12, - 6,5, 8,0, - -7,6, -6,12, - -13,6, -5,-2, - 1,-10, 3,10, - 4,1, 8,-4, - -2,-2, 2,-13, - 2,-12, 12,12, - -2,-13, 0,-6, - 4,1, 9,3, - -6,-10, -3,-5, - -3,-13, -1,1, - 7,5, 12,-11, - 4,-2, 5,-7, - -13,9, -9,-5, - 7,1, 8,6, - 7,-8, 7,6, - -7,-4, -7,1, - -8,11, -7,-8, - -13,6, -12,-8, - 2,4, 3,9, - 10,-5, 12,3, - -6,-5, -6,7, - 8,-3, 9,-8, - 2,-12, 2,8, - -11,-2, -10,3, - -12,-13, -7,-9, - -11,0, -10,-5, - 5,-3, 11,8, - -2,-13, -1,12, - -1,-8, 0,9, - -13,-11, -12,-5, - -10,-2, -10,11, - -3,9, -2,-13, - 2,-3, 3,2, - -9,-13, -4,0, - -4,6, -3,-10, - -4,12, -2,-7, - -6,-11, -4,9, - 6,-3, 6,11, - -13,11, -5,5, - 11,11, 12,6, - 7,-5, 12,-2, - -1,12, 0,7, - -4,-8, -3,-2, - -7,1, -6,7, - -13,-12, -8,-13, - -7,-2, -6,-8, - -8,5, -6,-9, - -5,-1, -4,5, - -13,7, -8,10, - 1,5, 5,-13, - 1,0, 10,-13, - 9,12, 10,-1, - 5,-8, 10,-9, - -1,11, 1,-13, - -9,-3, -6,2, - -1,-10, 1,12, - -13,1, -8,-10, - 8,-11, 10,-6, - 2,-13, 3,-6, - 7,-13, 12,-9, - -10,-10, -5,-7, - -10,-8, -8,-13, - 4,-6, 8,5, - 3,12, 8,-13, - -4,2, -3,-3, - 5,-13, 10,-12, - 4,-13, 5,-1, - -9,9, -4,3, - 0,3, 3,-9, - -12,1, -6,1, - 3,2, 4,-8, - -10,-10, -10,9, - 8,-13, 12,12, - -8,-12, -6,-5, - 2,2, 3,7, - 10,6, 11,-8, - 6,8, 8,-12, - -7,10, -6,5, - -3,-9, -3,9, 
- -1,-13, -1,5, - -3,-7, -3,4, - -8,-2, -8,3, - 4,2, 12,12, - 2,-5, 3,11, - 6,-9, 11,-13, - 3,-1, 7,12, - 11,-1, 12,4, - -3,0, -3,6, - 4,-11, 4,12, - 2,-4, 2,1, - -10,-6, -8,1, - -13,7, -11,1, - -13,12, -11,-13, - 6,0, 11,-13, - 0,-1, 1,4, - -13,3, -9,-2, - -9,8, -6,-3, - -13,-6, -8,-2, - 5,-9, 8,10, - 2,7, 3,-9, - -1,-6, -1,-1, - 9,5, 11,-2, - 11,-3, 12,-8, - 3,0, 3,5, - -1,4, 0,10, - 3,-6, 4,5, - -13,0, -10,5, - 5,8, 12,11, - 8,9, 9,-6, - 7,-4, 8,-12, - -10,4, -10,9, - 7,3, 12,4, - 9,-7, 10,-2, - 7,0, 12,-2, - -1,-6, 0,-11, -}; - -template -void gaussian1D(T* out, const int dim, double sigma=0.0) -{ - if(!(sigma>0)) sigma = 0.25*dim; - - T sum = (T)0; - for(int i=0;i -void keep_features( - float* x_out, - float* y_out, - float* score_out, - float* size_out, - const float* x_in, - const float* y_in, - const float* score_in, - const unsigned* score_idx, - const float* size_in, - const unsigned n_feat) -{ - // Keep only the first n_feat features - for (unsigned f = 0; f < n_feat; f++) { - x_out[f] = x_in[score_idx[f]]; - y_out[f] = y_in[score_idx[f]]; - score_out[f] = score_in[f]; - if (size_in != nullptr && size_out != nullptr) - size_out[f] = size_in[score_idx[f]]; - } -} - -template -void harris_response( - float* x_out, - float* y_out, - float* score_out, - float* size_out, - const float* x_in, - const float* y_in, - const float* scl_in, - const unsigned total_feat, - unsigned* usable_feat, - const Array& image, - const unsigned block_size, - const float k_thr, - const unsigned patch_size) -{ - const af::dim4 idims = image.dims(); - const T* image_ptr = image.get(); - for (unsigned f = 0; f < total_feat; f++) { - unsigned x, y; - float scl = 1.f; - if (use_scl) { - // Update x and y coordinates according to scale - scl = scl_in[f]; - x = (unsigned)round(x_in[f] * scl); - y = (unsigned)round(y_in[f] * scl); - } - else { - x = (unsigned)round(x_in[f]); - y = (unsigned)round(y_in[f]); - } - - // Round feature size to nearest odd integer - float size = 2.f * 
floor((patch_size * scl) / 2.f) + 1.f; - - // Avoid keeping features that might be too wide and might not fit on - // the image, sqrt(2.f) is the radius when angle is 45 degrees and - // represents widest case possible - unsigned patch_r = ceil(size * sqrt(2.f) / 2.f); - if (x < patch_r || y < patch_r || x >= idims[1] - patch_r || y >= idims[0] - patch_r) - continue; - - unsigned r = block_size / 2; - - float ixx = 0.f, iyy = 0.f, ixy = 0.f; - unsigned block_size_sq = block_size * block_size; - for (unsigned k = 0; k < block_size_sq; k++) { - int i = k / block_size - r; - int j = k % block_size - r; - - // Calculate local x and y derivatives - float ix = image_ptr[(x+i+1) * idims[0] + y+j] - image_ptr[(x+i-1) * idims[0] + y+j]; - float iy = image_ptr[(x+i) * idims[0] + y+j+1] - image_ptr[(x+i) * idims[0] + y+j-1]; - - // Accumulate second order derivatives - ixx += ix*ix; - iyy += iy*iy; - ixy += ix*iy; - } - - unsigned idx = *usable_feat; - *usable_feat += 1; - float tr = ixx + iyy; - float det = ixx*iyy - ixy*ixy; - - // Calculate Harris responses - float resp = det - k_thr * (tr*tr); - - // Scale factor - // TODO: improve response scaling - float rscale = 0.001f; - rscale = rscale * rscale * rscale * rscale; - - x_out[idx] = x; - y_out[idx] = y; - score_out[idx] = resp * rscale; - if (use_scl) - size_out[idx] = size; - } -} - -template -void centroid_angle( - const float* x_in, - const float* y_in, - float* orientation_out, - const unsigned total_feat, - const Array& image, - const unsigned patch_size) -{ - const af::dim4 idims = image.dims(); - const T* image_ptr = image.get(); - for (unsigned f = 0; f < total_feat; f++) { - unsigned x = (unsigned)round(x_in[f]); - unsigned y = (unsigned)round(y_in[f]); - - unsigned r = patch_size / 2; - if (x < r || y < r || x > idims[1] - r || y > idims[0] - r) - continue; - - T m01 = (T)0, m10 = (T)0; - unsigned patch_size_sq = patch_size * patch_size; - for (unsigned k = 0; k < patch_size_sq; k++) { - int i = k / patch_size 
- r; - int j = k % patch_size - r; - - // Calculate first order moments - T p = image_ptr[(x+i) * idims[0] + y+j]; - m01 += j * p; - m10 += i * p; - } - - float angle = atan2(m01, m10); - orientation_out[f] = angle; - } -} - -template -inline T get_pixel( - unsigned x, - unsigned y, - const float ori, - const unsigned size, - const int dist_x, - const int dist_y, - const Array& image, - const unsigned patch_size) -{ - const af::dim4 idims = image.dims(); - const T* image_ptr = image.get(); - float ori_sin = sin(ori); - float ori_cos = cos(ori); - float patch_scl = (float)size / (float)patch_size; - - // Calculate point coordinates based on orientation and size - x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin); - y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos); - - return image_ptr[x * idims[0] + y]; -} - -template -void extract_orb( - unsigned* desc_out, - const unsigned n_feat, - float* x_in_out, - float* y_in_out, - const float* ori_in, - float* size_out, - const Array& image, - const float scl, - const unsigned patch_size) -{ - const af::dim4 idims = image.dims(); - for (unsigned f = 0; f < n_feat; f++) { - unsigned x = (unsigned)round(x_in_out[f]); - unsigned y = (unsigned)round(y_in_out[f]); - float ori = ori_in[f]; - unsigned size = patch_size; - - unsigned r = ceil(patch_size * sqrt(2.f) / 2.f); - if (x < r || y < r || x >= idims[1] - r || y >= idims[0] - r) - continue; - - // Descriptor fixed at 256 bits for now - // Storing descriptor as a vector of 8 x 32-bit unsigned numbers - for (unsigned i = 0; i < 8; i++) { - unsigned v = 0; - - // j < 32 for 256 bits descriptor - for (unsigned j = 0; j < 32; j++) { - // Get position from distribution pattern and values of points p1 and p2 - int dist_x = ref_pat[i*32*4 + j*4]; - int dist_y = ref_pat[i*32*4 + j*4+1]; - T p1 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size); - - dist_x = ref_pat[i*32*4 + j*4+2]; - dist_y = ref_pat[i*32*4 + j*4+3]; - T p2 = 
get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size); - - // Calculate bit based on p1 and p2 and shifts it to correct position - v |= (p1 < p2) << j; - } - - // Store 32 bits of descriptor - desc_out[f * 8 + i] += v; - } - - x_in_out[f] = round(x * scl); - y_in_out[f] = round(y * scl); - size_out[f] = patch_size * scl; - } -} - - - template unsigned orb(Array &x, Array &y, Array &score, Array &ori, @@ -652,7 +144,7 @@ unsigned orb(Array &x, Array &y, // Calculate Harris responses // Good block_size >= 7 (must be an odd number) unsigned usable_feat = 0; - harris_response(h_x_harris, h_y_harris, h_score_harris, nullptr, + kernel::harris_response(h_x_harris, h_y_harris, h_score_harris, nullptr, h_x_feat, h_y_feat, nullptr, lvl_feat, &usable_feat, lvl_img, @@ -689,7 +181,7 @@ unsigned orb(Array &x, Array &y, float* h_score_lvl = memAlloc(usable_feat); // Keep only features with higher Harris responses - keep_features(h_x_lvl, h_y_lvl, h_score_lvl, nullptr, + kernel::keep_features(h_x_lvl, h_y_lvl, h_score_lvl, nullptr, h_x_harris, h_y_harris, harris_sorted.get(), harris_idx.get(), nullptr, usable_feat); @@ -700,7 +192,7 @@ unsigned orb(Array &x, Array &y, float* h_size_lvl = memAlloc(usable_feat); // Compute orientation of features - centroid_angle(h_x_lvl, h_y_lvl, h_ori_lvl, usable_feat, + kernel::centroid_angle(h_x_lvl, h_y_lvl, h_ori_lvl, usable_feat, lvl_img, patch_size); Array lvl_filt = createEmptyArray(dim4()); @@ -723,11 +215,11 @@ unsigned orb(Array &x, Array &y, unsigned* h_desc_lvl = memAlloc(usable_feat * 8); memset(h_desc_lvl, 0, usable_feat * 8 * sizeof(unsigned)); if (blur_img) - extract_orb(h_desc_lvl, usable_feat, + kernel::extract_orb(h_desc_lvl, usable_feat, h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl, lvl_filt, lvl_scl, patch_size); else - extract_orb(h_desc_lvl, usable_feat, + kernel::extract_orb(h_desc_lvl, usable_feat, h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl, lvl_img, lvl_scl, patch_size); diff --git a/src/backend/cpu/random.cpp 
b/src/backend/cpu/random.cpp index 8c83ad68ae..55cf2956a8 100644 --- a/src/backend/cpu/random.cpp +++ b/src/backend/cpu/random.cpp @@ -7,12 +7,6 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include -#include -#include -#include -#include -#include #include #include #include @@ -20,140 +14,16 @@ #include #include #include +#include namespace cpu { -using namespace std; - -template -using is_arithmetic_t = typename enable_if< is_arithmetic::value, function>::type; -template -using is_complex_t = typename enable_if< is_complex::value, function>::type; -template -using is_floating_point_t = typename enable_if< is_floating_point::value, function>::type; - -template -is_arithmetic_t -urand(GenType &generator) -{ - typedef typename conditional< is_floating_point::value, - uniform_real_distribution, -#if OS_WIN - uniform_int_distribution>::type dist; -#else - uniform_int_distribution> ::type dist; -#endif - return bind(dist(), generator); -} - -template -is_complex_t -urand(GenType &generator) -{ - auto func = urand(generator); - return [func] () { return T(func(), func());}; -} - -template -is_floating_point_t -nrand(GenType &generator) -{ - return bind(normal_distribution(), generator); -} - -template -is_complex_t -nrand(GenType &generator) -{ - auto func = nrand(generator); - return [func] () { return T(func(), func());}; -} - -static default_random_engine generator; -static unsigned long long gen_seed = 0; -static bool is_first = true; -#define GLOBAL 1 - -template -void randn_(Array out) -{ - static unsigned long long my_seed = 0; - if (is_first) { - setSeed(gen_seed); - my_seed = gen_seed; - } - - static auto gen = nrand(generator); - - if (my_seed != gen_seed) { - gen = nrand(generator); - my_seed = gen_seed; - } - - T *outPtr = out.get(); - for (int i = 0; i < (int)out.elements(); i++) { - outPtr[i] = gen(); - } -} - -template -Array randn(const af::dim4 &dims) -{ - Array outArray = 
createEmptyArray(dims); - getQueue().enqueue(randn_, outArray); - return outArray; -} - -template -void randu_(Array out) -{ - static unsigned long long my_seed = 0; - if (is_first) { - setSeed(gen_seed); - my_seed = gen_seed; - } - - static auto gen = urand(generator); - - if (my_seed != gen_seed) { - gen = urand(generator); - my_seed = gen_seed; - } - - T *outPtr = out.get(); - for (int i = 0; i < (int)out.elements(); i++) { - outPtr[i] = gen(); - } -} - -template<> -void randu_(Array out) -{ - static unsigned long long my_seed = 0; - if (is_first) { - setSeed(gen_seed); - my_seed = gen_seed; - } - - static auto gen = urand(generator); - - if (my_seed != gen_seed) { - gen = urand(generator); - my_seed = gen_seed; - } - - char *outPtr = out.get(); - for (int i = 0; i < (int)out.elements(); i++) { - outPtr[i] = gen() > 0.5; - } -} - template Array randu(const af::dim4 &dims) { Array outArray = createEmptyArray(dims); - getQueue().enqueue(randu_, outArray); + getQueue().enqueue(kernel::randu, outArray); return outArray; } @@ -172,6 +42,14 @@ INSTANTIATE_UNIFORM(uchar) INSTANTIATE_UNIFORM(short) INSTANTIATE_UNIFORM(ushort) +template +Array randn(const af::dim4 &dims) +{ + Array outArray = createEmptyArray(dims); + getQueue().enqueue(kernel::randn, outArray); + return outArray; +} + #define INSTANTIATE_NORMAL(T) \ template Array randn(const af::dim4 &dims); @@ -184,32 +62,36 @@ template<> Array randu(const af::dim4 &dims) { static unsigned long long my_seed = 0; - if (is_first) { - setSeed(gen_seed); - my_seed = gen_seed; + if (kernel::is_first) { + setSeed(kernel::gen_seed); + my_seed = kernel::gen_seed; } - static auto gen = urand(generator); + static auto gen = kernel::urand(kernel::generator); - if (my_seed != gen_seed) { - gen = urand(generator); - my_seed = gen_seed; + if (my_seed != kernel::gen_seed) { + gen = kernel::urand(kernel::generator); + my_seed = kernel::gen_seed; } Array outArray = createEmptyArray(dims); - char *outPtr = outArray.get(); - for (int i 
= 0; i < (int)outArray.elements(); i++) { - outPtr[i] = gen() > 0.5; - } + auto func = [=](Array outArray) { + char *outPtr = outArray.get(); + for (int i = 0; i < (int)outArray.elements(); i++) { + outPtr[i] = gen() > 0.5; + } + }; + getQueue().enqueue(func, outArray); + return outArray; } void setSeed(const uintl seed) { auto f = [=](const uintl seed){ - generator.seed(seed); - is_first = false; - gen_seed = seed; + kernel::generator.seed(seed); + kernel::is_first = false; + kernel::gen_seed = seed; }; getQueue().enqueue(f, seed); } @@ -217,7 +99,7 @@ void setSeed(const uintl seed) uintl getSeed() { getQueue().sync(); - return gen_seed; + return kernel::gen_seed; } } diff --git a/src/backend/cpu/range.cpp b/src/backend/cpu/range.cpp index 7837db51ff..b5ba5f89c4 100644 --- a/src/backend/cpu/range.cpp +++ b/src/backend/cpu/range.cpp @@ -16,47 +16,11 @@ #include #include #include +#include namespace cpu { -/////////////////////////////////////////////////////////////////////////// -// Kernel Functions -/////////////////////////////////////////////////////////////////////////// -template -void range(Array output) -{ - T* out = output.get(); - - const dim4 dims = output.dims(); - const dim4 strides = output.strides(); - - for(dim_t w = 0; w < dims[3]; w++) { - dim_t offW = w * strides[3]; - for(dim_t z = 0; z < dims[2]; z++) { - dim_t offWZ = offW + z * strides[2]; - for(dim_t y = 0; y < dims[1]; y++) { - dim_t offWZY = offWZ + y * strides[1]; - for(dim_t x = 0; x < dims[0]; x++) { - dim_t id = offWZY + x; - if(dim == 0) { - out[id] = x; - } else if(dim == 1) { - out[id] = y; - } else if(dim == 2) { - out[id] = z; - } else if(dim == 3) { - out[id] = w; - } - } - } - } - } -} - -/////////////////////////////////////////////////////////////////////////// -// Wrapper Functions -/////////////////////////////////////////////////////////////////////////// template Array range(const dim4& dims, const int seq_dim) { @@ -69,10 +33,10 @@ Array range(const dim4& dims, const int 
seq_dim) Array out = createEmptyArray(dims); switch(_seq_dim) { - case 0: getQueue().enqueue(range, out); break; - case 1: getQueue().enqueue(range, out); break; - case 2: getQueue().enqueue(range, out); break; - case 3: getQueue().enqueue(range, out); break; + case 0: getQueue().enqueue(kernel::range, out); break; + case 1: getQueue().enqueue(kernel::range, out); break; + case 2: getQueue().enqueue(kernel::range, out); break; + case 3: getQueue().enqueue(kernel::range, out); break; default : AF_ERROR("Invalid rep selection", AF_ERR_ARG); } diff --git a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp index cce12268e8..cd44b5e2d0 100644 --- a/src/backend/cpu/reduce.cpp +++ b/src/backend/cpu/reduce.cpp @@ -15,9 +15,9 @@ #include #include #include - #include #include +#include using af::dim4; @@ -38,56 +38,6 @@ struct Binary namespace cpu { -template -struct reduce_dim -{ - void operator()(Array out, const dim_t outOffset, - const Array in, const dim_t inOffset, - const int dim, bool change_nan, double nanval) - { - static const int D1 = D - 1; - static reduce_dim reduce_dim_next; - - const dim4 ostrides = out.strides(); - const dim4 istrides = in.strides(); - const dim4 odims = out.dims(); - - for (dim_t i = 0; i < odims[D1]; i++) { - reduce_dim_next(out, outOffset + i * ostrides[D1], - in, inOffset + i * istrides[D1], - dim, change_nan, nanval); - } - } -}; - -template -struct reduce_dim -{ - - Transform transform; - Binary reduce; - void operator()(Array out, const dim_t outOffset, - const Array in, const dim_t inOffset, - const int dim, bool change_nan, double nanval) - { - const dim4 istrides = in.strides(); - const dim4 idims = in.dims(); - - To * const outPtr = out.get() + outOffset; - Ti const * const inPtr = in.get() + inOffset; - dim_t stride = istrides[dim]; - - To out_val = reduce.init(); - for (dim_t i = 0; i < idims[dim]; i++) { - To in_val = transform(inPtr[i * stride]); - if (change_nan) in_val = IS_NAN(in_val) ? 
nanval : in_val; - out_val = reduce(in_val, out_val); - } - - *outPtr = out_val; - } -}; - template using reduce_dim_func = std::function, const dim_t, const Array, const dim_t, @@ -101,10 +51,10 @@ Array reduce(const Array &in, const int dim, bool change_nan, double nan in.eval(); Array out = createEmptyArray(odims); - static const reduce_dim_func reduce_funcs[4] = { reduce_dim() - , reduce_dim() - , reduce_dim() - , reduce_dim()}; + static const reduce_dim_func reduce_funcs[4] = { kernel::reduce_dim() + , kernel::reduce_dim() + , kernel::reduce_dim() + , kernel::reduce_dim()}; getQueue().enqueue(reduce_funcs[in.ndims() - 1], out, 0, in, 0, dim, change_nan, nanval); diff --git a/src/backend/cpu/regions.cpp b/src/backend/cpu/regions.cpp index f7309c8dbe..ffac11c01d 100644 --- a/src/backend/cpu/regions.cpp +++ b/src/backend/cpu/regions.cpp @@ -19,193 +19,22 @@ #include #include #include +#include using af::dim4; namespace cpu { -template -class LabelNode -{ -private: - T label; - T minLabel; - unsigned rank; - LabelNode* parent; - -public: - LabelNode() : label(0), minLabel(0), rank(0), parent(this) { } - LabelNode(T label) : label(label), minLabel(label), rank(0), parent(this) { } - - T getLabel() - { - return label; - } - - T getMinLabel() - { - return minLabel; - } - - LabelNode* getParent() - { - return parent; - } - - unsigned getRank() - { - return rank; - } - - void setMinLabel(T l) - { - minLabel = l; - } - - void setParent(LabelNode* p) - { - parent = p; - } - - void setRank(unsigned r) - { - rank = r; - } -}; - -template -static LabelNode* find(LabelNode* x) -{ - if (x->getParent() != x) - x->setParent(find(x->getParent())); - return x->getParent(); -} - -template -static void setUnion(LabelNode* x, LabelNode* y) -{ - LabelNode* xRoot = find(x); - LabelNode* yRoot = find(y); - if (xRoot == yRoot) - return; - - T xMinLabel = xRoot->getMinLabel(); - T yMinLabel = yRoot->getMinLabel(); - xRoot->setMinLabel(min(xMinLabel, yMinLabel)); - 
yRoot->setMinLabel(min(xMinLabel, yMinLabel)); - - if (xRoot->getRank() < yRoot->getRank()) - xRoot->setParent(yRoot); - else if (xRoot->getRank() > yRoot->getRank()) - yRoot->setParent(xRoot); - else { - yRoot->setParent(xRoot); - xRoot->setRank(xRoot->getRank() + 1); - } -} - template Array regions(const Array &in, af_connectivity connectivity) { in.eval(); - // Create output placeholder Array out = createValueArray(in.dims(), (T)0); out.eval(); - auto func = [=] (Array out, const Array in, af_connectivity connectivity) { - const dim4 in_dims = in.dims(); - const char *in_ptr = in.get(); - T *out_ptr = out.get(); - - // Map labels - typedef typename std::map* > label_map_t; - typedef typename label_map_t::iterator label_map_iterator_t; - - label_map_t lmap; - - // Initial label - T label = (T)1; - - for (int j = 0; j < (int)in_dims[1]; j++) { - for (int i = 0; i < (int)in_dims[0]; i++) { - int idx = j * in_dims[0] + i; - if (in_ptr[idx] != 0) { - std::vector l; - - // Test neighbors - if (i > 0 && out_ptr[j * (int)in_dims[0] + i-1] > 0) - l.push_back(out_ptr[j * in_dims[0] + i-1]); - if (j > 0 && out_ptr[(j-1) * (int)in_dims[0] + i] > 0) - l.push_back(out_ptr[(j-1) * in_dims[0] + i]); - if (connectivity == AF_CONNECTIVITY_8 && i > 0 && - j > 0 && out_ptr[(j-1) * in_dims[0] + i-1] > 0) - l.push_back(out_ptr[(j-1) * in_dims[0] + i-1]); - if (connectivity == AF_CONNECTIVITY_8 && - i < (int)in_dims[0] - 1 && j > 0 && out_ptr[(j-1) * in_dims[0] + i+1] != 0) - l.push_back(out_ptr[(j-1) * in_dims[0] + i+1]); - - if (!l.empty()) { - T minl = l[0]; - for (size_t k = 0; k < l.size(); k++) { - minl = min(l[k], minl); - label_map_iterator_t cur_map = lmap.find(l[k]); - LabelNode *node = cur_map->second; - // Group labels of the same region under a disjoint set - for (size_t m = k+1; m < l.size(); m++) - setUnion(node, lmap.find(l[m])->second); - } - // Set label to smallest neighbor label - out_ptr[idx] = minl; - } - else { - // Insert new label in map - LabelNode *node = 
new LabelNode(label); - lmap.insert(std::pair* >(label, node)); - out_ptr[idx] = label++; - } - } - } - } - - std::set removed; - - for (int j = 0; j < (int)in_dims[1]; j++) { - for (int i = 0; i < (int)in_dims[0]; i++) { - int idx = j * (int)in_dims[0] + i; - if (in_ptr[idx] != 0) { - T l = out_ptr[idx]; - label_map_iterator_t cur_map = lmap.find(l); - - if (cur_map != lmap.end()) { - LabelNode* node = cur_map->second; - - LabelNode* node_root = find(node); - out_ptr[idx] = node_root->getMinLabel(); - - // Mark removed labels (those that are part of a region - // that contains a smaller label) - if (node->getMinLabel() < l || node_root->getMinLabel() < l) - removed.insert(l); - if (node->getLabel() > node->getMinLabel()) - removed.insert(node->getLabel()); - } - } - } - } - - // Calculate final neighbors (ensure final labels are sequential) - for (int j = 0; j < (int)in_dims[1]; j++) { - for (int i = 0; i < (int)in_dims[0]; i++) { - int idx = j * (int)in_dims[0] + i; - if (out_ptr[idx] > 0) { - out_ptr[idx] -= distance(removed.begin(), removed.lower_bound(out_ptr[idx])); - } - } - } - }; - getQueue().enqueue(func, out, in, connectivity); + getQueue().enqueue(kernel::regions, out, in, connectivity); return out; } diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp index 1ad7dad6dc..162039b36c 100644 --- a/src/backend/cpu/reorder.cpp +++ b/src/backend/cpu/reorder.cpp @@ -9,48 +9,13 @@ #include #include -#include -#include #include #include +#include namespace cpu { -template -void reorder_(Array out, const Array in, const af::dim4 oDims, const af::dim4 rdims) -{ - T* outPtr = out.get(); - const T* inPtr = in.get(); - - const af::dim4 ist = in.strides(); - const af::dim4 ost = out.strides(); - - - dim_t ids[4] = {0}; - for(dim_t ow = 0; ow < oDims[3]; ow++) { - const dim_t oW = ow * ost[3]; - ids[rdims[3]] = ow; - for(dim_t oz = 0; oz < oDims[2]; oz++) { - const dim_t oZW = oW + oz * ost[2]; - ids[rdims[2]] = oz; - for(dim_t oy = 0; oy < oDims[1]; 
oy++) { - const dim_t oYZW = oZW + oy * ost[1]; - ids[rdims[1]] = oy; - for(dim_t ox = 0; ox < oDims[0]; ox++) { - const dim_t oIdx = oYZW + ox; - - ids[rdims[0]] = ox; - const dim_t iIdx = ids[3] * ist[3] + ids[2] * ist[2] + - ids[1] * ist[1] + ids[0]; - - outPtr[oIdx] = inPtr[iIdx]; - } - } - } - } -} - template Array reorder(const Array &in, const af::dim4 &rdims) { @@ -62,7 +27,7 @@ Array reorder(const Array &in, const af::dim4 &rdims) oDims[i] = iDims[rdims[i]]; Array out = createEmptyArray(oDims); - getQueue().enqueue(reorder_, out, in, oDims, rdims); + getQueue().enqueue(kernel::reorder, out, in, oDims, rdims); return out; } diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/resize.cpp index 8fb2edcda6..9a5c85bf1e 100644 --- a/src/backend/cpu/resize.cpp +++ b/src/backend/cpu/resize.cpp @@ -9,174 +9,16 @@ #include #include -#include -#include #include #include #include #include #include +#include namespace cpu { -/** - * noop function for round to avoid compilation - * issues due to lack of this function in C90 based - * compilers, it is only present in C99 and C++11 - * - * This is not a full fledged implementation, this function - * is to be used only for positive numbers, i m using it here - * for calculating dimensions of arrays - */ -dim_t round2int(float value) -{ - return (dim_t)(value+0.5f); -} - -using std::conditional; -using std::is_same; - -template -using wtype_t = typename conditional::value, double, float>::type; - -template -using vtype_t = typename conditional::value, - T, wtype_t - >::type; - -template -struct resize_op -{ - void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t x, const dim_t y) - { - return; - } -}; - -template -struct resize_op -{ - void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t x, const dim_t y) - { - // 
Compute Indices - dim_t i_x = round2int((float)x / (odims[0] / (float)idims[0])); - dim_t i_y = round2int((float)y / (odims[1] / (float)idims[1])); - - if (i_x >= idims[0]) i_x = idims[0] - 1; - if (i_y >= idims[1]) i_y = idims[1] - 1; - - dim_t i_off = i_y * istrides[1] + i_x; - dim_t o_off = y * ostrides[1] + x; - // Copy values from all channels - for(dim_t w = 0; w < odims[3]; w++) { - dim_t wost = w * ostrides[3]; - dim_t wist = w * istrides[3]; - for(dim_t z = 0; z < odims[2]; z++) { - outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist]; - } - } - } -}; - -template -struct resize_op -{ - void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t x, const dim_t y) - { - // Compute Indices - float f_x = (float)x / (odims[0] / (float)idims[0]); - float f_y = (float)y / (odims[1] / (float)idims[1]); - - dim_t i1_x = floor(f_x); - dim_t i1_y = floor(f_y); - - if (i1_x >= idims[0]) i1_x = idims[0] - 1; - if (i1_y >= idims[1]) i1_y = idims[1] - 1; - - float b = f_x - i1_x; - float a = f_y - i1_y; - - dim_t i2_x = (i1_x + 1 >= idims[0] ? idims[0] - 1 : i1_x + 1); - dim_t i2_y = (i1_y + 1 >= idims[1] ? 
idims[1] - 1 : i1_y + 1); - - typedef typename dtype_traits::base_type BT; - typedef wtype_t WT; - typedef vtype_t VT; - - dim_t o_off = y * ostrides[1] + x; - // Copy values from all channels - for(dim_t w = 0; w < odims[3]; w++) { - dim_t wst = w * istrides[3]; - for(dim_t z = 0; z < odims[2]; z++) { - dim_t zst = z * istrides[2]; - dim_t channel_off = zst + wst; - VT p1 = inPtr[i1_y * istrides[1] + i1_x + channel_off]; - VT p2 = inPtr[i2_y * istrides[1] + i1_x + channel_off]; - VT p3 = inPtr[i1_y * istrides[1] + i2_x + channel_off]; - VT p4 = inPtr[i2_y * istrides[1] + i2_x + channel_off]; - - outPtr[o_off + z * ostrides[2] + w * ostrides[3]] = - scalar((1.0f - a) * (1.0f - b)) * p1 + - scalar(( a ) * (1.0f - b)) * p2 + - scalar((1.0f - a) * ( b )) * p3 + - scalar(( a ) * ( b )) * p4; - } - } - } -}; - -template -struct resize_op -{ - void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims, - const af::dim4 &ostrides, const af::dim4 &istrides, - const dim_t x, const dim_t y) - { - // Compute Indices - dim_t i_x = floor((float)x / (odims[0] / (float)idims[0])); - dim_t i_y = floor((float)y / (odims[1] / (float)idims[1])); - - if (i_x >= idims[0]) i_x = idims[0] - 1; - if (i_y >= idims[1]) i_y = idims[1] - 1; - - dim_t i_off = i_y * istrides[1] + i_x; - dim_t o_off = y * ostrides[1] + x; - // Copy values from all channels - for(dim_t w = 0; w < odims[3]; w++) { - dim_t wost = w * ostrides[3]; - dim_t wist = w * istrides[3]; - for(dim_t z = 0; z < odims[2]; z++) { - outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist]; - } - } - } -}; - -template -void resize_(Array out, const Array in) -{ - af::dim4 idims = in.dims(); - af::dim4 odims = out.dims(); - const T *inPtr = in.get(); - T *outPtr = out.get(); - af::dim4 ostrides = out.strides(); - af::dim4 istrides = in.strides(); - - resize_op op; - for(dim_t y = 0; y < odims[1]; y++) { - for(dim_t x = 0; x < odims[0]; x++) { - op(outPtr, inPtr, odims, idims, 
ostrides, istrides, x, y); - } - } -} - template Array resize(const Array &in, const dim_t odim0, const dim_t odim1, const af_interp_type method) @@ -190,11 +32,11 @@ Array resize(const Array &in, const dim_t odim0, const dim_t odim1, switch(method) { case AF_INTERP_NEAREST: - getQueue().enqueue(resize_, out, in); break; + getQueue().enqueue(kernel::resize, out, in); break; case AF_INTERP_BILINEAR: - getQueue().enqueue(resize_, out, in); break; + getQueue().enqueue(kernel::resize, out, in); break; case AF_INTERP_LOWER: - getQueue().enqueue(resize_, out, in); break; + getQueue().enqueue(kernel::resize, out, in); break; default: break; } return out; diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/rotate.cpp index 5687d69c08..e81ee04c80 100644 --- a/src/backend/cpu/rotate.cpp +++ b/src/backend/cpu/rotate.cpp @@ -9,77 +9,14 @@ #include #include -#include -#include -#include #include #include #include "transform_interp.hpp" +#include namespace cpu { -template -void rotate_(Array output, const Array input, const float theta) -{ - const af::dim4 odims = output.dims(); - const af::dim4 idims = input.dims(); - const af::dim4 ostrides = output.strides(); - const af::dim4 istrides = input.strides(); - - const T* in = input.get(); - T* out = output.get(); - dim_t nimages = idims[2]; - - void (*t_fn)(T *, const T *, const float *, const af::dim4 &, - const af::dim4 &, const af::dim4 &, - const dim_t, const dim_t, const dim_t, const dim_t); - - const float c = cos(-theta), s = sin(-theta); - float tx, ty; - { - const float nx = 0.5 * (idims[0] - 1); - const float ny = 0.5 * (idims[1] - 1); - const float mx = 0.5 * (odims[0] - 1); - const float my = 0.5 * (odims[1] - 1); - const float sx = (mx * c + my *-s); - const float sy = (mx * s + my * c); - tx = -(sx - nx); - ty = -(sy - ny); - } - - const float tmat[6] = {std::round( c * 1000) / 1000.0f, - std::round(-s * 1000) / 1000.0f, - std::round(tx * 1000) / 1000.0f, - std::round( s * 1000) / 1000.0f, - std::round( c * 
1000) / 1000.0f, - std::round(ty * 1000) / 1000.0f, - }; - - switch(method) { - case AF_INTERP_NEAREST: - t_fn = &transform_n; - break; - case AF_INTERP_BILINEAR: - t_fn = &transform_b; - break; - case AF_INTERP_LOWER: - t_fn = &transform_l; - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; - } - - - // Do transform for image - for(int yy = 0; yy < (int)odims[1]; yy++) { - for(int xx = 0; xx < (int)odims[0]; xx++) { - t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy); - } - } -} - template Array rotate(const Array &in, const float theta, const af::dim4 &odims, const af_interp_type method) @@ -90,13 +27,13 @@ Array rotate(const Array &in, const float theta, const af::dim4 &odims, switch(method) { case AF_INTERP_NEAREST: - getQueue().enqueue(rotate_, out, in, theta); + getQueue().enqueue(kernel::rotate, out, in, theta); break; case AF_INTERP_BILINEAR: - getQueue().enqueue(rotate_, out, in, theta); + getQueue().enqueue(kernel::rotate, out, in, theta); break; case AF_INTERP_LOWER: - getQueue().enqueue(rotate_, out, in, theta); + getQueue().enqueue(kernel::rotate, out, in, theta); break; default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); diff --git a/src/backend/cpu/scan.cpp b/src/backend/cpu/scan.cpp index 39157ca9a1..615744fd67 100644 --- a/src/backend/cpu/scan.cpp +++ b/src/backend/cpu/scan.cpp @@ -16,64 +16,13 @@ #include #include #include +#include using af::dim4; namespace cpu { -template -struct scan_dim -{ - void operator()(Array out, dim_t outOffset, - const Array in, dim_t inOffset, - const int dim) const - { - const dim4 odims = out.dims(); - const dim4 ostrides = out.strides(); - const dim4 istrides = in.strides(); - - const int D1 = D - 1; - for (dim_t i = 0; i < odims[D1]; i++) { - scan_dim func; - getQueue().enqueue(func, - out, outOffset + i * ostrides[D1], - in, inOffset + i * istrides[D1], dim); - if (D1 == dim) break; - } - } -}; - -template -struct scan_dim -{ - void operator()(Array 
output, dim_t outOffset, - const Array input, dim_t inOffset, - const int dim) const - { - const Ti* in = input.get() + inOffset; - To* out= output.get()+ outOffset; - - const dim4 ostrides = output.strides(); - const dim4 istrides = input.strides(); - const dim4 idims = input.dims(); - - dim_t istride = istrides[dim]; - dim_t ostride = ostrides[dim]; - - Transform transform; - // FIXME: Change the name to something better - Binary scan; - - To out_val = scan.init(); - for (dim_t i = 0; i < idims[dim]; i++) { - To in_val = transform(in[i * istride]); - out_val = scan(in_val, out_val); - out[i * ostride] = out_val; - } - } -}; - template Array scan(const Array& in, const int dim) { @@ -84,19 +33,19 @@ Array scan(const Array& in, const int dim) switch (in.ndims()) { case 1: - scan_dim func1; + kernel::scan_dim func1; getQueue().enqueue(func1, out, 0, in, 0, dim); break; case 2: - scan_dim func2; + kernel::scan_dim func2; getQueue().enqueue(func2, out, 0, in, 0, dim); break; case 3: - scan_dim func3; + kernel::scan_dim func3; getQueue().enqueue(func3, out, 0, in, 0, dim); break; case 4: - scan_dim func4; + kernel::scan_dim func4; getQueue().enqueue(func4, out, 0, in, 0, dim); break; } diff --git a/src/backend/cpu/select.cpp b/src/backend/cpu/select.cpp index 4a219eda04..d9a6795a41 100644 --- a/src/backend/cpu/select.cpp +++ b/src/backend/cpu/select.cpp @@ -6,12 +6,13 @@ * The complete license agreement can be obtained at: * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ + #include #include #include -#include #include #include +#include using af::dim4; @@ -25,66 +26,7 @@ void select(Array &out, const Array &cond, const Array &a, const Arr cond.eval(); a.eval(); b.eval(); - auto func = [=] (Array out, const Array cond, const Array a, const Array b) { - dim4 adims = a.dims(); - dim4 astrides = a.strides(); - dim4 bdims = b.dims(); - dim4 bstrides = b.strides(); - - dim4 cdims = cond.dims(); - dim4 cstrides = 
cond.strides(); - - dim4 odims = out.dims(); - dim4 ostrides = out.strides(); - - bool is_a_same[] = {adims[0] == odims[0], adims[1] == odims[1], - adims[2] == odims[2], adims[3] == odims[3]}; - - bool is_b_same[] = {bdims[0] == odims[0], bdims[1] == odims[1], - bdims[2] == odims[2], bdims[3] == odims[3]}; - - bool is_c_same[] = {cdims[0] == odims[0], cdims[1] == odims[1], - cdims[2] == odims[2], cdims[3] == odims[3]}; - - const T *aptr = a.get(); - const T *bptr = b.get(); - T *optr = out.get(); - const char *cptr = cond.get(); - - for (int l = 0; l < odims[3]; l++) { - - int o_off3 = ostrides[3] * l; - int a_off3 = astrides[3] * is_a_same[3] * l; - int b_off3 = bstrides[3] * is_b_same[3] * l; - int c_off3 = cstrides[3] * is_c_same[3] * l; - - for (int k = 0; k < odims[2]; k++) { - - int o_off2 = ostrides[2] * k + o_off3; - int a_off2 = astrides[2] * is_a_same[2] * k + a_off3; - int b_off2 = bstrides[2] * is_b_same[2] * k + b_off3; - int c_off2 = cstrides[2] * is_c_same[2] * k + c_off3; - - for (int j = 0; j < odims[1]; j++) { - - int o_off1 = ostrides[1] * j + o_off2; - int a_off1 = astrides[1] * is_a_same[1] * j + a_off2; - int b_off1 = bstrides[1] * is_b_same[1] * j + b_off2; - int c_off1 = cstrides[1] * is_c_same[1] * j + c_off2; - - for (int i = 0; i < odims[0]; i++) { - - bool cval = is_c_same[0] ? cptr[c_off1 + i] : cptr[c_off1]; - T aval = is_a_same[0] ? aptr[a_off1 + i] : aptr[a_off1]; - T bval = is_b_same[0] ? bptr[b_off1 + i] : bptr[b_off1]; - T oval = cval ? 
aval : bval; - optr[o_off1 + i] = oval; - } - } - } - } - }; - getQueue().enqueue(func, out, cond, a, b); + getQueue().enqueue(kernel::select, out, cond, a, b); } template @@ -93,44 +35,7 @@ void select_scalar(Array &out, const Array &cond, const Array &a, co out.eval(); cond.eval(); a.eval(); - auto func = [=] (Array out, const Array cond, const Array a, const double b) { - dim4 astrides = a.strides(); - dim4 cstrides = cond.strides(); - - dim4 odims = out.dims(); - dim4 ostrides = out.strides(); - - const T *aptr = a.get(); - T *optr = out.get(); - const char *cptr = cond.get(); - - for (int l = 0; l < odims[3]; l++) { - - int o_off3 = ostrides[3] * l; - int a_off3 = astrides[3] * l; - int c_off3 = cstrides[3] * l; - - for (int k = 0; k < odims[2]; k++) { - - int o_off2 = ostrides[2] * k + o_off3; - int a_off2 = astrides[2] * k + a_off3; - int c_off2 = cstrides[2] * k + c_off3; - - for (int j = 0; j < odims[1]; j++) { - - int o_off1 = ostrides[1] * j + o_off2; - int a_off1 = astrides[1] * j + a_off2; - int c_off1 = cstrides[1] * j + c_off2; - - for (int i = 0; i < odims[0]; i++) { - - optr[o_off1 + i] = (flip ^ cptr[c_off1 + i]) ? aptr[a_off1 + i] : b; - } - } - } - } - }; - getQueue().enqueue(func, out, cond, a, b); + getQueue().enqueue(kernel::select_scalar, out, cond, a, b); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/shift.cpp b/src/backend/cpu/shift.cpp index 766427bff5..eca1e5063f 100644 --- a/src/backend/cpu/shift.cpp +++ b/src/backend/cpu/shift.cpp @@ -9,20 +9,13 @@ #include #include -#include -#include -#include #include #include +#include namespace cpu { -static inline dim_t simple_mod(const dim_t i, const dim_t dim) -{ - return (i < dim) ? 
i : (i - dim); -} - template Array shift(const Array &in, const int sdims[4]) { @@ -31,48 +24,7 @@ Array shift(const Array &in, const int sdims[4]) Array out = createEmptyArray(in.dims()); const af::dim4 temp(sdims[0], sdims[1], sdims[2], sdims[3]); - auto func = [=] (Array out, const Array in, const af::dim4 sdims) { - - T* outPtr = out.get(); - const T* inPtr = in.get(); - - const af::dim4 oDims = out.dims(); - const af::dim4 ist = in.strides(); - const af::dim4 ost = out.strides(); - - int sdims_[4]; - // Need to do this because we are mapping output to input in the kernel - for(int i = 0; i < 4; i++) { - // sdims_[i] will always be positive and always [0, oDims[i]]. - // Negative shifts are converted to position by going the other way round - sdims_[i] = -(sdims[i] % (int)oDims[i]) + oDims[i] * (sdims[i] > 0); - assert(sdims_[i] >= 0 && sdims_[i] <= oDims[i]); - } - - for(dim_t ow = 0; ow < oDims[3]; ow++) { - const int oW = ow * ost[3]; - const int iw = simple_mod((ow + sdims_[3]), oDims[3]); - const int iW = iw * ist[3]; - for(dim_t oz = 0; oz < oDims[2]; oz++) { - const int oZW = oW + oz * ost[2]; - const int iz = simple_mod((oz + sdims_[2]), oDims[2]); - const int iZW = iW + iz * ist[2]; - for(dim_t oy = 0; oy < oDims[1]; oy++) { - const int oYZW = oZW + oy * ost[1]; - const int iy = simple_mod((oy + sdims_[1]), oDims[1]); - const int iYZW = iZW + iy * ist[1]; - for(dim_t ox = 0; ox < oDims[0]; ox++) { - const int oIdx = oYZW + ox; - const int ix = simple_mod((ox + sdims_[0]), oDims[0]); - const int iIdx = iYZW + ix; - - outPtr[oIdx] = inPtr[iIdx]; - } - } - } - } - }; - getQueue().enqueue(func, out, in, temp); + getQueue().enqueue(kernel::shift, out, in, temp); return out; } diff --git a/src/backend/cpu/sift.cpp b/src/backend/cpu/sift.cpp index 70bb11d1ae..4b20f8ab49 100644 --- a/src/backend/cpu/sift.cpp +++ b/src/backend/cpu/sift.cpp @@ -22,7 +22,7 @@ #include #ifdef AF_BUILD_SIFT -#include +#include #endif using af::dim4; diff --git 
a/src/backend/cpu/sobel.cpp b/src/backend/cpu/sobel.cpp index ba47ba9fd6..161266d7cf 100644 --- a/src/backend/cpu/sobel.cpp +++ b/src/backend/cpu/sobel.cpp @@ -13,80 +13,15 @@ #include #include #include -#include #include #include +#include using af::dim4; namespace cpu { -template -void derivative(Array output, const Array input) -{ - const dim4 dims = input.dims(); - const dim4 strides = input.strides(); - To* optr = output.get(); - const Ti* iptr = input.get(); - - for(dim_t b3=0; b3=0 && _joff>=0) ? - iptr[_joff*strides[1]+_ioff*strides[0]] : 0; - To SW = (ioff_<(int)dims[0] && _joff>=0) ? - iptr[_joff*strides[1]+ioff_*strides[0]] : 0; - To NE = (_ioff>=0 && joff_<(int)dims[1]) ? - iptr[joff_*strides[1]+_ioff*strides[0]] : 0; - To SE = (ioff_<(int)dims[0] && joff_<(int)dims[1]) ? - iptr[joff_*strides[1]+ioff_*strides[0]] : 0; - - if (isDX) { - To W = _joff>=0 ? - iptr[_joff*strides[1]+ioff*strides[0]] : 0; - - To E = joff_<(int)dims[1] ? - iptr[joff_*strides[1]+ioff*strides[0]] : 0; - - accum = NW+SW - (NE+SE) + 2*(W-E); - } else { - To N = _ioff>=0 ? - iptr[joff*strides[1]+_ioff*strides[0]] : 0; - - To S = ioff_<(int)dims[0] ? 
- iptr[joff*strides[1]+ioff_*strides[0]] : 0; - - accum = NW+NE - (SW+SE) + 2*(N-S); - } - - optr[joffset+i*strides[0]] = accum; - } - } - - optr += strides[2]; - iptr += strides[2]; - } - optr += strides[3]; - iptr += strides[3]; - } -} - template std::pair< Array, Array > sobelDerivatives(const Array &img, const unsigned &ker_size) @@ -97,8 +32,8 @@ sobelDerivatives(const Array &img, const unsigned &ker_size) Array dx = createEmptyArray(img.dims()); Array dy = createEmptyArray(img.dims()); - getQueue().enqueue(derivative, dx, img); - getQueue().enqueue(derivative, dy, img); + getQueue().enqueue(kernel::derivative, dx, img); + getQueue().enqueue(kernel::derivative, dy, img); return std::make_pair(dx, dy); } diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp index cbdb50e987..6a0465cf37 100644 --- a/src/backend/cpu/sort.cpp +++ b/src/backend/cpu/sort.cpp @@ -11,55 +11,15 @@ #include #include #include -#include -#include #include #include #include #include - -using std::greater; -using std::less; -using std::sort; -using std::function; +#include namespace cpu { -/////////////////////////////////////////////////////////////////////////// -// Kernel Functions -/////////////////////////////////////////////////////////////////////////// - -// Based off of http://stackoverflow.com/a/12399290 -template -void sort0(Array val) -{ - // initialize original index locations - T *val_ptr = val.get(); - - function op = greater(); - if(isAscending) { op = less(); } - - T *comp_ptr = nullptr; - for(dim_t w = 0; w < val.dims()[3]; w++) { - dim_t valW = w * val.strides()[3]; - for(dim_t z = 0; z < val.dims()[2]; z++) { - dim_t valWZ = valW + z * val.strides()[2]; - for(dim_t y = 0; y < val.dims()[1]; y++) { - - dim_t valOffset = valWZ + y * val.strides()[1]; - - comp_ptr = val_ptr + valOffset; - std::sort(comp_ptr, comp_ptr + val.dims()[0], op); - } - } - } - return; -} - -/////////////////////////////////////////////////////////////////////////// -// Wrapper Functions 
-/////////////////////////////////////////////////////////////////////////// template Array sort(const Array &in, const unsigned dim) { @@ -67,7 +27,7 @@ Array sort(const Array &in, const unsigned dim) Array out = copyArray(in); switch(dim) { - case 0: getQueue().enqueue(sort0, out); break; + case 0: getQueue().enqueue(kernel::sort0, out); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } return out; diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp index d2ebd4296d..409b82538e 100644 --- a/src/backend/cpu/sort_by_key.cpp +++ b/src/backend/cpu/sort_by_key.cpp @@ -9,92 +9,13 @@ #include #include -#include -#include -#include -#include -#include -#include #include #include - -using std::greater; -using std::less; -using std::sort; -using std::function; -using std::queue; -using std::async; +#include namespace cpu { -/////////////////////////////////////////////////////////////////////////// -// Kernel Functions -/////////////////////////////////////////////////////////////////////////// - -template -void sort0_by_key(Array okey, Array oval, Array oidx, - const Array ikey, const Array ival) -{ - function op = greater(); - if(isAscending) { op = less(); } - - // Get pointers and initialize original index locations - uint *oidx_ptr = oidx.get(); - Tk *okey_ptr = okey.get(); - Tv *oval_ptr = oval.get(); - const Tk *ikey_ptr = ikey.get(); - const Tv *ival_ptr = ival.get(); - - std::vector seq_vec(oidx.dims()[0]); - std::iota(seq_vec.begin(), seq_vec.end(), 0); - - const Tk *comp_ptr = nullptr; - auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);}; - - for(dim_t w = 0; w < ikey.dims()[3]; w++) { - dim_t okeyW = w * okey.strides()[3]; - dim_t ovalW = w * oval.strides()[3]; - dim_t oidxW = w * oidx.strides()[3]; - dim_t ikeyW = w * ikey.strides()[3]; - dim_t ivalW = w * ival.strides()[3]; - - for(dim_t z = 0; z < ikey.dims()[2]; z++) { - dim_t okeyWZ = okeyW + z * okey.strides()[2]; - 
dim_t ovalWZ = ovalW + z * oval.strides()[2]; - dim_t oidxWZ = oidxW + z * oidx.strides()[2]; - dim_t ikeyWZ = ikeyW + z * ikey.strides()[2]; - dim_t ivalWZ = ivalW + z * ival.strides()[2]; - - for(dim_t y = 0; y < ikey.dims()[1]; y++) { - - dim_t okeyOffset = okeyWZ + y * okey.strides()[1]; - dim_t ovalOffset = ovalWZ + y * oval.strides()[1]; - dim_t oidxOffset = oidxWZ + y * oidx.strides()[1]; - dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1]; - dim_t ivalOffset = ivalWZ + y * ival.strides()[1]; - - uint *ptr = oidx_ptr + oidxOffset; - std::copy(seq_vec.begin(), seq_vec.end(), ptr); - - comp_ptr = ikey_ptr + ikeyOffset; - std::stable_sort(ptr, ptr + ikey.dims()[0], comparator); - - for (dim_t i = 0; i < oval.dims()[0]; ++i){ - uint sortIdx = oidx_ptr[oidxOffset + i]; - okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx]; - oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx]; - } - } - } - } - - return; -} - -/////////////////////////////////////////////////////////////////////////// -// Wrapper Functions -/////////////////////////////////////////////////////////////////////////// template void sort_by_key(Array &okey, Array &oval, const Array &ikey, const Array &ival, const uint dim) @@ -108,7 +29,7 @@ void sort_by_key(Array &okey, Array &oval, oidx.eval(); switch(dim) { - case 0: getQueue().enqueue(sort0_by_key, + case 0: getQueue().enqueue(kernel::sort0_by_key, okey, oval, oidx, ikey, ival); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp index f9415345ae..ed6afea814 100644 --- a/src/backend/cpu/sort_index.cpp +++ b/src/backend/cpu/sort_index.cpp @@ -10,72 +10,15 @@ #include #include #include -#include -#include #include #include #include #include - -using std::greater; -using std::less; -using std::sort; +#include namespace cpu { -/////////////////////////////////////////////////////////////////////////// -// Kernel Functions 
-/////////////////////////////////////////////////////////////////////////// -template -void sort0_index(Array &val, Array &idx, const Array &in) -{ - // initialize original index locations - uint *idx_ptr = idx.get(); - T *val_ptr = val.get(); - const T *in_ptr = in.get(); - function op = greater(); - if(isAscending) { op = less(); } - - std::vector seq_vec(idx.dims()[0]); - std::iota(seq_vec.begin(), seq_vec.end(), 0); - - const T *comp_ptr = nullptr; - auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);}; - - for(dim_t w = 0; w < in.dims()[3]; w++) { - dim_t valW = w * val.strides()[3]; - dim_t idxW = w * idx.strides()[3]; - dim_t inW = w * in.strides()[3]; - for(dim_t z = 0; z < in.dims()[2]; z++) { - dim_t valWZ = valW + z * val.strides()[2]; - dim_t idxWZ = idxW + z * idx.strides()[2]; - dim_t inWZ = inW + z * in.strides()[2]; - for(dim_t y = 0; y < in.dims()[1]; y++) { - - dim_t valOffset = valWZ + y * val.strides()[1]; - dim_t idxOffset = idxWZ + y * idx.strides()[1]; - dim_t inOffset = inWZ + y * in.strides()[1]; - - uint *ptr = idx_ptr + idxOffset; - std::copy(seq_vec.begin(), seq_vec.end(), ptr); - - comp_ptr = in_ptr + inOffset; - std::stable_sort(ptr, ptr + in.dims()[0], comparator); - - for (dim_t i = 0; i < val.dims()[0]; ++i){ - val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]]; - } - } - } - } - - return; -} - -/////////////////////////////////////////////////////////////////////////// -// Wrapper Functions -/////////////////////////////////////////////////////////////////////////// template void sort_index(Array &val, Array &idx, const Array &in, const uint dim) { @@ -84,7 +27,7 @@ void sort_index(Array &val, Array &idx, const Array &in, const uint val = createEmptyArray(in.dims()); idx = createEmptyArray(in.dims()); switch(dim) { - case 0: getQueue().enqueue(sort0_index, val, idx, in); break; + case 0: getQueue().enqueue(kernel::sort0_index, val, idx, in); break; default: AF_ERROR("Not 
Supported", AF_ERR_NOT_SUPPORTED); } } diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp index c278908e40..6e8d0fe5b0 100644 --- a/src/backend/cpu/susan.cpp +++ b/src/backend/cpu/susan.cpp @@ -14,6 +14,7 @@ #include #include #include +#include using af::features; using std::shared_ptr; @@ -21,85 +22,6 @@ using std::shared_ptr; namespace cpu { -template -void susan_responses(Array output, const Array input, - const unsigned idim0, const unsigned idim1, - const int radius, const float t, const float g, - const unsigned border_len) -{ - T* resp_out = output.get(); - const T* in = input.get(); - - const unsigned r = border_len; - const int rSqrd = radius*radius; - - for (unsigned y = r; y < idim1 - r; ++y) { - for (unsigned x = r; x < idim0 - r; ++x) { - const unsigned idx = y * idim0 + x; - T m_0 = in[idx]; - float nM = 0.0f; - - for (int i=-radius; i<=radius; ++i) { - for (int j=-radius; j<=radius; ++j) { - if (i*i + j*j < rSqrd) { - int p = x + i; - int q = y + j; - T m = in[p + idim0 * q]; - float exp_pow = std::pow((m - m_0)/t, 6.0); - float cM = std::exp(-exp_pow); - nM += cM; - } - } - } - - resp_out[idx] = nM < g ? 
g - nM : T(0); - } - } -} - -template -void non_maximal(Array xcoords, Array ycoords, Array response, - shared_ptr counter, const unsigned idim0, const unsigned idim1, - const Array input, const unsigned border_len, const unsigned max_corners) -{ - float* x_out = xcoords.get(); - float* y_out = ycoords.get(); - float* resp_out = response.get(); - unsigned* count = counter.get(); - const T* resp_in= input.get(); - - // Responses on the border don't have 8-neighbors to compare, discard them - const unsigned r = border_len + 1; - - for (unsigned y = r; y < idim1 - r; y++) { - for (unsigned x = r; x < idim0 - r; x++) { - const T v = resp_in[y * idim0 + x]; - - // Find maximum neighborhood response - T max_v; - max_v = max(resp_in[(y-1) * idim0 + x-1], resp_in[y * idim0 + x-1]); - max_v = max(max_v, resp_in[(y+1) * idim0 + x-1]); - max_v = max(max_v, resp_in[(y-1) * idim0 + x ]); - max_v = max(max_v, resp_in[(y+1) * idim0 + x ]); - max_v = max(max_v, resp_in[(y-1) * idim0 + x+1]); - max_v = max(max_v, resp_in[(y) * idim0 + x+1]); - max_v = max(max_v, resp_in[(y+1) * idim0 + x+1]); - - // Stores corner to {x,y,resp}_out if it's response is maximum compared - // to its 8-neighborhood and greater or equal minimum response - if (v > max_v) { - const unsigned idx = *count; - *count += 1; - if (idx < max_corners) { - x_out[idx] = (float)x; - y_out[idx] = (float)y; - resp_out[idx] = (float)v; - } - } - } - } -} - template unsigned susan(Array &x_out, Array &y_out, Array &resp_out, const Array &in, @@ -118,9 +40,9 @@ unsigned susan(Array &x_out, Array &y_out, Array &resp_out, auto corners_found= std::shared_ptr(memAlloc(1), memFree); corners_found.get()[0] = 0; - getQueue().enqueue(susan_responses, response, in, idims[0], idims[1], + getQueue().enqueue(kernel::susan_responses, response, in, idims[0], idims[1], radius, diff_thr, geom_thr, edge); - getQueue().enqueue(non_maximal, x_corners, y_corners, resp_corners, corners_found, + getQueue().enqueue(kernel::non_maximal, 
x_corners, y_corners, resp_corners, corners_found, idims[0], idims[1], response, edge, corner_lim); getQueue().sync(); diff --git a/src/backend/cpu/tile.cpp b/src/backend/cpu/tile.cpp index 4f035450ae..6526917d3a 100644 --- a/src/backend/cpu/tile.cpp +++ b/src/backend/cpu/tile.cpp @@ -9,10 +9,9 @@ #include #include -#include -#include #include #include +#include namespace cpu { @@ -32,40 +31,7 @@ Array tile(const Array &in, const af::dim4 &tileDims) Array out = createEmptyArray(oDims); - auto func = [=] (Array out, const Array in) { - - T* outPtr = out.get(); - const T* inPtr = in.get(); - - const af::dim4 iDims = in.dims(); - const af::dim4 oDims = out.dims(); - const af::dim4 ist = in.strides(); - const af::dim4 ost = out.strides(); - - for(dim_t ow = 0; ow < oDims[3]; ow++) { - const dim_t iw = ow % iDims[3]; - const dim_t iW = iw * ist[3]; - const dim_t oW = ow * ost[3]; - for(dim_t oz = 0; oz < oDims[2]; oz++) { - const dim_t iz = oz % iDims[2]; - const dim_t iZW = iW + iz * ist[2]; - const dim_t oZW = oW + oz * ost[2]; - for(dim_t oy = 0; oy < oDims[1]; oy++) { - const dim_t iy = oy % iDims[1]; - const dim_t iYZW = iZW + iy * ist[1]; - const dim_t oYZW = oZW + oy * ost[1]; - for(dim_t ox = 0; ox < oDims[0]; ox++) { - const dim_t ix = ox % iDims[0]; - const dim_t iMem = iYZW + ix; - const dim_t oMem = oYZW + ox; - outPtr[oMem] = inPtr[iMem]; - } - } - } - } - }; - - getQueue().enqueue(func, out, in); + getQueue().enqueue(kernel::tile, out, in); return out; } diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp index a7287ceea0..fc7145854b 100644 --- a/src/backend/cpu/transform.cpp +++ b/src/backend/cpu/transform.cpp @@ -10,99 +10,14 @@ #include #include #include -#include -#include #include #include #include "transform_interp.hpp" +#include namespace cpu { -template -void calc_affine_inverse(T *txo, const T *txi) -{ - T det = txi[0]*txi[4] - txi[1]*txi[3]; - - txo[0] = txi[4] / det; - txo[1] = txi[3] / det; - txo[3] = txi[1] / det; - 
txo[4] = txi[0] / det; - - txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; - txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; -} - -template -void calc_affine_inverse(T *tmat, const T *tmat_ptr, const bool inverse) -{ - // The way kernel is structured, it expects an inverse - // transform matrix by default. - // If it is an forward transform, then we need its inverse - if(inverse) { - for(int i = 0; i < 6; i++) - tmat[i] = tmat_ptr[i]; - } else { - calc_affine_inverse(tmat, tmat_ptr); - } -} - -template -void transform_(Array output, const Array input, - const Array transform, const bool inverse) -{ - const af::dim4 idims = input.dims(); - const af::dim4 odims = output.dims(); - const af::dim4 istrides = input.strides(); - const af::dim4 ostrides = output.strides(); - - T * out = output.get(); - const T * in = input.get(); - const float* tf = transform.get(); - - dim_t nimages = idims[2]; - // Multiplied in src/backend/transform.cpp - dim_t ntransforms = odims[2] / idims[2]; - - void (*t_fn)(T *, const T *, const float *, const af::dim4 &, - const af::dim4 &, const af::dim4 &, - const dim_t, const dim_t, const dim_t, const dim_t); - - switch(method) { - case AF_INTERP_NEAREST: - t_fn = &transform_n; - break; - case AF_INTERP_BILINEAR: - t_fn = &transform_b; - break; - case AF_INTERP_LOWER: - t_fn = &transform_l; - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; - } - - - // For each transform channel - for(int t_idx = 0; t_idx < (int)ntransforms; t_idx++) { - // Compute inverse if required - const float *tmat_ptr = tf + t_idx * 6; - float tmat[6]; - calc_affine_inverse(tmat, tmat_ptr, inverse); - - // Offset for output pointer - dim_t o_offset = t_idx * nimages * ostrides[2]; - - // Do transform for image - for(int yy = 0; yy < (int)odims[1]; yy++) { - for(int xx = 0; xx < (int)odims[0]; xx++) { - t_fn(out, in, tmat, idims, ostrides, istrides, nimages, o_offset, xx, yy); - } - } - } -} - template Array transform(const Array &in, 
const Array &transform, const af::dim4 &odims, const af_interp_type method, const bool inverse) @@ -114,13 +29,13 @@ Array transform(const Array &in, const Array &transform, const af:: switch(method) { case AF_INTERP_NEAREST : - getQueue().enqueue(transform_, out, in, transform, inverse); + getQueue().enqueue(kernel::transform, out, in, transform, inverse); break; case AF_INTERP_BILINEAR: - getQueue().enqueue(transform_, out, in, transform, inverse); + getQueue().enqueue(kernel::transform, out, in, transform, inverse); break; case AF_INTERP_LOWER : - getQueue().enqueue(transform_, out, in, transform, inverse); + getQueue().enqueue(kernel::transform, out, in, transform, inverse); break; default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); break; } diff --git a/src/backend/cpu/transform_interp.hpp b/src/backend/cpu/transform_interp.hpp index 5ad47507b2..d90ae38f71 100644 --- a/src/backend/cpu/transform_interp.hpp +++ b/src/backend/cpu/transform_interp.hpp @@ -7,6 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once +#include #include #include diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp index 7e7eec1747..32663e1f94 100644 --- a/src/backend/cpu/transpose.cpp +++ b/src/backend/cpu/transpose.cpp @@ -14,7 +14,7 @@ #include #include #include - +#include #include #include @@ -23,74 +23,6 @@ using af::dim4; namespace cpu { -static inline unsigned getIdx(const dim4 &strides, - int i, int j = 0, int k = 0, int l = 0) -{ - return (l * strides[3] + - k * strides[2] + - j * strides[1] + - i ); -} - -template -T getConjugate(const T &in) -{ - // For non-complex types return same - return in; -} - -template<> -cfloat getConjugate(const cfloat &in) -{ - return std::conj(in); -} - -template<> -cdouble getConjugate(const cdouble &in) -{ - return std::conj(in); -} - -template -void transpose_(Array output, const Array input) -{ - const dim4 odims = output.dims(); - const 
dim4 ostrides = output.strides(); - const dim4 istrides = input.strides(); - - T * out = output.get(); - T const * const in = input.get(); - - for (dim_t l = 0; l < odims[3]; ++l) { - for (dim_t k = 0; k < odims[2]; ++k) { - // Outermost loop handles batch mode - // if input has no data along third dimension - // this loop runs only once - for (dim_t j = 0; j < odims[1]; ++j) { - for (dim_t i = 0; i < odims[0]; ++i) { - // calculate array indices based on offsets and strides - // the helper getIdx takes care of indices - const dim_t inIdx = getIdx(istrides,j,i,k,l); - const dim_t outIdx = getIdx(ostrides,i,j,k,l); - if(conjugate) - out[outIdx] = getConjugate(in[inIdx]); - else - out[outIdx] = in[inIdx]; - } - } - // outData and inData pointers doesn't need to be - // offset as the getIdx function is taking care - // of the batch parameter - } - } -} - -template -void transpose_(Array out, const Array in, const bool conjugate) -{ - return (conjugate ? transpose_(out, in) : transpose_(out, in)); -} - template Array transpose(const Array &in, const bool conjugate) { @@ -101,57 +33,16 @@ Array transpose(const Array &in, const bool conjugate) // create an array with first two dimensions swapped Array out = createEmptyArray(outDims); - getQueue().enqueue(transpose_, out, in, conjugate); + getQueue().enqueue(kernel::transpose, out, in, conjugate); return out; } -template -void transpose_inplace(Array input) -{ - const dim4 idims = input.dims(); - const dim4 istrides = input.strides(); - - T * in = input.get(); - - for (dim_t l = 0; l < idims[3]; ++l) { - for (dim_t k = 0; k < idims[2]; ++k) { - // Outermost loop handles batch mode - // if input has no data along third dimension - // this loop runs only once - // - // Run only bottom triangle. 
std::swap swaps with upper triangle - for (dim_t j = 0; j < idims[1]; ++j) { - for (dim_t i = j + 1; i < idims[0]; ++i) { - // calculate array indices based on offsets and strides - // the helper getIdx takes care of indices - const dim_t iIdx = getIdx(istrides,j,i,k,l); - const dim_t oIdx = getIdx(istrides,i,j,k,l); - if(conjugate) { - in[iIdx] = getConjugate(in[iIdx]); - in[oIdx] = getConjugate(in[oIdx]); - std::swap(in[iIdx], in[oIdx]); - } - else { - std::swap(in[iIdx], in[oIdx]); - } - } - } - } - } -} - -template -void transpose_inplace_(Array in, const bool conjugate) -{ - return (conjugate ? transpose_inplace(in) : transpose_inplace(in)); -} - template void transpose_inplace(Array &in, const bool conjugate) { in.eval(); - getQueue().enqueue(transpose_inplace_, in, conjugate); + getQueue().enqueue(kernel::transpose_inplace, in, conjugate); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp index 13bee164eb..2a9553c83a 100644 --- a/src/backend/cpu/triangle.cpp +++ b/src/backend/cpu/triangle.cpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace cpu { @@ -21,46 +22,7 @@ namespace cpu template void triangle(Array &out, const Array &in) { - auto func = [=] (Array out, const Array in) { - T *o = out.get(); - const T *i = in.get(); - - dim4 odm = out.dims(); - - dim4 ost = out.strides(); - dim4 ist = in.strides(); - - for(dim_t ow = 0; ow < odm[3]; ow++) { - const dim_t oW = ow * ost[3]; - const dim_t iW = ow * ist[3]; - - for(dim_t oz = 0; oz < odm[2]; oz++) { - const dim_t oZW = oW + oz * ost[2]; - const dim_t iZW = iW + oz * ist[2]; - - for(dim_t oy = 0; oy < odm[1]; oy++) { - const dim_t oYZW = oZW + oy * ost[1]; - const dim_t iYZW = iZW + oy * ist[1]; - - for(dim_t ox = 0; ox < odm[0]; ox++) { - const dim_t oMem = oYZW + ox; - const dim_t iMem = iYZW + ox; - - bool cond = is_upper ? (oy >= ox) : (oy <= ox); - bool do_unit_diag = (is_unit_diag && ox == oy); - if(cond) { - o[oMem] = do_unit_diag ? 
scalar(1) : i[iMem]; - } else { - o[oMem] = scalar(0); - } - - } - } - } - } - }; - - getQueue().enqueue(func, out, in); + getQueue().enqueue(kernel::triangle, out, in); } template diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp index 41423c746c..1aa37a4762 100644 --- a/src/backend/cpu/unwrap.cpp +++ b/src/backend/cpu/unwrap.cpp @@ -9,76 +9,15 @@ #include #include -#include -#include #include #include #include #include +#include namespace cpu { -template -void unwrap_dim(Array out, const Array in, const dim_t wx, const dim_t wy, - const dim_t sx, const dim_t sy, const dim_t px, const dim_t py) -{ - const T *inPtr = in.get(); - T *outPtr = out.get(); - - af::dim4 idims = in.dims(); - af::dim4 odims = out.dims(); - af::dim4 istrides = in.strides(); - af::dim4 ostrides = out.strides(); - - dim_t nx = (idims[0] + 2 * px - wx) / sx + 1; - - for(dim_t w = 0; w < odims[3]; w++) { - for(dim_t z = 0; z < odims[2]; z++) { - - dim_t cOut = w * ostrides[3] + z * ostrides[2]; - dim_t cIn = w * istrides[3] + z * istrides[2]; - const T* iptr = inPtr + cIn; - T* optr_= outPtr + cOut; - - for(dim_t col = 0; col < odims[d]; col++) { - // Offset output ptr - T* optr = optr_ + col * ostrides[d]; - - // Calculate input window index - dim_t winy = (col / nx); - dim_t winx = (col % nx); - - dim_t startx = winx * sx; - dim_t starty = winy * sy; - - dim_t spx = startx - px; - dim_t spy = starty - py; - - // Short cut condition ensuring all values within input dimensions - bool cond = (spx >= 0 && spx + wx < idims[0] && spy >= 0 && spy + wy < idims[1]); - - for(dim_t y = 0; y < wy; y++) { - for(dim_t x = 0; x < wx; x++) { - dim_t xpad = spx + x; - dim_t ypad = spy + y; - - dim_t oloc = (y * wx + x); - if (d == 0) oloc *= ostrides[1]; - - if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) { - dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]); - optr[oloc] = iptr[iloc]; - } else { - optr[oloc] = scalar(0.0); - } - } - } - } - } - } -} - 
template Array unwrap(const Array &in, const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column) @@ -98,9 +37,9 @@ Array unwrap(const Array &in, const dim_t wx, const dim_t wy, Array outArray = createEmptyArray(odims); if (is_column) { - getQueue().enqueue(unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); + getQueue().enqueue(kernel::unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); } else { - getQueue().enqueue(unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); + getQueue().enqueue(kernel::unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); } return outArray; diff --git a/src/backend/cpu/wrap.cpp b/src/backend/cpu/wrap.cpp index 3ff54de640..07487e0d68 100644 --- a/src/backend/cpu/wrap.cpp +++ b/src/backend/cpu/wrap.cpp @@ -9,75 +9,15 @@ #include #include -#include -#include #include #include #include #include +#include namespace cpu { -template -void wrap_dim(Array out, const Array in, const dim_t wx, const dim_t wy, - const dim_t sx, const dim_t sy, const dim_t px, const dim_t py) -{ - const T *inPtr = in.get(); - T *outPtr = out.get(); - - af::dim4 idims = in.dims(); - af::dim4 odims = out.dims(); - af::dim4 istrides = in.strides(); - af::dim4 ostrides = out.strides(); - - dim_t nx = (odims[0] + 2 * px - wx) / sx + 1; - - for(dim_t w = 0; w < idims[3]; w++) { - for(dim_t z = 0; z < idims[2]; z++) { - - dim_t cIn = w * istrides[3] + z * istrides[2]; - dim_t cOut = w * ostrides[3] + z * ostrides[2]; - const T* iptr_ = inPtr + cIn; - T* optr= outPtr + cOut; - - for(dim_t col = 0; col < idims[d]; col++) { - // Offset output ptr - const T* iptr = iptr_ + col * istrides[d]; - - // Calculate input window index - dim_t winy = (col / nx); - dim_t winx = (col % nx); - - dim_t startx = winx * sx; - dim_t starty = winy * sy; - - dim_t spx = startx - px; - dim_t spy = starty - py; - - // Short cut condition ensuring all values within input dimensions - bool cond = (spx >= 0 && spx + wx < odims[0] && spy >= 0 && spy 
+ wy < odims[1]); - - for(dim_t y = 0; y < wy; y++) { - for(dim_t x = 0; x < wx; x++) { - dim_t xpad = spx + x; - dim_t ypad = spy + y; - - dim_t iloc = (y * wx + x); - if (d == 0) iloc *= istrides[1]; - - if(cond || (xpad >= 0 && xpad < odims[0] && ypad >= 0 && ypad < odims[1])) { - dim_t oloc = (ypad * ostrides[1] + xpad * ostrides[0]); - // FIXME: When using threads, atomize this - optr[oloc] += iptr[iloc]; - } - } - } - } - } - } -} - template Array wrap(const Array &in, const dim_t ox, const dim_t oy, @@ -94,9 +34,9 @@ Array wrap(const Array &in, in.eval(); if (is_column) { - getQueue().enqueue(wrap_dim, out, in, wx, wy, sx, sy, px, py); + getQueue().enqueue(kernel::wrap_dim, out, in, wx, wy, sx, sy, px, py); } else { - getQueue().enqueue(wrap_dim, out, in, wx, wy, sx, sy, px, py); + getQueue().enqueue(kernel::wrap_dim, out, in, wx, wy, sx, sy, px, py); } return out; From 1313f984734b6cee4d254de34d40abb5682fa42c Mon Sep 17 00:00:00 2001 From: pradeep Date: Sun, 20 Dec 2015 11:17:08 -0500 Subject: [PATCH 088/288] Fixed the bug in cpu ireduce kernel function --- src/backend/cpu/kernel/ireduce.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/cpu/kernel/ireduce.hpp b/src/backend/cpu/kernel/ireduce.hpp index 1f5a51da62..848885515b 100644 --- a/src/backend/cpu/kernel/ireduce.hpp +++ b/src/backend/cpu/kernel/ireduce.hpp @@ -94,13 +94,13 @@ struct ireduce_dim uint * loc = locArray.get(); dim_t stride = istrides[dim]; - MinMaxOp Op(in[0], 0); + MinMaxOp Op(in[inOffset], 0); for (dim_t i = 0; i < idims[dim]; i++) { Op(in[inOffset + i * stride], i); } - *(out+outOffset) = Op.m_val; - *(loc+outOffset) = Op.m_idx; + out[outOffset] = Op.m_val; + loc[outOffset] = Op.m_idx; } }; From 5c0160863c9dd64c1733497fdce24dfdef823bc7 Mon Sep 17 00:00:00 2001 From: syurkevi Date: Mon, 21 Dec 2015 13:24:01 -0500 Subject: [PATCH 089/288] remove state globals --- examples/graphics/gravity_sim.cpp | 79 +++++++++++++++---------------- 1 file changed, 38 
insertions(+), 41 deletions(-) diff --git a/examples/graphics/gravity_sim.cpp b/examples/graphics/gravity_sim.cpp index 94d321ba7c..25ca0da4d7 100644 --- a/examples/graphics/gravity_sim.cpp +++ b/examples/graphics/gravity_sim.cpp @@ -17,43 +17,36 @@ using namespace std; static const int width = 512, height = 512; static const int pixels_per_unit = 20; -af::array p_x; -af::array p_y; -af::array vels_x; -af::array vels_y; -af::array forces_x; -af::array forces_y; - -void simulate(float dt){ - p_x += vels_x * pixels_per_unit * dt; - p_y += vels_y * pixels_per_unit * dt; +void simulate(af::array *pos, af::array *vels, af::array *forces, float dt){ + pos[0] += vels[0] * pixels_per_unit * dt; + pos[1] += vels[1] * pixels_per_unit * dt; //calculate distance to center - af::array diff_x = p_x - width/2; - af::array diff_y = p_y - height/2; + af::array diff_x = pos[0] - width/2; + af::array diff_y = pos[1] - height/2; af::array dist = sqrt( diff_x*diff_x + diff_y*diff_y ); //calculate normalised force vectors - forces_x = -1 * diff_x / dist; - forces_y = -1 * diff_y / dist; + forces[0] = -1 * diff_x / dist; + forces[1] = -1 * diff_y / dist; //update force scaled to time and magnitude constant - forces_x *= pixels_per_unit * dt; - forces_y *= pixels_per_unit * dt; + forces[0] *= pixels_per_unit * dt; + forces[1] *= pixels_per_unit * dt; //dampening - vels_x *= 1 - (0.005*dt); - vels_y *= 1 - (0.005*dt); + vels[0] *= 1 - (0.005*dt); + vels[1] *= 1 - (0.005*dt); //update velocities from forces - vels_x += forces_x; - vels_y += forces_y; + vels[0] += forces[0]; + vels[1] += forces[1]; } -void collisions(){ +void collisions(af::array *pos, af::array *vels){ //clamp particles inside screen border - af::array projected_px = min(width, max(0, p_x)); - af::array projected_py = min(height - 1, max(0, p_y)); + af::array projected_px = min(width, max(0, pos[0])); + af::array projected_py = min(height - 1, max(0, pos[1])); //calculate distance to center af::array diff_x = projected_px - 
width/2; @@ -64,15 +57,15 @@ void collisions(){ const int radius = 50; const float elastic_constant = 0.91f; if(sum(dist 0) { - vels_x(dist Date: Mon, 21 Dec 2015 13:43:09 -0500 Subject: [PATCH 090/288] remove windows pause ifdef --- examples/graphics/gravity_sim.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/examples/graphics/gravity_sim.cpp b/examples/graphics/gravity_sim.cpp index 25ca0da4d7..3fc19d8c65 100644 --- a/examples/graphics/gravity_sim.cpp +++ b/examples/graphics/gravity_sim.cpp @@ -135,13 +135,6 @@ int main(int argc, char *argv[]) throw; } - #ifdef WIN32 // pause in Windows - if (!(argc == 2 && argv[1][0] == '-')) { - printf("hit [enter]..."); - fflush(stdout); - getchar(); - } - #endif return 0; } From b684b06418efe9d2c0f0f3d4be8389c4b2d7789a Mon Sep 17 00:00:00 2001 From: pradeep Date: Mon, 21 Dec 2015 18:19:39 -0500 Subject: [PATCH 091/288] Fixed orb async cpu fn It was a bug in upstream function cpu::fast --- src/backend/cpu/fast.cpp | 2 +- src/backend/cpu/orb.cpp | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/backend/cpu/fast.cpp b/src/backend/cpu/fast.cpp index fe02387102..42607d888f 100644 --- a/src/backend/cpu/fast.cpp +++ b/src/backend/cpu/fast.cpp @@ -30,7 +30,6 @@ unsigned fast(Array &x_out, Array &y_out, Array &score_out, const unsigned edge) { in.eval(); - getQueue().sync(); dim4 in_dims = in.dims(); const unsigned max_feat = ceil(in.elements() * feature_ratio); @@ -43,6 +42,7 @@ unsigned fast(Array &x_out, Array &y_out, Array &score_out, V = createValueArray(V_dims, (float)0); V.eval(); } + getQueue().sync(); // Arrays containing all features detected before non-maximal suppression. 
dim4 max_feat_dims(max_feat); diff --git a/src/backend/cpu/orb.cpp b/src/backend/cpu/orb.cpp index 00fe8203d4..5dd9326134 100644 --- a/src/backend/cpu/orb.cpp +++ b/src/backend/cpu/orb.cpp @@ -103,12 +103,13 @@ unsigned orb(Array &x, Array &y, ldims[1] = round(idims[1] / lvl_scl); lvl_img = resize(prev_img, ldims[0], ldims[1], AF_INTERP_BILINEAR); - lvl_img.eval(); - getQueue().sync(); prev_img = lvl_img; prev_ldims = lvl_img.dims(); } + prev_img.eval(); + lvl_img.eval(); + getQueue().sync(); Array x_feat = createEmptyArray(dim4()); @@ -125,10 +126,6 @@ unsigned orb(Array &x, Array &y, unsigned lvl_feat = fast(x_feat, y_feat, score_feat, lvl_img, fast_thr, 9, 1, 0.15f, edge); - x_feat.eval(); - y_feat.eval(); - score_feat.eval(); - getQueue().sync(); if (lvl_feat == 0) { continue; @@ -164,8 +161,6 @@ unsigned orb(Array &x, Array &y, Array harris_idx = createEmptyArray(af::dim4()); sort_index(harris_sorted, harris_idx, score_harris, 0); - harris_sorted.eval(); - harris_idx.eval(); getQueue().sync(); usable_feat = std::min(usable_feat, lvl_best[i]); @@ -203,6 +198,7 @@ unsigned orb(Array &x, Array &y, h_gauss = memAlloc(gauss_dims[0]); gaussian1D(h_gauss, gauss_dims[0], 2.f); gauss_filter = createDeviceDataArray(gauss_dims, h_gauss); + gauss_filter.eval(); } // Filter level image with Gaussian kernel to reduce noise sensitivity From a0f17b6ba7adedf0f7ee1093bf8217661fe77679 Mon Sep 17 00:00:00 2001 From: pradeep Date: Mon, 28 Dec 2015 16:29:49 -0500 Subject: [PATCH 092/288] cmake fix to check for threads submodule --- src/backend/cpu/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt index 62f0b3a55e..c2b4e97cd2 100644 --- a/src/backend/cpu/CMakeLists.txt +++ b/src/backend/cpu/CMakeLists.txt @@ -47,6 +47,18 @@ IF(NOT UNIX) ADD_DEFINITIONS(-DAFDLL) ENDIF() +SET(THREADS_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/threads") +IF(EXISTS "${THREADS_SRC_DIR}" AND IS_DIRECTORY 
"${THREADS_SRC_DIR}") + # threads submodule has been initialized + # Nothing to do +ELSE(EXISTS "${THREADS_SRC_DIR}" AND IS_DIRECTORY "${THREADS_SRC_DIR}") + MESSAGE(STATUS "threads submodule unavailable. Updating submodules.") + EXECUTE_PROCESS( + COMMAND git submodule update --init --recursive + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + ) +ENDIF(EXISTS "${THREADS_SRC_DIR}" AND IS_DIRECTORY "${THREADS_SRC_DIR}") + INCLUDE_DIRECTORIES( ${CMAKE_INCLUDE_PATH} "${CMAKE_SOURCE_DIR}/src/backend/cpu" From c539f1d5ab0fbb5b609f30df858fbb8270e1df99 Mon Sep 17 00:00:00 2001 From: pradeep Date: Mon, 28 Dec 2015 16:40:25 -0500 Subject: [PATCH 093/288] moved fft cpu fns implementations to kernel namespace --- src/backend/cpu/fft.cpp | 188 ++------------------------------ src/backend/cpu/kernel/fft.hpp | 192 +++++++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+), 180 deletions(-) create mode 100644 src/backend/cpu/kernel/fft.hpp diff --git a/src/backend/cpu/fft.cpp b/src/backend/cpu/fft.cpp index e522954cfe..2edced2219 100644 --- a/src/backend/cpu/fft.cpp +++ b/src/backend/cpu/fft.cpp @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -24,143 +23,11 @@ using af::dim4; namespace cpu { -template -void computeDims(int rdims[rank], const dim4 &idims) -{ - for (int i = 0; i < rank; i++) { - rdims[i] = idims[(rank -1) - i]; - } -} - -template -struct fftw_transform; - -#define TRANSFORM(PRE, TY) \ - template<> \ - struct fftw_transform \ - { \ - typedef PRE##_plan plan_t; \ - typedef PRE##_complex ctype_t; \ - \ - template \ - plan_t create(Args... 
args) \ - { return PRE##_plan_many_dft(args...); } \ - void execute(plan_t plan) { return PRE##_execute(plan); } \ - void destroy(plan_t plan) { return PRE##_destroy_plan(plan); } \ - }; \ - - -TRANSFORM(fftwf, cfloat) -TRANSFORM(fftw, cdouble) - -template -void fft_inplace_(Array in) -{ - int t_dims[rank]; - int in_embed[rank]; - - const dim4 idims = in.dims(); - - computeDims(t_dims , idims); - computeDims(in_embed , in.getDataDims()); - - const dim4 istrides = in.strides(); - - typedef typename fftw_transform::ctype_t ctype_t; - typename fftw_transform::plan_t plan; - - fftw_transform transform; - - int batch = 1; - for (int i = rank; i < 4; i++) { - batch *= idims[i]; - } - - plan = transform.create(rank, - t_dims, - (int)batch, - (ctype_t *)in.get(), - in_embed, (int)istrides[0], - (int)istrides[rank], - (ctype_t *)in.get(), - in_embed, (int)istrides[0], - (int)istrides[rank], - direction ? FFTW_FORWARD : FFTW_BACKWARD, - FFTW_ESTIMATE); - - transform.execute(plan); - transform.destroy(plan); -} - template void fft_inplace(Array &in) { in.eval(); - getQueue().enqueue(fft_inplace_, in); -} - -template -struct fftw_real_transform; - -#define TRANSFORM_REAL(PRE, To, Ti, POST) \ - template<> \ - struct fftw_real_transform \ - { \ - typedef PRE##_plan plan_t; \ - typedef PRE##_complex ctype_t; \ - \ - template \ - plan_t create(Args... 
args) \ - { return PRE##_plan_many_dft_##POST(args...); } \ - void execute(plan_t plan) { return PRE##_execute(plan); } \ - void destroy(plan_t plan) { return PRE##_destroy_plan(plan); } \ - }; \ - - -TRANSFORM_REAL(fftwf, cfloat , float , r2c) -TRANSFORM_REAL(fftw , cdouble, double, r2c) -TRANSFORM_REAL(fftwf, float , cfloat , c2r) -TRANSFORM_REAL(fftw , double, cdouble, c2r) - -template -void fft_r2c_(Array out, const Array in) -{ - dim4 idims = in.dims(); - - int t_dims[rank]; - int in_embed[rank]; - int out_embed[rank]; - - computeDims(t_dims , idims); - computeDims(in_embed , in.getDataDims()); - computeDims(out_embed , out.getDataDims()); - - const dim4 istrides = in.strides(); - const dim4 ostrides = out.strides(); - - typedef typename fftw_real_transform::ctype_t ctype_t; - typename fftw_real_transform::plan_t plan; - - fftw_real_transform transform; - - int batch = 1; - for (int i = rank; i < 4; i++) { - batch *= idims[i]; - } - - plan = transform.create(rank, - t_dims, - (int)batch, - (Tr *)in.get(), - in_embed, (int)istrides[0], - (int)istrides[rank], - (ctype_t *)out.get(), - out_embed, (int)ostrides[0], - (int)ostrides[rank], - FFTW_ESTIMATE); - - transform.execute(plan); - transform.destroy(plan); + getQueue().enqueue(kernel::fft_inplace, in); } template @@ -172,57 +39,18 @@ Array fft_r2c(const Array &in) odims[0] = odims[0] / 2 + 1; Array out = createEmptyArray(odims); - getQueue().enqueue(fft_r2c_, out, in); + getQueue().enqueue(kernel::fft_r2c, out, in); return out; } -template -void fft_c2r_(Array out, const Array in, const dim4 odims) -{ - int t_dims[rank]; - int in_embed[rank]; - int out_embed[rank]; - - computeDims(t_dims , odims); - computeDims(in_embed , in.getDataDims()); - computeDims(out_embed , out.getDataDims()); - - const dim4 istrides = in.strides(); - const dim4 ostrides = out.strides(); - - typedef typename fftw_real_transform::ctype_t ctype_t; - typename fftw_real_transform::plan_t plan; - - fftw_real_transform transform; - - int 
batch = 1; - for (int i = rank; i < 4; i++) { - batch *= odims[i]; - } - - plan = transform.create(rank, - t_dims, - (int)batch, - (ctype_t *)in.get(), - in_embed, (int)istrides[0], - (int)istrides[rank], - (Tr *)out.get(), - out_embed, (int)ostrides[0], - (int)ostrides[rank], - FFTW_ESTIMATE); - - transform.execute(plan); - transform.destroy(plan); -} - template Array fft_c2r(const Array &in, const dim4 &odims) { in.eval(); Array out = createEmptyArray(odims); - getQueue().enqueue(fft_c2r_, out, in, odims); + getQueue().enqueue(kernel::fft_c2r, out, in, odims); return out; } @@ -235,8 +63,8 @@ Array fft_c2r(const Array &in, const dim4 &odims) template void fft_inplace(Array &in); \ template void fft_inplace(Array &in); - INSTANTIATE(cfloat ) - INSTANTIATE(cdouble) +INSTANTIATE(cfloat ) +INSTANTIATE(cdouble) #define INSTANTIATE_REAL(Tr, Tc) \ template Array fft_r2c(const Array &in); \ @@ -246,7 +74,7 @@ Array fft_c2r(const Array &in, const dim4 &odims) template Array fft_c2r(const Array &in, const dim4 &odims); \ template Array fft_c2r(const Array &in, const dim4 &odims); \ - INSTANTIATE_REAL(float , cfloat ) - INSTANTIATE_REAL(double, cdouble) +INSTANTIATE_REAL(float , cfloat ) +INSTANTIATE_REAL(double, cdouble) } diff --git a/src/backend/cpu/kernel/fft.hpp b/src/backend/cpu/kernel/fft.hpp new file mode 100644 index 0000000000..906c8ef5f5 --- /dev/null +++ b/src/backend/cpu/kernel/fft.hpp @@ -0,0 +1,192 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void computeDims(int rdims[rank], const af::dim4 &idims) +{ + for (int i = 0; i < rank; i++) { + rdims[i] = idims[(rank -1) - i]; + } +} + +template +struct fftw_transform; + +#define TRANSFORM(PRE, TY) \ + template<> \ + struct fftw_transform \ + { \ + typedef PRE##_plan plan_t; \ + typedef PRE##_complex ctype_t; \ + \ + template \ + plan_t create(Args... args) \ + { return PRE##_plan_many_dft(args...); } \ + void execute(plan_t plan) { return PRE##_execute(plan); } \ + void destroy(plan_t plan) { return PRE##_destroy_plan(plan); } \ + }; \ + + +TRANSFORM(fftwf, cfloat) +TRANSFORM(fftw, cdouble) + +template +struct fftw_real_transform; + +#define TRANSFORM_REAL(PRE, To, Ti, POST) \ + template<> \ + struct fftw_real_transform \ + { \ + typedef PRE##_plan plan_t; \ + typedef PRE##_complex ctype_t; \ + \ + template \ + plan_t create(Args... 
args) \ + { return PRE##_plan_many_dft_##POST(args...); } \ + void execute(plan_t plan) { return PRE##_execute(plan); } \ + void destroy(plan_t plan) { return PRE##_destroy_plan(plan); } \ + }; \ + + +TRANSFORM_REAL(fftwf, cfloat , float , r2c) +TRANSFORM_REAL(fftw , cdouble, double, r2c) +TRANSFORM_REAL(fftwf, float , cfloat , c2r) +TRANSFORM_REAL(fftw , double, cdouble, c2r) + + +template +void fft_inplace(Array in) +{ + int t_dims[rank]; + int in_embed[rank]; + + const af::dim4 idims = in.dims(); + + computeDims(t_dims , idims); + computeDims(in_embed , in.getDataDims()); + + const af::dim4 istrides = in.strides(); + + typedef typename fftw_transform::ctype_t ctype_t; + typename fftw_transform::plan_t plan; + + fftw_transform transform; + + int batch = 1; + for (int i = rank; i < 4; i++) { + batch *= idims[i]; + } + + plan = transform.create(rank, + t_dims, + (int)batch, + (ctype_t *)in.get(), + in_embed, (int)istrides[0], + (int)istrides[rank], + (ctype_t *)in.get(), + in_embed, (int)istrides[0], + (int)istrides[rank], + direction ? 
FFTW_FORWARD : FFTW_BACKWARD, + FFTW_ESTIMATE); + + transform.execute(plan); + transform.destroy(plan); +} + +template +void fft_r2c(Array out, const Array in) +{ + af::dim4 idims = in.dims(); + + int t_dims[rank]; + int in_embed[rank]; + int out_embed[rank]; + + computeDims(t_dims , idims); + computeDims(in_embed , in.getDataDims()); + computeDims(out_embed , out.getDataDims()); + + const af::dim4 istrides = in.strides(); + const af::dim4 ostrides = out.strides(); + + typedef typename fftw_real_transform::ctype_t ctype_t; + typename fftw_real_transform::plan_t plan; + + fftw_real_transform transform; + + int batch = 1; + for (int i = rank; i < 4; i++) { + batch *= idims[i]; + } + + plan = transform.create(rank, + t_dims, + (int)batch, + (Tr *)in.get(), + in_embed, (int)istrides[0], + (int)istrides[rank], + (ctype_t *)out.get(), + out_embed, (int)ostrides[0], + (int)ostrides[rank], + FFTW_ESTIMATE); + + transform.execute(plan); + transform.destroy(plan); +} + +template +void fft_c2r(Array out, const Array in, const af::dim4 odims) +{ + int t_dims[rank]; + int in_embed[rank]; + int out_embed[rank]; + + computeDims(t_dims , odims); + computeDims(in_embed , in.getDataDims()); + computeDims(out_embed , out.getDataDims()); + + const af::dim4 istrides = in.strides(); + const af::dim4 ostrides = out.strides(); + + typedef typename fftw_real_transform::ctype_t ctype_t; + typename fftw_real_transform::plan_t plan; + + fftw_real_transform transform; + + int batch = 1; + for (int i = rank; i < 4; i++) { + batch *= odims[i]; + } + + plan = transform.create(rank, + t_dims, + (int)batch, + (ctype_t *)in.get(), + in_embed, (int)istrides[0], + (int)istrides[rank], + (Tr *)out.get(), + out_embed, (int)ostrides[0], + (int)ostrides[rank], + FFTW_ESTIMATE); + + transform.execute(plan); + transform.destroy(plan); +} + +} +} From 483121596dd76d4d9c7227d2057b49863d073275 Mon Sep 17 00:00:00 2001 From: pradeep Date: Mon, 28 Dec 2015 16:47:42 -0500 Subject: [PATCH 094/288] moved dot cpu 
implementation to kernel namespace --- src/backend/cpu/blas.cpp | 47 ++++++++-------------------------- src/backend/cpu/kernel/dot.hpp | 46 +++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 36 deletions(-) create mode 100644 src/backend/cpu/kernel/dot.hpp diff --git a/src/backend/cpu/blas.cpp b/src/backend/cpu/blas.cpp index 26ec8b488b..d6f5dee203 100644 --- a/src/backend/cpu/blas.cpp +++ b/src/backend/cpu/blas.cpp @@ -11,20 +11,20 @@ #include #include #include -#include #include +#include #include #include namespace cpu { - using std::add_const; - using std::add_pointer; - using std::enable_if; - using std::is_floating_point; - using std::remove_const; - using std::conditional; +using std::add_const; +using std::add_pointer; +using std::enable_if; +using std::is_floating_point; +using std::remove_const; +using std::conditional; // Some implementations of BLAS require void* for complex pointers while others use float*/double* // @@ -199,31 +199,6 @@ Array matmul(const Array &lhs, const Array &rhs, return out; } -template T -conj(T x) { return x; } - -template<> cfloat conj (cfloat c) { return std::conj(c); } -template<> cdouble conj(cdouble c) { return std::conj(c); } - -template -void dot_(Array output, const Array &lhs, const Array &rhs, - af_mat_prop optLhs, af_mat_prop optRhs) -{ - int N = lhs.dims()[0]; - - T out = 0; - const T *pL = lhs.get(); - const T *pR = rhs.get(); - - for(int i = 0; i < N; i++) - out += (conjugate ? 
cpu::conj(pL[i]) : pL[i]) * pR[i]; - - if(both_conjugate) out = cpu::conj(out); - - *output.get() = out; - -} - template Array dot(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) @@ -233,13 +208,13 @@ Array dot(const Array &lhs, const Array &rhs, Array out = createEmptyArray(af::dim4(1)); if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) { - getQueue().enqueue(dot_, out, lhs, rhs, optLhs, optRhs); + getQueue().enqueue(kernel::dot, out, lhs, rhs, optLhs, optRhs); } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) { - getQueue().enqueue(dot_,out, lhs, rhs, optLhs, optRhs); + getQueue().enqueue(kernel::dot,out, lhs, rhs, optLhs, optRhs); } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) { - getQueue().enqueue(dot_,out, rhs, lhs, optRhs, optLhs); + getQueue().enqueue(kernel::dot,out, rhs, lhs, optRhs, optLhs); } else { - getQueue().enqueue(dot_,out, lhs, rhs, optLhs, optRhs); + getQueue().enqueue(kernel::dot,out, lhs, rhs, optLhs, optRhs); } return out; } diff --git a/src/backend/cpu/kernel/dot.hpp b/src/backend/cpu/kernel/dot.hpp new file mode 100644 index 0000000000..ef518413c7 --- /dev/null +++ b/src/backend/cpu/kernel/dot.hpp @@ -0,0 +1,46 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template T +conj(T x) { return x; } + +template<> cfloat conj (cfloat c) { return std::conj(c); } +template<> cdouble conj(cdouble c) { return std::conj(c); } + +template +void dot(Array output, const Array &lhs, const Array &rhs, + af_mat_prop optLhs, af_mat_prop optRhs) +{ + int N = lhs.dims()[0]; + + T out = 0; + const T *pL = lhs.get(); + const T *pR = rhs.get(); + + for(int i = 0; i < N; i++) + out += (conjugate ? kernel::conj(pL[i]) : pL[i]) * pR[i]; + + if(both_conjugate) out = kernel::conj(out); + + *output.get() = out; + +} + +} +} From d1089f858eb15d2e74b2f63a5194ca8ab8db59a6 Mon Sep 17 00:00:00 2001 From: pradeep Date: Mon, 28 Dec 2015 16:54:30 -0500 Subject: [PATCH 095/288] moved fftconvolve reorder helper fn to kernel namespace --- src/backend/cpu/fftconvolve.cpp | 32 +++----------------------- src/backend/cpu/kernel/fftconvolve.hpp | 29 +++++++++++++++++++++++ 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/src/backend/cpu/fftconvolve.cpp b/src/backend/cpu/fftconvolve.cpp index 2678c7b6f0..c0a9a41240 100644 --- a/src/backend/cpu/fftconvolve.cpp +++ b/src/backend/cpu/fftconvolve.cpp @@ -211,35 +211,9 @@ Array fftconvolve(Array const& signal, Array const& filter, Array out = createEmptyArray(oDims); - auto reorderFunc = [=](Array out, Array packed, - const Array filter, const dim_t sig_hald_d0, const dim_t fftScale, - const dim4 sig_tmp_dims, const dim4 sig_tmp_strides, - const dim4 filter_tmp_dims, const dim4 filter_tmp_strides) { - T* out_ptr = out.get(); - const af::dim4 out_dims = out.dims(); - const af::dim4 out_strides = out.strides(); - - const af::dim4 filter_dims = filter.dims(); - - convT* packed_ptr = packed.get(); - convT* sig_tmp_ptr = packed_ptr; - convT* filter_tmp_ptr = 
packed_ptr + sig_tmp_strides[3] * sig_tmp_dims[3]; - - // Reorder the output - if (kind == CONVOLVE_BATCH_KERNEL) { - kernel::reorderHelper(out_ptr, out_dims, out_strides, - filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, - filter_dims, sig_half_d0, baseDim, fftScale, expand); - } else { - kernel::reorderHelper(out_ptr, out_dims, out_strides, - sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, - filter_dims, sig_half_d0, baseDim, fftScale, expand); - } - }; - - getQueue().enqueue(reorderFunc, out, packed, filter, sig_half_d0, fftScale, - sig_tmp_dims, sig_tmp_strides, - filter_tmp_dims, filter_tmp_strides); + getQueue().enqueue(kernel::reorder, out, packed, filter, + sig_half_d0, fftScale, sig_tmp_dims, sig_tmp_strides, filter_tmp_dims, + filter_tmp_strides, expand, kind); return out; } diff --git a/src/backend/cpu/kernel/fftconvolve.hpp b/src/backend/cpu/kernel/fftconvolve.hpp index 6213cb2730..ad586f7d28 100644 --- a/src/backend/cpu/kernel/fftconvolve.hpp +++ b/src/backend/cpu/kernel/fftconvolve.hpp @@ -223,5 +223,34 @@ void reorderHelper(To* out_ptr, const af::dim4& od, const af::dim4& os, } } +template +void reorder(Array out, Array packed, + const Array filter, const dim_t sig_half_d0, const dim_t fftScale, + const dim4 sig_tmp_dims, const dim4 sig_tmp_strides, + const dim4 filter_tmp_dims, const dim4 filter_tmp_strides, + bool expand, ConvolveBatchKind kind) +{ + T* out_ptr = out.get(); + const af::dim4 out_dims = out.dims(); + const af::dim4 out_strides = out.strides(); + + const af::dim4 filter_dims = filter.dims(); + + convT* packed_ptr = packed.get(); + convT* sig_tmp_ptr = packed_ptr; + convT* filter_tmp_ptr = packed_ptr + sig_tmp_strides[3] * sig_tmp_dims[3]; + + // Reorder the output + if (kind == CONVOLVE_BATCH_KERNEL) { + reorderHelper(out_ptr, out_dims, out_strides, + filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, + filter_dims, sig_half_d0, baseDim, fftScale, expand); + } else { + reorderHelper(out_ptr, out_dims, out_strides, + sig_tmp_ptr, 
sig_tmp_dims, sig_tmp_strides, + filter_dims, sig_half_d0, baseDim, fftScale, expand); + } +} + } } From 4f8b3fad7de425ceca4ce80e40163c9d8f9c6160 Mon Sep 17 00:00:00 2001 From: pradeep Date: Mon, 28 Dec 2015 16:55:54 -0500 Subject: [PATCH 096/288] fixed cpu::kernel::dot fn signature --- src/backend/cpu/kernel/dot.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/cpu/kernel/dot.hpp b/src/backend/cpu/kernel/dot.hpp index ef518413c7..71f2c6f959 100644 --- a/src/backend/cpu/kernel/dot.hpp +++ b/src/backend/cpu/kernel/dot.hpp @@ -24,7 +24,7 @@ template<> cfloat conj (cfloat c) { return std::conj(c); } template<> cdouble conj(cdouble c) { return std::conj(c); } template -void dot(Array output, const Array &lhs, const Array &rhs, +void dot(Array output, const Array lhs, const Array rhs, af_mat_prop optLhs, af_mat_prop optRhs) { int N = lhs.dims()[0]; From 95d934613425559fa9048433bfe77bb8f151c18f Mon Sep 17 00:00:00 2001 From: pradeep Date: Mon, 28 Dec 2015 17:46:06 -0500 Subject: [PATCH 097/288] Added ENQUEUE macro in cpu backend this macro takes care of asynchronous kernel launch and calls sync on the queue when in debug mode. 
--- src/backend/cpu/Array.cpp | 5 ++--- src/backend/cpu/approx.cpp | 19 ++++++++-------- src/backend/cpu/assign.cpp | 5 ++--- src/backend/cpu/bilateral.cpp | 5 ++--- src/backend/cpu/blas.cpp | 13 ++++++----- src/backend/cpu/cholesky.cpp | 5 ++--- src/backend/cpu/convolve.cpp | 7 +++--- src/backend/cpu/copy.cpp | 9 ++++---- src/backend/cpu/debug_cpu.hpp | 31 +++++++++++++++++++++++++++ src/backend/cpu/diagonal.cpp | 7 +++--- src/backend/cpu/diff.cpp | 7 +++--- src/backend/cpu/fast.cpp | 3 +-- src/backend/cpu/fft.cpp | 9 ++++---- src/backend/cpu/fftconvolve.cpp | 15 ++++++------- src/backend/cpu/gradient.cpp | 5 ++--- src/backend/cpu/harris.cpp | 13 ++++++----- src/backend/cpu/hist_graphics.cpp | 3 +-- src/backend/cpu/histogram.cpp | 5 ++--- src/backend/cpu/homography.cpp | 3 +-- src/backend/cpu/hsv_rgb.cpp | 7 +++--- src/backend/cpu/identity.cpp | 5 ++--- src/backend/cpu/iir.cpp | 5 ++--- src/backend/cpu/image.cpp | 3 +-- src/backend/cpu/index.cpp | 5 ++--- src/backend/cpu/inverse.cpp | 5 ++--- src/backend/cpu/iota.cpp | 5 ++--- src/backend/cpu/ireduce.cpp | 5 ++--- src/backend/cpu/join.cpp | 25 +++++++++++---------- src/backend/cpu/lookup.cpp | 5 ++--- src/backend/cpu/lu.cpp | 9 ++++---- src/backend/cpu/match_template.cpp | 5 ++--- src/backend/cpu/meanshift.cpp | 5 ++--- src/backend/cpu/medfilt.cpp | 5 ++--- src/backend/cpu/memory.cpp | 3 +-- src/backend/cpu/morph.cpp | 7 +++--- src/backend/cpu/nearest_neighbour.cpp | 9 ++++---- src/backend/cpu/orb.cpp | 3 +-- src/backend/cpu/platform.cpp | 3 +-- src/backend/cpu/plot.cpp | 3 +-- src/backend/cpu/plot3.cpp | 3 +-- src/backend/cpu/qr.cpp | 7 +++--- src/backend/cpu/random.cpp | 11 +++++----- src/backend/cpu/range.cpp | 11 +++++----- src/backend/cpu/reduce.cpp | 5 ++--- src/backend/cpu/regions.cpp | 5 ++--- src/backend/cpu/reorder.cpp | 5 ++--- src/backend/cpu/resize.cpp | 9 ++++---- src/backend/cpu/rotate.cpp | 9 ++++---- src/backend/cpu/scan.cpp | 11 +++++----- src/backend/cpu/select.cpp | 7 +++--- 
src/backend/cpu/set.cpp | 3 +-- src/backend/cpu/shift.cpp | 5 ++--- src/backend/cpu/sobel.cpp | 7 +++--- src/backend/cpu/solve.cpp | 11 +++++----- src/backend/cpu/sort.cpp | 5 ++--- src/backend/cpu/sort_by_key.cpp | 5 ++--- src/backend/cpu/sort_index.cpp | 5 ++--- src/backend/cpu/surface.cpp | 3 +-- src/backend/cpu/susan.cpp | 7 +++--- src/backend/cpu/svd.cpp | 5 ++--- src/backend/cpu/tile.cpp | 5 ++--- src/backend/cpu/transform.cpp | 9 ++++---- src/backend/cpu/transpose.cpp | 7 +++--- src/backend/cpu/triangle.cpp | 5 ++--- src/backend/cpu/unwrap.cpp | 7 +++--- src/backend/cpu/where.cpp | 3 +-- src/backend/cpu/wrap.cpp | 7 +++--- 67 files changed, 219 insertions(+), 254 deletions(-) create mode 100644 src/backend/cpu/debug_cpu.hpp diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 40d25aca6f..34c99e4566 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -15,8 +15,7 @@ #include #include #include -#include -#include +#include #include #include @@ -78,7 +77,7 @@ void Array::eval() data = std::shared_ptr(memAlloc(elements()), memFree); - getQueue().enqueue(kernel::evalArray, *this); + ENQUEUE(kernel::evalArray, *this); ready = true; Node_ptr prev = node; diff --git a/src/backend/cpu/approx.cpp b/src/backend/cpu/approx.cpp index 7e65486a66..57d3cc4c45 100644 --- a/src/backend/cpu/approx.cpp +++ b/src/backend/cpu/approx.cpp @@ -11,8 +11,7 @@ #include #include #include -#include -#include +#include namespace cpu { @@ -31,12 +30,12 @@ Array approx1(const Array &in, const Array &pos, switch(method) { case AF_INTERP_NEAREST: - getQueue().enqueue(kernel::approx1, - out, in, pos, offGrid); + ENQUEUE(kernel::approx1, + out, in, pos, offGrid); break; case AF_INTERP_LINEAR: - getQueue().enqueue(kernel::approx1, - out, in, pos, offGrid); + ENQUEUE(kernel::approx1, + out, in, pos, offGrid); break; default: break; @@ -61,12 +60,12 @@ Array approx2(const Array &in, const Array &pos0, const Array &p switch(method) { case AF_INTERP_NEAREST: 
- getQueue().enqueue(kernel::approx2, - out, in, pos0, pos1, offGrid); + ENQUEUE(kernel::approx2, + out, in, pos0, pos1, offGrid); break; case AF_INTERP_LINEAR: - getQueue().enqueue(kernel::approx2, - out, in, pos0, pos1, offGrid); + ENQUEUE(kernel::approx2, + out, in, pos0, pos1, offGrid); break; default: break; diff --git a/src/backend/cpu/assign.cpp b/src/backend/cpu/assign.cpp index 95bb7e5dd4..df903449a0 100644 --- a/src/backend/cpu/assign.cpp +++ b/src/backend/cpu/assign.cpp @@ -14,8 +14,7 @@ #include #include #include -#include -#include +#include namespace cpu { @@ -48,7 +47,7 @@ void assign(Array& out, const af_index_t idxrs[], const Array& rhs) } } - getQueue().enqueue(kernel::assign, out, rhs, std::move(isSeq), + ENQUEUE(kernel::assign, out, rhs, std::move(isSeq), std::move(seqs), std::move(idxArrs)); } diff --git a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/bilateral.cpp index bc3ad6e14b..ceb8be95d9 100644 --- a/src/backend/cpu/bilateral.cpp +++ b/src/backend/cpu/bilateral.cpp @@ -15,8 +15,7 @@ #include #include #include -#include -#include +#include using af::dim4; @@ -29,7 +28,7 @@ Array bilateral(const Array &in, const float &s_sigma, const fl in.eval(); const dim4 dims = in.dims(); Array out = createEmptyArray(dims); - getQueue().enqueue(kernel::bilateral, out, in, s_sigma, c_sigma); + ENQUEUE(kernel::bilateral, out, in, s_sigma, c_sigma); return out; } diff --git a/src/backend/cpu/blas.cpp b/src/backend/cpu/blas.cpp index d6f5dee203..70c8d9ca77 100644 --- a/src/backend/cpu/blas.cpp +++ b/src/backend/cpu/blas.cpp @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include namespace cpu { @@ -194,7 +193,7 @@ Array matmul(const Array &lhs, const Array &rhs, reinterpret_cast(output.get()), output.dims()[0]); } }; - getQueue().enqueue(func, out, lhs, rhs); + ENQUEUE(func, out, lhs, rhs); return out; } @@ -208,13 +207,13 @@ Array dot(const Array &lhs, const Array &rhs, Array out = createEmptyArray(af::dim4(1)); if(optLhs == 
AF_MAT_CONJ && optRhs == AF_MAT_CONJ) { - getQueue().enqueue(kernel::dot, out, lhs, rhs, optLhs, optRhs); + ENQUEUE(kernel::dot, out, lhs, rhs, optLhs, optRhs); } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) { - getQueue().enqueue(kernel::dot,out, lhs, rhs, optLhs, optRhs); + ENQUEUE(kernel::dot,out, lhs, rhs, optLhs, optRhs); } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) { - getQueue().enqueue(kernel::dot,out, rhs, lhs, optRhs, optLhs); + ENQUEUE(kernel::dot,out, rhs, lhs, optRhs, optLhs); } else { - getQueue().enqueue(kernel::dot,out, lhs, rhs, optLhs, optRhs); + ENQUEUE(kernel::dot,out, lhs, rhs, optLhs, optRhs); } return out; } diff --git a/src/backend/cpu/cholesky.cpp b/src/backend/cpu/cholesky.cpp index ce11867186..b21d9c8fd0 100644 --- a/src/backend/cpu/cholesky.cpp +++ b/src/backend/cpu/cholesky.cpp @@ -19,8 +19,7 @@ #include #include #include -#include -#include +#include namespace cpu { @@ -75,7 +74,7 @@ int cholesky_inplace(Array &in, const bool is_upper) info = potrf_func()(AF_LAPACK_COL_MAJOR, uplo, N, in.get(), in.strides()[1]); }; - getQueue().enqueue(func, info, in); + ENQUEUE(func, info, in); getQueue().sync(); return info; diff --git a/src/backend/cpu/convolve.cpp b/src/backend/cpu/convolve.cpp index 218ba8e3c0..cf241c3eaa 100644 --- a/src/backend/cpu/convolve.cpp +++ b/src/backend/cpu/convolve.cpp @@ -14,8 +14,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -51,7 +50,7 @@ Array convolve(Array const& signal, Array const& filter, ConvolveBat Array out = createEmptyArray(oDims); - getQueue().enqueue(kernel::convolve_nd,out, signal, filter, kind); + ENQUEUE(kernel::convolve_nd,out, signal, filter, kind); return out; } @@ -81,7 +80,7 @@ Array convolve2(Array const& signal, Array const& c_filter, Array out = createEmptyArray(oDims); - getQueue().enqueue(kernel::convolve2, out, signal, c_filter, r_filter, tDims); + ENQUEUE(kernel::convolve2, out, signal, c_filter, r_filter, tDims); 
return out; } diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index 9f6068dd65..8085a0fdb5 100644 --- a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp @@ -18,8 +18,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -51,7 +50,7 @@ template void multiply_inplace(Array &in, double val) { in.eval(); - getQueue().enqueue(kernel::copy, in, in, 0, val); + ENQUEUE(kernel::copy, in, in, 0, val); } template @@ -63,7 +62,7 @@ Array padArray(Array const &in, dim4 const &dims, in.eval(); // FIXME: getQueue().sync(); - getQueue().enqueue(kernel::copy, ret, in, outType(default_value), factor); + ENQUEUE(kernel::copy, ret, in, outType(default_value), factor); return ret; } @@ -72,7 +71,7 @@ void copyArray(Array &out, Array const &in) { out.eval(); in.eval(); - getQueue().enqueue(kernel::copy, out, in, scalar(0), 1.0); + ENQUEUE(kernel::copy, out, in, scalar(0), 1.0); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/debug_cpu.hpp b/src/backend/cpu/debug_cpu.hpp new file mode 100644 index 0000000000..b1d8e17484 --- /dev/null +++ b/src/backend/cpu/debug_cpu.hpp @@ -0,0 +1,31 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include + +#ifndef NDEBUG + +#define POST_LAUNCH_CHECK() do { \ + getQueue().sync(); \ + } while(0) \ + +#else + +#define POST_LAUNCH_CHECK() //no-op + +#endif + +#define ENQUEUE(...) 
\ + do { \ + getQueue().enqueue(__VA_ARGS__); \ + POST_LAUNCH_CHECK(); \ + } while(0) diff --git a/src/backend/cpu/diagonal.cpp b/src/backend/cpu/diagonal.cpp index 6c20f2e7f2..6fd918d66d 100644 --- a/src/backend/cpu/diagonal.cpp +++ b/src/backend/cpu/diagonal.cpp @@ -15,8 +15,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -31,7 +30,7 @@ Array diagCreate(const Array &in, const int num) int batch = in.dims()[1]; Array out = createEmptyArray(dim4(size, size, batch)); - getQueue().enqueue(kernel::diagCreate, out, in, num); + ENQUEUE(kernel::diagCreate, out, in, num); return out; } @@ -45,7 +44,7 @@ Array diagExtract(const Array &in, const int num) dim_t size = std::max(idims[0], idims[1]) - std::abs(num); Array out = createEmptyArray(dim4(size, 1, idims[2], idims[3])); - getQueue().enqueue(kernel::diagExtract, out, in, num); + ENQUEUE(kernel::diagExtract, out, in, num); return out; } diff --git a/src/backend/cpu/diff.cpp b/src/backend/cpu/diff.cpp index 3f639ca46f..efab130cc6 100644 --- a/src/backend/cpu/diff.cpp +++ b/src/backend/cpu/diff.cpp @@ -9,8 +9,7 @@ #include #include -#include -#include +#include #include namespace cpu @@ -27,7 +26,7 @@ Array diff1(const Array &in, const int dim) Array outArray = createEmptyArray(dims); - getQueue().enqueue(kernel::diff1, outArray, in, dim); + ENQUEUE(kernel::diff1, outArray, in, dim); return outArray; } @@ -43,7 +42,7 @@ Array diff2(const Array &in, const int dim) Array outArray = createEmptyArray(dims); - getQueue().enqueue(kernel::diff2, outArray, in, dim); + ENQUEUE(kernel::diff2, outArray, in, dim); return outArray; } diff --git a/src/backend/cpu/fast.cpp b/src/backend/cpu/fast.cpp index 42607d888f..1b3a7aa973 100644 --- a/src/backend/cpu/fast.cpp +++ b/src/backend/cpu/fast.cpp @@ -14,8 +14,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; diff --git a/src/backend/cpu/fft.cpp b/src/backend/cpu/fft.cpp index 2edced2219..1282963003 100644 --- 
a/src/backend/cpu/fft.cpp +++ b/src/backend/cpu/fft.cpp @@ -15,8 +15,7 @@ #include #include #include -#include -#include +#include using af::dim4; @@ -27,7 +26,7 @@ template void fft_inplace(Array &in) { in.eval(); - getQueue().enqueue(kernel::fft_inplace, in); + ENQUEUE(kernel::fft_inplace, in); } template @@ -39,7 +38,7 @@ Array fft_r2c(const Array &in) odims[0] = odims[0] / 2 + 1; Array out = createEmptyArray(odims); - getQueue().enqueue(kernel::fft_r2c, out, in); + ENQUEUE(kernel::fft_r2c, out, in); return out; } @@ -50,7 +49,7 @@ Array fft_c2r(const Array &in, const dim4 &odims) in.eval(); Array out = createEmptyArray(odims); - getQueue().enqueue(kernel::fft_c2r, out, in, odims); + ENQUEUE(kernel::fft_c2r, out, in, odims); return out; } diff --git a/src/backend/cpu/fftconvolve.cpp b/src/backend/cpu/fftconvolve.cpp index c0a9a41240..aac66cdbe4 100644 --- a/src/backend/cpu/fftconvolve.cpp +++ b/src/backend/cpu/fftconvolve.cpp @@ -17,8 +17,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -84,11 +83,11 @@ Array fftconvolve(Array const& signal, Array const& filter, // Pack signal in a complex matrix where first dimension is half the input // (allows faster FFT computation) and pad array to a power of 2 with 0s - getQueue().enqueue(kernel::packData, packed, sig_tmp_dims, sig_tmp_strides, signal); + ENQUEUE(kernel::packData, packed, sig_tmp_dims, sig_tmp_strides, signal); // Pad filter array with 0s const dim_t offset = sig_tmp_strides[3]*sig_tmp_dims[3]; - getQueue().enqueue(kernel::padArray, packed, filter_tmp_dims, filter_tmp_strides, + ENQUEUE(kernel::padArray, packed, filter_tmp_dims, filter_tmp_strides, filter, offset); dim4 fftDims(1, 1, 1, 1); @@ -138,10 +137,10 @@ Array fftconvolve(Array const& signal, Array const& filter, fftwf_destroy_plan(plan); } }; - getQueue().enqueue(upstream_dft, packed, fftDims); + ENQUEUE(upstream_dft, packed, fftDims); // Multiply filter and signal FFT arrays - 
getQueue().enqueue(kernel::complexMultiply, packed, + ENQUEUE(kernel::complexMultiply, packed, sig_tmp_dims, sig_tmp_strides, filter_tmp_dims, filter_tmp_strides, kind, offset); @@ -189,7 +188,7 @@ Array fftconvolve(Array const& signal, Array const& filter, fftwf_destroy_plan(plan); } }; - getQueue().enqueue(upstream_idft, packed, fftDims); + ENQUEUE(upstream_idft, packed, fftDims); // Compute output dimensions dim4 oDims(1); @@ -211,7 +210,7 @@ Array fftconvolve(Array const& signal, Array const& filter, Array out = createEmptyArray(oDims); - getQueue().enqueue(kernel::reorder, out, packed, filter, + ENQUEUE(kernel::reorder, out, packed, filter, sig_half_d0, fftScale, sig_tmp_dims, sig_tmp_strides, filter_tmp_dims, filter_tmp_strides, expand, kind); diff --git a/src/backend/cpu/gradient.cpp b/src/backend/cpu/gradient.cpp index d1a8b0d2c9..57776e5750 100644 --- a/src/backend/cpu/gradient.cpp +++ b/src/backend/cpu/gradient.cpp @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -26,7 +25,7 @@ void gradient(Array &grad0, Array &grad1, const Array &in) grad1.eval(); in.eval(); - getQueue().enqueue(kernel::gradient, grad0, grad1, in); + ENQUEUE(kernel::gradient, grad0, grad1, in); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/harris.cpp b/src/backend/cpu/harris.cpp index 905b0467c7..07b9bed516 100644 --- a/src/backend/cpu/harris.cpp +++ b/src/backend/cpu/harris.cpp @@ -18,8 +18,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -53,14 +52,14 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out Array iy = createEmptyArray(idims); // Compute first order derivatives - getQueue().enqueue(gradient, iy, ix, in); + ENQUEUE(gradient, iy, ix, in); Array ixx = createEmptyArray(idims); Array ixy = createEmptyArray(idims); Array iyy = createEmptyArray(idims); // Compute second-order derivatives - getQueue().enqueue(kernel::second_order_deriv, ixx, ixy, iyy, in.elements(), ix, 
iy); + ENQUEUE(kernel::second_order_deriv, ixx, ixy, iyy, in.elements(), ix, iy); // Convolve second-order derivatives with proper window filter ixx = convolve2(ixx, filter, filter); @@ -71,7 +70,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out Array responses = createEmptyArray(dim4(in.elements())); - getQueue().enqueue(kernel::harris_responses, responses, idims[0], idims[1], + ENQUEUE(kernel::harris_responses, responses, idims[0], idims[1], ixx, ixy, iyy, k_thr, border_len); Array xCorners = createEmptyArray(dim4(corner_lim)); @@ -105,7 +104,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out resp_out = createEmptyArray(dim4(corners_out)); // Keep only the corners with higher Harris responses - getQueue().enqueue(kernel::keep_corners, x_out, y_out, resp_out, xCorners, yCorners, + ENQUEUE(kernel::keep_corners, x_out, y_out, resp_out, xCorners, yCorners, harris_sorted, harris_idx, corners_out); } else if (max_corners == 0 && corners_found < corner_lim) { x_out = createEmptyArray(dim4(corners_out)); @@ -120,7 +119,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out memcpy(y_out.get(), y_crnrs.get(), corners_out * sizeof(float)); memcpy(outResponses.get(), inResponses.get(), corners_out * sizeof(float)); }; - getQueue().enqueue(copyFunc, x_out, y_out, resp_out, + ENQUEUE(copyFunc, x_out, y_out, resp_out, xCorners, yCorners, respCorners, corners_out); } else { x_out = xCorners; diff --git a/src/backend/cpu/hist_graphics.cpp b/src/backend/cpu/hist_graphics.cpp index 56f7646b61..c58f5c687e 100644 --- a/src/backend/cpu/hist_graphics.cpp +++ b/src/backend/cpu/hist_graphics.cpp @@ -11,8 +11,7 @@ #include #include -#include -#include +#include namespace cpu { diff --git a/src/backend/cpu/histogram.cpp b/src/backend/cpu/histogram.cpp index 19314e052a..2571f3e4d0 100644 --- a/src/backend/cpu/histogram.cpp +++ b/src/backend/cpu/histogram.cpp @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include using 
af::dim4; @@ -32,7 +31,7 @@ Array histogram(const Array &in, Array out = createValueArray(outDims, outType(0)); out.eval(); - getQueue().enqueue(kernel::histogram, + ENQUEUE(kernel::histogram, out, in, nbins, minval, maxval); return out; diff --git a/src/backend/cpu/homography.cpp b/src/backend/cpu/homography.cpp index d936e21b4c..147f5e8751 100644 --- a/src/backend/cpu/homography.cpp +++ b/src/backend/cpu/homography.cpp @@ -18,8 +18,7 @@ #include #include #include -#include -#include +#include using af::dim4; diff --git a/src/backend/cpu/hsv_rgb.cpp b/src/backend/cpu/hsv_rgb.cpp index c0f19db773..da5dbe0594 100644 --- a/src/backend/cpu/hsv_rgb.cpp +++ b/src/backend/cpu/hsv_rgb.cpp @@ -11,8 +11,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -27,7 +26,7 @@ Array hsv2rgb(const Array& in) Array out = createEmptyArray(in.dims()); - getQueue().enqueue(kernel::hsv2rgb, out, in); + ENQUEUE(kernel::hsv2rgb, out, in); return out; } @@ -39,7 +38,7 @@ Array rgb2hsv(const Array& in) Array out = createEmptyArray(in.dims()); - getQueue().enqueue(kernel::rgb2hsv, out, in); + ENQUEUE(kernel::rgb2hsv, out, in); return out; } diff --git a/src/backend/cpu/identity.cpp b/src/backend/cpu/identity.cpp index 949fceda81..071bb04642 100644 --- a/src/backend/cpu/identity.cpp +++ b/src/backend/cpu/identity.cpp @@ -10,8 +10,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -22,7 +21,7 @@ Array identity(const dim4& dims) { Array out = createEmptyArray(dims); - getQueue().enqueue(kernel::identity, out); + ENQUEUE(kernel::identity, out); return out; } diff --git a/src/backend/cpu/iir.cpp b/src/backend/cpu/iir.cpp index 225f39b859..cb390b3018 100644 --- a/src/backend/cpu/iir.cpp +++ b/src/backend/cpu/iir.cpp @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -42,7 +41,7 @@ Array iir(const Array &b, const Array &a, const Array &x) Array y = 
createEmptyArray(c.dims()); - getQueue().enqueue(kernel::iir, y, c, a); + ENQUEUE(kernel::iir, y, c, a); return y; } diff --git a/src/backend/cpu/image.cpp b/src/backend/cpu/image.cpp index 767f9d42f1..d23ba80ba8 100644 --- a/src/backend/cpu/image.cpp +++ b/src/backend/cpu/image.cpp @@ -16,8 +16,7 @@ #include #include #include -#include -#include +#include using af::dim4; diff --git a/src/backend/cpu/index.cpp b/src/backend/cpu/index.cpp index bd569de44a..9c951ff0d3 100644 --- a/src/backend/cpu/index.cpp +++ b/src/backend/cpu/index.cpp @@ -14,8 +14,7 @@ #include #include #include -#include -#include +#include #include #include @@ -58,7 +57,7 @@ Array index(const Array& in, const af_index_t idxrs[]) Array out = createEmptyArray(oDims); - getQueue().enqueue(kernel::index, out, in, std::move(isSeq), std::move(seqs), std::move(idxArrs)); + ENQUEUE(kernel::index, out, in, std::move(isSeq), std::move(seqs), std::move(idxArrs)); return out; } diff --git a/src/backend/cpu/inverse.cpp b/src/backend/cpu/inverse.cpp index 987ba01c53..71cc9fefca 100644 --- a/src/backend/cpu/inverse.cpp +++ b/src/backend/cpu/inverse.cpp @@ -23,8 +23,7 @@ #include #include #include -#include -#include +#include namespace cpu { @@ -68,7 +67,7 @@ Array inverse(const Array &in) A.get(), A.strides()[1], pivot.get()); }; - getQueue().enqueue(func, A, pivot, M); + ENQUEUE(func, A, pivot, M); return A; } diff --git a/src/backend/cpu/iota.cpp b/src/backend/cpu/iota.cpp index 41f0c9c518..124ec5c48a 100644 --- a/src/backend/cpu/iota.cpp +++ b/src/backend/cpu/iota.cpp @@ -10,8 +10,7 @@ #include #include #include -#include -#include +#include #include using namespace std; @@ -26,7 +25,7 @@ Array iota(const dim4 &dims, const dim4 &tile_dims) Array out = createEmptyArray(outdims); - getQueue().enqueue(kernel::iota, out, dims, tile_dims); + ENQUEUE(kernel::iota, out, dims, tile_dims); return out; } diff --git a/src/backend/cpu/ireduce.cpp b/src/backend/cpu/ireduce.cpp index f1efcf646a..9de4a781b3 100644 --- 
a/src/backend/cpu/ireduce.cpp +++ b/src/backend/cpu/ireduce.cpp @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -40,7 +39,7 @@ void ireduce(Array &out, Array &loc, const Array &in, const int dim) , kernel::ireduce_dim() , kernel::ireduce_dim()}; - getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim); + ENQUEUE(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim); } template diff --git a/src/backend/cpu/join.cpp b/src/backend/cpu/join.cpp index e39280c943..6c9ba8ff9b 100644 --- a/src/backend/cpu/join.cpp +++ b/src/backend/cpu/join.cpp @@ -9,8 +9,7 @@ #include #include -#include -#include +#include #include namespace cpu @@ -38,7 +37,7 @@ Array join(const int dim, const Array &first, const Array &second) Array out = createEmptyArray(odims); - getQueue().enqueue(kernel::join, out, dim, first, second); + ENQUEUE(kernel::join, out, dim, first, second); return out; } @@ -72,34 +71,34 @@ Array join(const int dim, const std::vector> &inputs) switch(n_arrays) { case 1: - getQueue().enqueue(kernel::join, dim, out, inputs); + ENQUEUE(kernel::join, dim, out, inputs); break; case 2: - getQueue().enqueue(kernel::join, dim, out, inputs); + ENQUEUE(kernel::join, dim, out, inputs); break; case 3: - getQueue().enqueue(kernel::join, dim, out, inputs); + ENQUEUE(kernel::join, dim, out, inputs); break; case 4: - getQueue().enqueue(kernel::join, dim, out, inputs); + ENQUEUE(kernel::join, dim, out, inputs); break; case 5: - getQueue().enqueue(kernel::join, dim, out, inputs); + ENQUEUE(kernel::join, dim, out, inputs); break; case 6: - getQueue().enqueue(kernel::join, dim, out, inputs); + ENQUEUE(kernel::join, dim, out, inputs); break; case 7: - getQueue().enqueue(kernel::join, dim, out, inputs); + ENQUEUE(kernel::join, dim, out, inputs); break; case 8: - getQueue().enqueue(kernel::join, dim, out, inputs); + ENQUEUE(kernel::join, dim, out, inputs); break; case 9: - getQueue().enqueue(kernel::join, dim, out, 
inputs); + ENQUEUE(kernel::join, dim, out, inputs); break; case 10: - getQueue().enqueue(kernel::join, dim, out, inputs); + ENQUEUE(kernel::join, dim, out, inputs); break; } diff --git a/src/backend/cpu/lookup.cpp b/src/backend/cpu/lookup.cpp index 457cdaea5a..4cc5359002 100644 --- a/src/backend/cpu/lookup.cpp +++ b/src/backend/cpu/lookup.cpp @@ -9,8 +9,7 @@ #include #include -#include -#include +#include #include namespace cpu @@ -30,7 +29,7 @@ Array lookup(const Array &input, const Array &indices, const Array out = createEmptyArray(oDims); - getQueue().enqueue(kernel::lookup, out, input, indices, dim); + ENQUEUE(kernel::lookup, out, input, indices, dim); return out; } diff --git a/src/backend/cpu/lu.cpp b/src/backend/cpu/lu.cpp index f0e1593f1a..551c9c98e2 100644 --- a/src/backend/cpu/lu.cpp +++ b/src/backend/cpu/lu.cpp @@ -17,8 +17,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -59,7 +58,7 @@ void lu(Array &lower, Array &upper, Array &pivot, const Array &in) lower = createEmptyArray(ldims); upper = createEmptyArray(udims); - getQueue().enqueue(kernel::lu_split, lower, upper, in_copy); + ENQUEUE(kernel::lu_split, lower, upper, in_copy); } template @@ -74,11 +73,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) dim4 iDims = in.dims(); getrf_func()(AF_LAPACK_COL_MAJOR, iDims[0], iDims[1], in.get(), in.strides()[1], pivot.get()); }; - getQueue().enqueue(func, in, pivot); + ENQUEUE(func, in, pivot); if(convert_pivot) { Array p = range(dim4(iDims[0]), 0); - getQueue().enqueue(kernel::convertPivot, p, pivot); + ENQUEUE(kernel::convertPivot, p, pivot); return p; } else { return pivot; diff --git a/src/backend/cpu/match_template.cpp b/src/backend/cpu/match_template.cpp index e5b030be64..724b773638 100644 --- a/src/backend/cpu/match_template.cpp +++ b/src/backend/cpu/match_template.cpp @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -29,7 +28,7 @@ Array 
match_template(const Array &sImg, const Array &tImg) Array out = createEmptyArray(sImg.dims()); - getQueue().enqueue(kernel::matchTemplate, out, sImg, tImg); + ENQUEUE(kernel::matchTemplate, out, sImg, tImg); return out; } diff --git a/src/backend/cpu/meanshift.cpp b/src/backend/cpu/meanshift.cpp index 6c3417a62e..f4a0b29e86 100644 --- a/src/backend/cpu/meanshift.cpp +++ b/src/backend/cpu/meanshift.cpp @@ -16,8 +16,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -33,7 +32,7 @@ Array meanshift(const Array &in, const float &s_sigma, const float &c_sig Array out = createEmptyArray(in.dims()); - getQueue().enqueue(kernel::meanShift, out, in, s_sigma, c_sigma, iter); + ENQUEUE(kernel::meanShift, out, in, s_sigma, c_sigma, iter); return out; } diff --git a/src/backend/cpu/medfilt.cpp b/src/backend/cpu/medfilt.cpp index 06cc0dff44..9e761c6cc0 100644 --- a/src/backend/cpu/medfilt.cpp +++ b/src/backend/cpu/medfilt.cpp @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -28,7 +27,7 @@ Array medfilt(const Array &in, dim_t w_len, dim_t w_wid) Array out = createEmptyArray(in.dims()); - getQueue().enqueue(kernel::medfilt, out, in, w_len, w_wid); + ENQUEUE(kernel::medfilt, out, in, w_len, w_wid); return out; } diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index e11f994eef..79f2e57a0c 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -14,8 +14,7 @@ #include #include #include -#include -#include +#include namespace cpu { diff --git a/src/backend/cpu/morph.cpp b/src/backend/cpu/morph.cpp index 462319d0af..337e8a9574 100644 --- a/src/backend/cpu/morph.cpp +++ b/src/backend/cpu/morph.cpp @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -30,7 +29,7 @@ Array morph(const Array &in, const Array &mask) Array out = createEmptyArray(in.dims()); - getQueue().enqueue(kernel::morph, out, in, mask); + 
ENQUEUE(kernel::morph, out, in, mask); return out; } @@ -43,7 +42,7 @@ Array morph3d(const Array &in, const Array &mask) Array out = createEmptyArray(in.dims()); - getQueue().enqueue(kernel::morph3d, out, in, mask); + ENQUEUE(kernel::morph3d, out, in, mask); return out; } diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp index 82925622ae..a3c2bb1ea9 100644 --- a/src/backend/cpu/nearest_neighbour.cpp +++ b/src/backend/cpu/nearest_neighbour.cpp @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -43,13 +42,13 @@ void nearest_neighbour(Array& idx, Array& dist, switch(dist_type) { case AF_SAD: - getQueue().enqueue(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); + ENQUEUE(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); break; case AF_SSD: - getQueue().enqueue(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); + ENQUEUE(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); break; case AF_SHD: - getQueue().enqueue(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); + ENQUEUE(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); break; default: AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED); diff --git a/src/backend/cpu/orb.cpp b/src/backend/cpu/orb.cpp index 5dd9326134..649619e143 100644 --- a/src/backend/cpu/orb.cpp +++ b/src/backend/cpu/orb.cpp @@ -18,8 +18,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; diff --git a/src/backend/cpu/platform.cpp b/src/backend/cpu/platform.cpp index c4ac0af3ab..98cfad4b53 100644 --- a/src/backend/cpu/platform.cpp +++ b/src/backend/cpu/platform.cpp @@ -9,9 +9,8 @@ #include #include -#include +#include #include -#include #include #include #include diff --git a/src/backend/cpu/plot.cpp b/src/backend/cpu/plot.cpp index 9cc7d9d2b9..8afdea288f 100644 --- a/src/backend/cpu/plot.cpp +++ 
b/src/backend/cpu/plot.cpp @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include using af::dim4; diff --git a/src/backend/cpu/plot3.cpp b/src/backend/cpu/plot3.cpp index 35a7b2500d..c7beed69d6 100644 --- a/src/backend/cpu/plot3.cpp +++ b/src/backend/cpu/plot3.cpp @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include using af::dim4; diff --git a/src/backend/cpu/qr.cpp b/src/backend/cpu/qr.cpp index 78631fccfa..ca04ec9c20 100644 --- a/src/backend/cpu/qr.cpp +++ b/src/backend/cpu/qr.cpp @@ -17,8 +17,7 @@ #include #include #include -#include -#include +#include namespace cpu { @@ -79,7 +78,7 @@ void qr(Array &q, Array &r, Array &t, const Array &in) gqr_func()(AF_LAPACK_COL_MAJOR, M, M, min(M, N), q.get(), q.strides()[1], t.get()); }; q.resetDims(dim4(M, M)); - getQueue().enqueue(func, q, t, M, N); + ENQUEUE(func, q, t, M, N); } template @@ -95,7 +94,7 @@ Array qr_inplace(Array &in) auto func = [=] (Array in, Array t, int M, int N) { geqrf_func()(AF_LAPACK_COL_MAJOR, M, N, in.get(), in.strides()[1], t.get()); }; - getQueue().enqueue(func, in, t, M, N); + ENQUEUE(func, in, t, M, N); return t; } diff --git a/src/backend/cpu/random.cpp b/src/backend/cpu/random.cpp index 55cf2956a8..f49420f13d 100644 --- a/src/backend/cpu/random.cpp +++ b/src/backend/cpu/random.cpp @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -23,7 +22,7 @@ template Array randu(const af::dim4 &dims) { Array outArray = createEmptyArray(dims); - getQueue().enqueue(kernel::randu, outArray); + ENQUEUE(kernel::randu, outArray); return outArray; } @@ -46,7 +45,7 @@ template Array randn(const af::dim4 &dims) { Array outArray = createEmptyArray(dims); - getQueue().enqueue(kernel::randn, outArray); + ENQUEUE(kernel::randn, outArray); return outArray; } @@ -81,7 +80,7 @@ Array randu(const af::dim4 &dims) outPtr[i] = gen() > 0.5; } }; - getQueue().enqueue(func, outArray); + ENQUEUE(func, outArray); return outArray; } @@ 
-93,7 +92,7 @@ void setSeed(const uintl seed) kernel::is_first = false; kernel::gen_seed = seed; }; - getQueue().enqueue(f, seed); + ENQUEUE(f, seed); } uintl getSeed() diff --git a/src/backend/cpu/range.cpp b/src/backend/cpu/range.cpp index b5ba5f89c4..6be78d5d0e 100644 --- a/src/backend/cpu/range.cpp +++ b/src/backend/cpu/range.cpp @@ -14,8 +14,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -33,10 +32,10 @@ Array range(const dim4& dims, const int seq_dim) Array out = createEmptyArray(dims); switch(_seq_dim) { - case 0: getQueue().enqueue(kernel::range, out); break; - case 1: getQueue().enqueue(kernel::range, out); break; - case 2: getQueue().enqueue(kernel::range, out); break; - case 3: getQueue().enqueue(kernel::range, out); break; + case 0: ENQUEUE(kernel::range, out); break; + case 1: ENQUEUE(kernel::range, out); break; + case 2: ENQUEUE(kernel::range, out); break; + case 3: ENQUEUE(kernel::range, out); break; default : AF_ERROR("Invalid rep selection", AF_ERR_ARG); } diff --git a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp index cd44b5e2d0..90ad1f9023 100644 --- a/src/backend/cpu/reduce.cpp +++ b/src/backend/cpu/reduce.cpp @@ -15,8 +15,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -56,7 +55,7 @@ Array reduce(const Array &in, const int dim, bool change_nan, double nan , kernel::reduce_dim() , kernel::reduce_dim()}; - getQueue().enqueue(reduce_funcs[in.ndims() - 1], out, 0, in, 0, dim, change_nan, nanval); + ENQUEUE(reduce_funcs[in.ndims() - 1], out, 0, in, 0, dim, change_nan, nanval); return out; } diff --git a/src/backend/cpu/regions.cpp b/src/backend/cpu/regions.cpp index ffac11c01d..eafc161ff5 100644 --- a/src/backend/cpu/regions.cpp +++ b/src/backend/cpu/regions.cpp @@ -17,8 +17,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -34,7 +33,7 @@ Array regions(const Array &in, af_connectivity connectivity) Array out = 
createValueArray(in.dims(), (T)0); out.eval(); - getQueue().enqueue(kernel::regions, out, in, connectivity); + ENQUEUE(kernel::regions, out, in, connectivity); return out; } diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp index 162039b36c..237e5d687a 100644 --- a/src/backend/cpu/reorder.cpp +++ b/src/backend/cpu/reorder.cpp @@ -9,8 +9,7 @@ #include #include -#include -#include +#include #include namespace cpu @@ -27,7 +26,7 @@ Array reorder(const Array &in, const af::dim4 &rdims) oDims[i] = iDims[rdims[i]]; Array out = createEmptyArray(oDims); - getQueue().enqueue(kernel::reorder, out, in, oDims, rdims); + ENQUEUE(kernel::reorder, out, in, oDims, rdims); return out; } diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/resize.cpp index 9a5c85bf1e..d6349a9c0b 100644 --- a/src/backend/cpu/resize.cpp +++ b/src/backend/cpu/resize.cpp @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -32,11 +31,11 @@ Array resize(const Array &in, const dim_t odim0, const dim_t odim1, switch(method) { case AF_INTERP_NEAREST: - getQueue().enqueue(kernel::resize, out, in); break; + ENQUEUE(kernel::resize, out, in); break; case AF_INTERP_BILINEAR: - getQueue().enqueue(kernel::resize, out, in); break; + ENQUEUE(kernel::resize, out, in); break; case AF_INTERP_LOWER: - getQueue().enqueue(kernel::resize, out, in); break; + ENQUEUE(kernel::resize, out, in); break; default: break; } return out; diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/rotate.cpp index e81ee04c80..289f3697a0 100644 --- a/src/backend/cpu/rotate.cpp +++ b/src/backend/cpu/rotate.cpp @@ -9,8 +9,7 @@ #include #include -#include -#include +#include #include "transform_interp.hpp" #include @@ -27,13 +26,13 @@ Array rotate(const Array &in, const float theta, const af::dim4 &odims, switch(method) { case AF_INTERP_NEAREST: - getQueue().enqueue(kernel::rotate, out, in, theta); + ENQUEUE(kernel::rotate, out, in, theta); break; case 
AF_INTERP_BILINEAR: - getQueue().enqueue(kernel::rotate, out, in, theta); + ENQUEUE(kernel::rotate, out, in, theta); break; case AF_INTERP_LOWER: - getQueue().enqueue(kernel::rotate, out, in, theta); + ENQUEUE(kernel::rotate, out, in, theta); break; default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); diff --git a/src/backend/cpu/scan.cpp b/src/backend/cpu/scan.cpp index 615744fd67..adeb3d23b7 100644 --- a/src/backend/cpu/scan.cpp +++ b/src/backend/cpu/scan.cpp @@ -14,8 +14,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -34,19 +33,19 @@ Array scan(const Array& in, const int dim) switch (in.ndims()) { case 1: kernel::scan_dim func1; - getQueue().enqueue(func1, out, 0, in, 0, dim); + ENQUEUE(func1, out, 0, in, 0, dim); break; case 2: kernel::scan_dim func2; - getQueue().enqueue(func2, out, 0, in, 0, dim); + ENQUEUE(func2, out, 0, in, 0, dim); break; case 3: kernel::scan_dim func3; - getQueue().enqueue(func3, out, 0, in, 0, dim); + ENQUEUE(func3, out, 0, in, 0, dim); break; case 4: kernel::scan_dim func4; - getQueue().enqueue(func4, out, 0, in, 0, dim); + ENQUEUE(func4, out, 0, in, 0, dim); break; } diff --git a/src/backend/cpu/select.cpp b/src/backend/cpu/select.cpp index d9a6795a41..4f845bc084 100644 --- a/src/backend/cpu/select.cpp +++ b/src/backend/cpu/select.cpp @@ -10,8 +10,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -26,7 +25,7 @@ void select(Array &out, const Array &cond, const Array &a, const Arr cond.eval(); a.eval(); b.eval(); - getQueue().enqueue(kernel::select, out, cond, a, b); + ENQUEUE(kernel::select, out, cond, a, b); } template @@ -35,7 +34,7 @@ void select_scalar(Array &out, const Array &cond, const Array &a, co out.eval(); cond.eval(); a.eval(); - getQueue().enqueue(kernel::select_scalar, out, cond, a, b); + ENQUEUE(kernel::select_scalar, out, cond, a, b); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/set.cpp b/src/backend/cpu/set.cpp 
index d6321bba55..49ce186412 100644 --- a/src/backend/cpu/set.cpp +++ b/src/backend/cpu/set.cpp @@ -18,8 +18,7 @@ #include #include #include -#include -#include +#include namespace cpu { diff --git a/src/backend/cpu/shift.cpp b/src/backend/cpu/shift.cpp index eca1e5063f..fd56e4ce2e 100644 --- a/src/backend/cpu/shift.cpp +++ b/src/backend/cpu/shift.cpp @@ -9,8 +9,7 @@ #include #include -#include -#include +#include #include namespace cpu @@ -24,7 +23,7 @@ Array shift(const Array &in, const int sdims[4]) Array out = createEmptyArray(in.dims()); const af::dim4 temp(sdims[0], sdims[1], sdims[2], sdims[3]); - getQueue().enqueue(kernel::shift, out, in, temp); + ENQUEUE(kernel::shift, out, in, temp); return out; } diff --git a/src/backend/cpu/sobel.cpp b/src/backend/cpu/sobel.cpp index 161266d7cf..86c7363c6d 100644 --- a/src/backend/cpu/sobel.cpp +++ b/src/backend/cpu/sobel.cpp @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include #include using af::dim4; @@ -32,8 +31,8 @@ sobelDerivatives(const Array &img, const unsigned &ker_size) Array dx = createEmptyArray(img.dims()); Array dy = createEmptyArray(img.dims()); - getQueue().enqueue(kernel::derivative, dx, img); - getQueue().enqueue(kernel::derivative, dy, img); + ENQUEUE(kernel::derivative, dx, img); + ENQUEUE(kernel::derivative, dy, img); return std::make_pair(dx, dy); } diff --git a/src/backend/cpu/solve.cpp b/src/backend/cpu/solve.cpp index 0243088fb3..5d1ec3bba3 100644 --- a/src/backend/cpu/solve.cpp +++ b/src/backend/cpu/solve.cpp @@ -16,8 +16,7 @@ #include #include #include -#include -#include +#include namespace cpu { @@ -88,7 +87,7 @@ Array solveLU(const Array &A, const Array &pivot, N, NRHS, A.get(), A.strides()[1], pivot.get(), B.get(), B.strides()[1]); }; - getQueue().enqueue(func, A, B, pivot, N, NRHS); + ENQUEUE(func, A, B, pivot, N, NRHS); return B; } @@ -109,7 +108,7 @@ Array triangleSolve(const Array &A, const Array &b, const af_mat_prop o A.get(), A.strides()[1], B.get(), 
B.strides()[1]); }; - getQueue().enqueue(func, A, B, N, NRHS, options); + ENQUEUE(func, A, B, N, NRHS, options); return B; } @@ -139,7 +138,7 @@ Array solve(const Array &a, const Array &b, const af_mat_prop options) gesv_func()(AF_LAPACK_COL_MAJOR, N, K, A.get(), A.strides()[1], pivot.get(), B.get(), B.strides()[1]); }; - getQueue().enqueue(func, A, B, pivot, N, K); + ENQUEUE(func, A, B, pivot, N, K); } else { auto func = [=] (Array A, Array B, int M, int N, int K) { int sM = A.strides()[1]; @@ -151,7 +150,7 @@ Array solve(const Array &a, const Array &b, const af_mat_prop options) B.get(), max(sM, sN)); }; B.resetDims(dim4(N, K)); - getQueue().enqueue(func, A, B, M, N, K); + ENQUEUE(func, A, B, M, N, K); } return B; diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp index 6a0465cf37..104a3df2eb 100644 --- a/src/backend/cpu/sort.cpp +++ b/src/backend/cpu/sort.cpp @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -27,7 +26,7 @@ Array sort(const Array &in, const unsigned dim) Array out = copyArray(in); switch(dim) { - case 0: getQueue().enqueue(kernel::sort0, out); break; + case 0: ENQUEUE(kernel::sort0, out); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } return out; diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp index 409b82538e..c6832881d8 100644 --- a/src/backend/cpu/sort_by_key.cpp +++ b/src/backend/cpu/sort_by_key.cpp @@ -9,8 +9,7 @@ #include #include -#include -#include +#include #include namespace cpu @@ -29,7 +28,7 @@ void sort_by_key(Array &okey, Array &oval, oidx.eval(); switch(dim) { - case 0: getQueue().enqueue(kernel::sort0_by_key, + case 0: ENQUEUE(kernel::sort0_by_key, okey, oval, oidx, ikey, ival); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp index ed6afea814..c8c6d6e08f 100644 --- a/src/backend/cpu/sort_index.cpp +++ 
b/src/backend/cpu/sort_index.cpp @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -27,7 +26,7 @@ void sort_index(Array &val, Array &idx, const Array &in, const uint val = createEmptyArray(in.dims()); idx = createEmptyArray(in.dims()); switch(dim) { - case 0: getQueue().enqueue(kernel::sort0_index, val, idx, in); break; + case 0: ENQUEUE(kernel::sort0_index, val, idx, in); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } } diff --git a/src/backend/cpu/surface.cpp b/src/backend/cpu/surface.cpp index 116c784d89..00d2b00c0f 100644 --- a/src/backend/cpu/surface.cpp +++ b/src/backend/cpu/surface.cpp @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include using af::dim4; diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp index 6e8d0fe5b0..4f1c327dd3 100644 --- a/src/backend/cpu/susan.cpp +++ b/src/backend/cpu/susan.cpp @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include using af::features; @@ -40,9 +39,9 @@ unsigned susan(Array &x_out, Array &y_out, Array &resp_out, auto corners_found= std::shared_ptr(memAlloc(1), memFree); corners_found.get()[0] = 0; - getQueue().enqueue(kernel::susan_responses, response, in, idims[0], idims[1], + ENQUEUE(kernel::susan_responses, response, in, idims[0], idims[1], radius, diff_thr, geom_thr, edge); - getQueue().enqueue(kernel::non_maximal, x_corners, y_corners, resp_corners, corners_found, + ENQUEUE(kernel::non_maximal, x_corners, y_corners, resp_corners, corners_found, idims[0], idims[1], response, edge, corner_lim); getQueue().sync(); diff --git a/src/backend/cpu/svd.cpp b/src/backend/cpu/svd.cpp index 92912ca616..3ce627c5f9 100644 --- a/src/backend/cpu/svd.cpp +++ b/src/backend/cpu/svd.cpp @@ -15,8 +15,7 @@ #if defined(WITH_CPU_LINEAR_ALGEBRA) #include #include -#include -#include +#include namespace cpu { @@ -87,7 +86,7 @@ void svdInPlace(Array &s, Array &u, Array &vt, Array &in) s.get(), u.get(), 
u.strides()[1], vt.get(), vt.strides()[1], &superb[0]); #endif }; - getQueue().enqueue(func, s, u, vt, in); + ENQUEUE(func, s, u, vt, in); } template diff --git a/src/backend/cpu/tile.cpp b/src/backend/cpu/tile.cpp index 6526917d3a..9237a79eb9 100644 --- a/src/backend/cpu/tile.cpp +++ b/src/backend/cpu/tile.cpp @@ -9,8 +9,7 @@ #include #include -#include -#include +#include #include namespace cpu @@ -31,7 +30,7 @@ Array tile(const Array &in, const af::dim4 &tileDims) Array out = createEmptyArray(oDims); - getQueue().enqueue(kernel::tile, out, in); + ENQUEUE(kernel::tile, out, in); return out; } diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp index fc7145854b..5874e7abd0 100644 --- a/src/backend/cpu/transform.cpp +++ b/src/backend/cpu/transform.cpp @@ -10,8 +10,7 @@ #include #include #include -#include -#include +#include #include "transform_interp.hpp" #include @@ -29,13 +28,13 @@ Array transform(const Array &in, const Array &transform, const af:: switch(method) { case AF_INTERP_NEAREST : - getQueue().enqueue(kernel::transform, out, in, transform, inverse); + ENQUEUE(kernel::transform, out, in, transform, inverse); break; case AF_INTERP_BILINEAR: - getQueue().enqueue(kernel::transform, out, in, transform, inverse); + ENQUEUE(kernel::transform, out, in, transform, inverse); break; case AF_INTERP_LOWER : - getQueue().enqueue(kernel::transform, out, in, transform, inverse); + ENQUEUE(kernel::transform, out, in, transform, inverse); break; default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); break; } diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp index 32663e1f94..c1d5d1d236 100644 --- a/src/backend/cpu/transpose.cpp +++ b/src/backend/cpu/transpose.cpp @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -33,7 +32,7 @@ Array transpose(const Array &in, const bool conjugate) // create an array with first two dimensions swapped Array out = 
createEmptyArray(outDims); - getQueue().enqueue(kernel::transpose, out, in, conjugate); + ENQUEUE(kernel::transpose, out, in, conjugate); return out; } @@ -42,7 +41,7 @@ template void transpose_inplace(Array &in, const bool conjugate) { in.eval(); - getQueue().enqueue(kernel::transpose_inplace, in, conjugate); + ENQUEUE(kernel::transpose_inplace, in, conjugate); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp index 2a9553c83a..fbc7f658d0 100644 --- a/src/backend/cpu/triangle.cpp +++ b/src/backend/cpu/triangle.cpp @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -22,7 +21,7 @@ namespace cpu template void triangle(Array &out, const Array &in) { - getQueue().enqueue(kernel::triangle, out, in); + ENQUEUE(kernel::triangle, out, in); } template diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp index 1aa37a4762..d40acde555 100644 --- a/src/backend/cpu/unwrap.cpp +++ b/src/backend/cpu/unwrap.cpp @@ -11,8 +11,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -37,9 +36,9 @@ Array unwrap(const Array &in, const dim_t wx, const dim_t wy, Array outArray = createEmptyArray(odims); if (is_column) { - getQueue().enqueue(kernel::unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); + ENQUEUE(kernel::unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); } else { - getQueue().enqueue(kernel::unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); + ENQUEUE(kernel::unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); } return outArray; diff --git a/src/backend/cpu/where.cpp b/src/backend/cpu/where.cpp index 018cbdfc36..734b768385 100644 --- a/src/backend/cpu/where.cpp +++ b/src/backend/cpu/where.cpp @@ -16,8 +16,7 @@ #include #include #include -#include -#include +#include using af::dim4; diff --git a/src/backend/cpu/wrap.cpp b/src/backend/cpu/wrap.cpp index 07487e0d68..87de234d36 100644 --- a/src/backend/cpu/wrap.cpp +++ 
b/src/backend/cpu/wrap.cpp @@ -11,8 +11,7 @@ #include #include #include -#include -#include +#include #include namespace cpu @@ -34,9 +33,9 @@ Array wrap(const Array &in, in.eval(); if (is_column) { - getQueue().enqueue(kernel::wrap_dim, out, in, wx, wy, sx, sy, px, py); + ENQUEUE(kernel::wrap_dim, out, in, wx, wy, sx, sy, px, py); } else { - getQueue().enqueue(kernel::wrap_dim, out, in, wx, wy, sx, sy, px, py); + ENQUEUE(kernel::wrap_dim, out, in, wx, wy, sx, sy, px, py); } return out; From 7dad2efd3940d12eee71e3092b9cc7f93e3e1212 Mon Sep 17 00:00:00 2001 From: pradeep Date: Mon, 28 Dec 2015 18:07:08 -0500 Subject: [PATCH 098/288] Removed obsolete queue sync in cpu::padArray fn --- src/backend/cpu/copy.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index 8085a0fdb5..91a1513fd9 100644 --- a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp @@ -60,8 +60,6 @@ Array padArray(Array const &in, dim4 const &dims, Array ret = createValueArray(dims, default_value); ret.eval(); in.eval(); - // FIXME: - getQueue().sync(); ENQUEUE(kernel::copy, ret, in, outType(default_value), factor); return ret; } From 90611a24093aaa95a7aadb6a4b60cd5d98857c8f Mon Sep 17 00:00:00 2001 From: pradeep Date: Mon, 28 Dec 2015 18:20:39 -0500 Subject: [PATCH 099/288] Fixed cmake condition for threads submodule check --- src/backend/cpu/CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt index c2b4e97cd2..b0ab17a616 100644 --- a/src/backend/cpu/CMakeLists.txt +++ b/src/backend/cpu/CMakeLists.txt @@ -48,16 +48,17 @@ IF(NOT UNIX) ENDIF() SET(THREADS_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/threads") -IF(EXISTS "${THREADS_SRC_DIR}" AND IS_DIRECTORY "${THREADS_SRC_DIR}") +IF(EXISTS "${THREADS_SRC_DIR}" AND IS_DIRECTORY "${THREADS_SRC_DIR}" + AND EXISTS "${THREADS_SRC_DIR}/LICENSE") # threads submodule has been initialized # Nothing to do 
-ELSE(EXISTS "${THREADS_SRC_DIR}" AND IS_DIRECTORY "${THREADS_SRC_DIR}") +ELSE() MESSAGE(STATUS "threads submodule unavailable. Updating submodules.") EXECUTE_PROCESS( COMMAND git submodule update --init --recursive WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} ) -ENDIF(EXISTS "${THREADS_SRC_DIR}" AND IS_DIRECTORY "${THREADS_SRC_DIR}") +ENDIF() INCLUDE_DIRECTORIES( ${CMAKE_INCLUDE_PATH} From 5ecdc54b53b21b831241b2ee442a8e36e8680254 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 29 Dec 2015 11:26:24 -0500 Subject: [PATCH 100/288] Added API support for perspective transform --- src/api/c/transform.cpp | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/api/c/transform.cpp b/src/api/c/transform.cpp index bacb008c78..ffd86dcd58 100644 --- a/src/api/c/transform.cpp +++ b/src/api/c/transform.cpp @@ -20,9 +20,9 @@ using namespace detail; template static inline af_array transform(const af_array in, const af_array tf, const af::dim4 &odims, - const af_interp_type method, const bool inverse) + const af_interp_type method, const bool inverse, const bool perspective) { - return getHandle(transform(getArray(in), getArray(tf), odims, method, inverse)); + return getHandle(transform(getArray(in), getArray(tf), odims, method, inverse, perspective)); } af_err af_transform(af_array *out, const af_array in, const af_array tf, @@ -41,10 +41,12 @@ af_err af_transform(af_array *out, const af_array in, const af_array tf, ARG_ASSERT(5, method == AF_INTERP_NEAREST || method == AF_INTERP_BILINEAR || method == AF_INTERP_LOWER); - DIM_ASSERT(2, (tdims[0] == 3 && tdims[1] == 2)); + DIM_ASSERT(2, (tdims[0] == 3 && (tdims[1] == 2 || tdims[1] == 3))); DIM_ASSERT(1, idims.elements() > 0); DIM_ASSERT(1, (idims.ndims() == 2 || idims.ndims() == 3)); + const bool perspective = (tdims[1] == 3) ? 
true : false; + dim_t o0 = odim0, o1 = odim1; dim_t o2 = idims[2] * tdims[2]; if (odim0 * odim1 == 0) { @@ -55,18 +57,18 @@ af_err af_transform(af_array *out, const af_array in, const af_array tf, af_array output = 0; switch(itype) { - case f32: output = transform(in, tf, odims, method, inverse); break; - case f64: output = transform(in, tf, odims, method, inverse); break; - case c32: output = transform(in, tf, odims, method, inverse); break; - case c64: output = transform(in, tf, odims, method, inverse); break; - case s32: output = transform(in, tf, odims, method, inverse); break; - case u32: output = transform(in, tf, odims, method, inverse); break; - case s64: output = transform(in, tf, odims, method, inverse); break; - case u64: output = transform(in, tf, odims, method, inverse); break; - case s16: output = transform(in, tf, odims, method, inverse); break; - case u16: output = transform(in, tf, odims, method, inverse); break; - case u8: output = transform(in, tf, odims, method, inverse); break; - case b8: output = transform(in, tf, odims, method, inverse); break; + case f32: output = transform(in, tf, odims, method, inverse, perspective); break; + case f64: output = transform(in, tf, odims, method, inverse, perspective); break; + case c32: output = transform(in, tf, odims, method, inverse, perspective); break; + case c64: output = transform(in, tf, odims, method, inverse, perspective); break; + case s32: output = transform(in, tf, odims, method, inverse, perspective); break; + case u32: output = transform(in, tf, odims, method, inverse, perspective); break; + case s64: output = transform(in, tf, odims, method, inverse, perspective); break; + case u64: output = transform(in, tf, odims, method, inverse, perspective); break; + case s16: output = transform(in, tf, odims, method, inverse, perspective); break; + case u16: output = transform(in, tf, odims, method, inverse, perspective); break; + case u8: output = transform(in, tf, odims, method, inverse, perspective); 
break; + case b8: output = transform(in, tf, odims, method, inverse, perspective); break; default: TYPE_ERROR(1, itype); } std::swap(*out,output); From 2a438713f9c6a42737a98722553147f1ed0b55bd Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 29 Dec 2015 11:27:22 -0500 Subject: [PATCH 101/288] Added perspective transform to CPU backend --- src/backend/cpu/rotate.cpp | 4 +- src/backend/cpu/transform.cpp | 72 +++++++++++++++++-------- src/backend/cpu/transform.hpp | 2 +- src/backend/cpu/transform_interp.hpp | 78 ++++++++++++++++++++-------- 4 files changed, 110 insertions(+), 46 deletions(-) diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/rotate.cpp index a4af64b669..9756323676 100644 --- a/src/backend/cpu/rotate.cpp +++ b/src/backend/cpu/rotate.cpp @@ -25,7 +25,7 @@ namespace cpu void (*t_fn)(T *, const T *, const float *, const af::dim4 &, const af::dim4 &, const af::dim4 &, - const dim_t, const dim_t, const dim_t, const dim_t); + const dim_t, const dim_t, const dim_t, const dim_t, const bool); const float c = cos(-theta), s = sin(-theta); float tx, ty; @@ -67,7 +67,7 @@ namespace cpu // Do transform for image for(int yy = 0; yy < (int)odims[1]; yy++) { for(int xx = 0; xx < (int)odims[0]; xx++) { - t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy); + t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy, false); } } } diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp index 68e8d96eba..bf072c3aaf 100644 --- a/src/backend/cpu/transform.cpp +++ b/src/backend/cpu/transform.cpp @@ -17,30 +17,52 @@ namespace cpu { template - void calc_affine_inverse(T *txo, const T *txi) + void calc_transform_inverse(T *txo, const T *txi, const bool perspective) { - T det = txi[0]*txi[4] - txi[1]*txi[3]; + if (perspective) { + txo[0] = txi[4]*txi[8] - txi[5]*txi[7]; + txo[1] = -(txi[1]*txi[8] - txi[2]*txi[7]); + txo[2] = txi[1]*txi[5] - txi[2]*txi[4]; - txo[0] = txi[4] / det; - txo[1] = txi[3] / det; - txo[3] 
= txi[1] / det; - txo[4] = txi[0] / det; + txo[3] = -(txi[3]*txi[8] - txi[5]*txi[6]); + txo[4] = txi[0]*txi[8] - txi[2]*txi[6]; + txo[5] = -(txi[0]*txi[5] - txi[2]*txi[3]); - txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; - txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; + txo[6] = txi[3]*txi[7] - txi[4]*txi[6]; + txo[7] = -(txi[0]*txi[7] - txi[1]*txi[6]); + txo[8] = txi[0]*txi[4] - txi[1]*txi[3]; + + T det = txi[0]*txo[0] + txi[1]*txo[3] + txi[2]*txo[6]; + + txo[0] /= det; txo[1] /= det; txo[2] /= det; + txo[3] /= det; txo[4] /= det; txo[5] /= det; + txo[6] /= det; txo[7] /= det; txo[8] /= det; + } + else { + T det = txi[0]*txi[4] - txi[1]*txi[3]; + + txo[0] = txi[4] / det; + txo[1] = txi[3] / det; + txo[3] = txi[1] / det; + txo[4] = txi[0] / det; + + txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; + txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; + } } template - void calc_affine_inverse(T *tmat, const T *tmat_ptr, const bool inverse) + void calc_transform_inverse(T *tmat, const T *tmat_ptr, const bool inverse, + const bool perspective, const unsigned transf_len) { // The way kernel is structured, it expects an inverse // transform matrix by default. 
// If it is an forward transform, then we need its inverse if(inverse) { - for(int i = 0; i < 6; i++) + for(int i = 0; i < (int)transf_len; i++) tmat[i] = tmat_ptr[i]; } else { - calc_affine_inverse(tmat, tmat_ptr); + calc_transform_inverse(tmat, tmat_ptr, perspective); } } @@ -48,7 +70,8 @@ namespace cpu void transform_(T *out, const T *in, const float *tf, const af::dim4 &odims, const af::dim4 &idims, const af::dim4 &ostrides, const af::dim4 &istrides, - const af::dim4 &tstrides, const bool inverse) + const af::dim4 &tstrides, const bool inverse, + const bool perspective) { dim_t nimages = idims[2]; // Multiplied in src/backend/transform.cpp @@ -56,7 +79,7 @@ namespace cpu void (*t_fn)(T *, const T *, const float *, const af::dim4 &, const af::dim4 &, const af::dim4 &, - const dim_t, const dim_t, const dim_t, const dim_t); + const dim_t, const dim_t, const dim_t, const dim_t, const bool); switch(method) { case AF_INTERP_NEAREST: @@ -73,13 +96,14 @@ namespace cpu break; } + const int transf_len = (perspective) ? 
9 : 6; // For each transform channel for(int t_idx = 0; t_idx < (int)ntransforms; t_idx++) { // Compute inverse if required - const float *tmat_ptr = tf + t_idx * 6; - float tmat[6]; - calc_affine_inverse(tmat, tmat_ptr, inverse); + const float *tmat_ptr = tf + t_idx * transf_len; + float* tmat = new float[transf_len]; + calc_transform_inverse(tmat, tmat_ptr, inverse, perspective, transf_len); // Offset for output pointer dim_t o_offset = t_idx * nimages * ostrides[2]; @@ -87,15 +111,16 @@ namespace cpu // Do transform for image for(int yy = 0; yy < (int)odims[1]; yy++) { for(int xx = 0; xx < (int)odims[0]; xx++) { - t_fn(out, in, tmat, idims, ostrides, istrides, nimages, o_offset, xx, yy); + t_fn(out, in, tmat, idims, ostrides, istrides, nimages, o_offset, xx, yy, perspective); } } + delete[] tmat; } } template Array transform(const Array &in, const Array &transform, const af::dim4 &odims, - const af_interp_type method, const bool inverse) + const af_interp_type method, const bool inverse, const bool perspective) { const af::dim4 idims = in.dims(); @@ -105,17 +130,20 @@ namespace cpu case AF_INTERP_NEAREST: transform_ (out.get(), in.get(), transform.get(), odims, idims, - out.strides(), in.strides(), transform.strides(), inverse); + out.strides(), in.strides(), transform.strides(), inverse, + perspective); break; case AF_INTERP_BILINEAR: transform_ (out.get(), in.get(), transform.get(), odims, idims, - out.strides(), in.strides(), transform.strides(), inverse); + out.strides(), in.strides(), transform.strides(), inverse, + perspective); break; case AF_INTERP_LOWER: transform_ (out.get(), in.get(), transform.get(), odims, idims, - out.strides(), in.strides(), transform.strides(), inverse); + out.strides(), in.strides(), transform.strides(), inverse, + perspective); break; default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); @@ -129,7 +157,7 @@ namespace cpu #define INSTANTIATE(T) \ template Array transform(const Array &in, const Array &transform, \ 
const af::dim4 &odims, const af_interp_type method, \ - const bool inverse); + const bool inverse, const bool perspective); INSTANTIATE(float) diff --git a/src/backend/cpu/transform.hpp b/src/backend/cpu/transform.hpp index f9e730b1d4..ad4ebba5c3 100644 --- a/src/backend/cpu/transform.hpp +++ b/src/backend/cpu/transform.hpp @@ -14,5 +14,5 @@ namespace cpu { template Array transform(const Array &in, const Array &tf, const af::dim4 &odims, - const af_interp_type method, const bool inverse); + const af_interp_type method, const bool inverse, const bool perspective); } diff --git a/src/backend/cpu/transform_interp.hpp b/src/backend/cpu/transform_interp.hpp index 5ad47507b2..dacd2e9a93 100644 --- a/src/backend/cpu/transform_interp.hpp +++ b/src/backend/cpu/transform_interp.hpp @@ -27,15 +27,27 @@ namespace cpu void transform_n(T *out, const T *in, const float *tmat, const af::dim4 &idims, const af::dim4 &ostrides, const af::dim4 &istrides, const dim_t nimages, const dim_t o_offset, - const dim_t xx, const dim_t yy) + const dim_t xx, const dim_t yy, const bool perspective) { + dim_t yi = 0, xi = 0; // Compute output index - const dim_t xi = round(xx * tmat[0] - + yy * tmat[1] - + tmat[2]); - const dim_t yi = round(xx * tmat[3] - + yy * tmat[4] - + tmat[5]); + if (perspective) { + const float W = xx * tmat[6] + yy * tmat[7] + tmat[8]; + xi = round((xx * tmat[0] + + yy * tmat[1] + + tmat[2]) / W); + yi = round((xx * tmat[3] + + yy * tmat[4] + + tmat[5]) / W); + } + else { + xi = round(xx * tmat[0] + + yy * tmat[1] + + tmat[2]); + yi = round(xx * tmat[3] + + yy * tmat[4] + + tmat[5]); + } // Compute memory location of indices dim_t loci = (yi * istrides[1] + xi); @@ -62,16 +74,28 @@ namespace cpu void transform_b(T *out, const T *in, const float *tmat, const af::dim4 &idims, const af::dim4 &ostrides, const af::dim4 &istrides, const dim_t nimages, const dim_t o_offset, - const dim_t xx, const dim_t yy) + const dim_t xx, const dim_t yy, const bool perspective) { dim_t loco = 
(yy * ostrides[1] + xx); // Compute input index - const float xi = xx * tmat[0] - + yy * tmat[1] - + tmat[2]; - const float yi = xx * tmat[3] - + yy * tmat[4] - + tmat[5]; + float xi = 0.0f, yi = 0.0f; + if (perspective) { + const float W = xx * tmat[6] + yy * tmat[7] + tmat[8]; + xi = (xx * tmat[0] + + yy * tmat[1] + + tmat[2]) / W; + yi = (xx * tmat[3] + + yy * tmat[4] + + tmat[5]) / W; + } + else { + xi = xx * tmat[0] + + yy * tmat[1] + + tmat[2]; + yi = xx * tmat[3] + + yy * tmat[4] + + tmat[5]; + } if (xi < -0.0001 || yi < -0.0001 || idims[0] < xi || idims[1] < yi) { for(int i_idx = 0; i_idx < (int)nimages; i_idx++) { @@ -126,15 +150,27 @@ namespace cpu void transform_l(T *out, const T *in, const float *tmat, const af::dim4 &idims, const af::dim4 &ostrides, const af::dim4 &istrides, const dim_t nimages, const dim_t o_offset, - const dim_t xx, const dim_t yy) + const dim_t xx, const dim_t yy, const bool perspective) { // Compute output index - const dim_t xi = floor(xx * tmat[0] - + yy * tmat[1] - + tmat[2]); - const dim_t yi = floor(xx * tmat[3] - + yy * tmat[4] - + tmat[5]); + dim_t xi = 0, yi = 0; + if (perspective) { + const float W = xx * tmat[6] + yy * tmat[7] + tmat[8]; + xi = floor((xx * tmat[0] + + yy * tmat[1] + + tmat[2]) / W); + yi = floor((xx * tmat[3] + + yy * tmat[4] + + tmat[5]) / W); + } + else { + xi = floor(xx * tmat[0] + + yy * tmat[1] + + tmat[2]); + yi = floor(xx * tmat[3] + + yy * tmat[4] + + tmat[5]); + } // Compute memory location of indices dim_t loci = (yi * istrides[1] + xi); From 7fdfe3e6437b507d11290a76d02bc2801f6a9663 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 29 Dec 2015 11:28:48 -0500 Subject: [PATCH 102/288] Added perspective transform to CUDA backend --- src/backend/cuda/kernel/rotate.hpp | 6 +- src/backend/cuda/kernel/transform.hpp | 79 +++++++++++++------- src/backend/cuda/kernel/transform_interp.hpp | 65 ++++++++++++---- src/backend/cuda/transform.cu | 10 +-- src/backend/cuda/transform.hpp | 3 +- 5 
files changed, 116 insertions(+), 47 deletions(-) diff --git a/src/backend/cuda/kernel/rotate.hpp b/src/backend/cuda/kernel/rotate.hpp index d63f010c3b..3cea7f2698 100644 --- a/src/backend/cuda/kernel/rotate.hpp +++ b/src/backend/cuda/kernel/rotate.hpp @@ -60,11 +60,11 @@ namespace cuda switch(method) { case AF_INTERP_NEAREST: - transform_n(optr, out, iptr, in, t.tmat, xx, yy, limages); break; + transform_n(optr, out, iptr, in, t.tmat, xx, yy, limages, false); break; case AF_INTERP_BILINEAR: - transform_b(optr, out, iptr, in, t.tmat, xx, yy, limages); break; + transform_b(optr, out, iptr, in, t.tmat, xx, yy, limages, false); break; case AF_INTERP_LOWER: - transform_l(optr, out, iptr, in, t.tmat, xx, yy, limages); break; + transform_l(optr, out, iptr, in, t.tmat, xx, yy, limages, false); break; default: break; } } diff --git a/src/backend/cuda/kernel/transform.hpp b/src/backend/cuda/kernel/transform.hpp index 07be0a35b3..599e62cf9d 100644 --- a/src/backend/cuda/kernel/transform.hpp +++ b/src/backend/cuda/kernel/transform.hpp @@ -24,21 +24,42 @@ namespace cuda // Used for batching images static const unsigned TI = 4; - __constant__ float c_tmat[6 * 256]; + __constant__ float c_tmat[9 * 256]; template __host__ __device__ - void calc_affine_inverse(T *txo, const T *txi) + void calc_transf_inverse(T *txo, const T *txi, const bool perspective) { - T det = txi[0]*txi[4] - txi[1]*txi[3]; - - txo[0] = txi[4] / det; - txo[1] = txi[3] / det; - txo[3] = txi[1] / det; - txo[4] = txi[0] / det; - - txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; - txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; + if (perspective) { + txo[0] = txi[4]*txi[8] - txi[5]*txi[7]; + txo[1] = -(txi[1]*txi[8] - txi[2]*txi[7]); + txo[2] = txi[1]*txi[5] - txi[2]*txi[4]; + + txo[3] = -(txi[3]*txi[8] - txi[5]*txi[6]); + txo[4] = txi[0]*txi[8] - txi[2]*txi[6]; + txo[5] = -(txi[0]*txi[5] - txi[2]*txi[3]); + + txo[6] = txi[3]*txi[7] - txi[4]*txi[6]; + txo[7] = -(txi[0]*txi[7] - txi[1]*txi[6]); + txo[8] = 
txi[0]*txi[4] - txi[1]*txi[3]; + + T det = txi[0]*txo[0] + txi[1]*txo[3] + txi[2]*txo[6]; + + txo[0] /= det; txo[1] /= det; txo[2] /= det; + txo[3] /= det; txo[4] /= det; txo[5] /= det; + txo[6] /= det; txo[7] /= det; txo[8] /= det; + } + else { + T det = txi[0]*txi[4] - txi[1]*txi[3]; + + txo[0] = txi[4] / det; + txo[1] = txi[3] / det; + txo[3] = txi[1] / det; + txo[4] = txi[0] / det; + + txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; + txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; + } } /////////////////////////////////////////////////////////////////////////// @@ -47,7 +68,8 @@ namespace cuda template __global__ static void transform_kernel(Param out, CParam in, const int nimages, - const int ntransforms, const int blocksXPerImage) + const int ntransforms, const int blocksXPerImage, + const int transf_len, const bool perspective) { // Compute which image set const int setId = blockIdx.x / blocksXPerImage; @@ -77,30 +99,32 @@ namespace cuda const T *iptr = in.ptr + setId * nimages * in.strides[2]; // Transform is in constant memory. 
- const float *tmat_ptr = c_tmat + t_idx * 6; - float tmat[6]; + const float *tmat_ptr = c_tmat + t_idx * transf_len; + float* tmat = new float[transf_len]; // We expect a inverse transform matrix by default // If it is an forward transform, then we need its inverse if(inverse) { - #pragma unroll - for(int i = 0; i < 6; i++) + #pragma unroll 3 + for(int i = 0; i < transf_len; i++) tmat[i] = tmat_ptr[i]; } else { - calc_affine_inverse(tmat, tmat_ptr); + calc_transf_inverse(tmat, tmat_ptr, perspective); } if (xido >= out.dims[0] && yido >= out.dims[1]) return; switch(method) { case AF_INTERP_NEAREST: - transform_n(optr, out, iptr, in, tmat, xido, yido, limages); break; + transform_n(optr, out, iptr, in, tmat, xido, yido, limages, perspective); break; case AF_INTERP_BILINEAR: - transform_b(optr, out, iptr, in, tmat, xido, yido, limages); break; + transform_b(optr, out, iptr, in, tmat, xido, yido, limages, perspective); break; case AF_INTERP_LOWER: - transform_l(optr, out, iptr, in, tmat, xido, yido, limages); break; + transform_l(optr, out, iptr, in, tmat, xido, yido, limages, perspective); break; default: break; } + + delete[] tmat; } /////////////////////////////////////////////////////////////////////////// @@ -108,15 +132,18 @@ namespace cuda /////////////////////////////////////////////////////////////////////////// template void transform(Param out, CParam in, CParam tf, - const bool inverse) + const bool inverse, const bool perspective) { int nimages = in.dims[2]; // Multiplied in src/backend/transform.cpp const int ntransforms = out.dims[2] / in.dims[2]; + + const int transf_len = (perspective) ? 9 : 6; + // Copy transform to constant memory. 
- CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0, - cudaMemcpyDeviceToDevice, + CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr, ntransforms * transf_len * sizeof(float), + 0, cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId()))); dim3 threads(TX, TY, 1); @@ -133,10 +160,12 @@ namespace cuda if(inverse) { CUDA_LAUNCH((transform_kernel), blocks, threads, - out, in, nimages, ntransforms, blocksXPerImage); + out, in, nimages, ntransforms, blocksXPerImage, + transf_len, perspective); } else { CUDA_LAUNCH((transform_kernel), blocks, threads, - out, in, nimages, ntransforms, blocksXPerImage); + out, in, nimages, ntransforms, blocksXPerImage, + transf_len, perspective); } POST_LAUNCH_CHECK(); } diff --git a/src/backend/cuda/kernel/transform_interp.hpp b/src/backend/cuda/kernel/transform_interp.hpp index 5a88fc4d76..1554b8ec62 100644 --- a/src/backend/cuda/kernel/transform_interp.hpp +++ b/src/backend/cuda/kernel/transform_interp.hpp @@ -42,15 +42,28 @@ namespace cuda template __device__ void transform_n(T *optr, Param out, const T *iptr, CParam in, const float *tmat, - const int xido, const int yido, const int nimages) + const int xido, const int yido, const int nimages, + const bool perspective) { // Compute input index - int xidi = round(xido * tmat[0] + int xidi = 0, yidi = 0; + if (perspective) { + const float W = xido * tmat[6] + yido * tmat[7] + tmat[8]; + xidi = round((xido * tmat[0] + + yido * tmat[1] + + tmat[2]) / W); + yidi = round((xido * tmat[3] + + yido * tmat[4] + + tmat[5]) / W); + } + else { + xidi = round(xido * tmat[0] + yido * tmat[1] + tmat[2]); - int yidi = round(xido * tmat[3] + yidi = round(xido * tmat[3] + yido * tmat[4] + tmat[5]); + } // Makes scale give same output as resize // But fails rotate tests @@ -76,17 +89,30 @@ namespace cuda template __device__ void transform_b(T *optr, Param out, const T *iptr, CParam in, const float *tmat, - const int xido, const int yido, const int nimages) 
+ const int xido, const int yido, const int nimages, + const bool perspective) { const int loco = (yido * out.strides[1] + xido); // Compute input index - const float xidi = xido * tmat[0] - + yido * tmat[1] - + tmat[2]; - const float yidi = xido * tmat[3] - + yido * tmat[4] - + tmat[5]; + float xidi = 0.0f, yidi = 0.0f; + if (perspective) { + const float W = xido * tmat[6] + yido * tmat[7] + tmat[8]; + xidi = (xido * tmat[0] + + yido * tmat[1] + + tmat[2]) / W; + yidi = (xido * tmat[3] + + yido * tmat[4] + + tmat[5]) / W; + } + else { + xidi = xido * tmat[0] + + yido * tmat[1] + + tmat[2]; + yidi = xido * tmat[3] + + yido * tmat[4] + + tmat[5]; + } if (xidi < -0.0001 || yidi < -0.0001 || in.dims[0] < xidi || in.dims[1] < yidi) { for(int i = 0; i < nimages; i++) { @@ -133,15 +159,28 @@ namespace cuda template __device__ void transform_l(T *optr, Param out, const T *iptr, CParam in, const float *tmat, - const int xido, const int yido, const int nimages) + const int xido, const int yido, const int nimages, + const bool perspective) { // Compute input index - int xidi = floor(xido * tmat[0] + int xidi = 0, yidi = 0; + if (perspective) { + const float W = xido * tmat[6] + yido * tmat[7] + tmat[8]; + xidi = floor((xido * tmat[0] + + yido * tmat[1] + + tmat[2]) / W); + yidi = floor((xido * tmat[3] + + yido * tmat[4] + + tmat[5]) / W); + } + else { + xidi = floor(xido * tmat[0] + yido * tmat[1] + tmat[2]); - int yidi = floor(xido * tmat[3] + yidi = floor(xido * tmat[3] + yido * tmat[4] + tmat[5]); + } // Makes scale give same output as resize // But fails rotate tests diff --git a/src/backend/cuda/transform.cu b/src/backend/cuda/transform.cu index 853617c0a4..07c312353c 100644 --- a/src/backend/cuda/transform.cu +++ b/src/backend/cuda/transform.cu @@ -16,7 +16,7 @@ namespace cuda { template Array transform(const Array &in, const Array &transform, const af::dim4 &odims, - const af_interp_type method, const bool inverse) + const af_interp_type method, const bool inverse, 
const bool perspective) { const af::dim4 idims = in.dims(); @@ -24,13 +24,13 @@ namespace cuda switch(method) { case AF_INTERP_NEAREST: - kernel::transform (out, in, transform, inverse); + kernel::transform (out, in, transform, inverse, perspective); break; case AF_INTERP_BILINEAR: - kernel::transform(out, in, transform, inverse); + kernel::transform(out, in, transform, inverse, perspective); break; case AF_INTERP_LOWER: - kernel::transform (out, in, transform, inverse); + kernel::transform (out, in, transform, inverse, perspective); break; default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); @@ -43,7 +43,7 @@ namespace cuda #define INSTANTIATE(T) \ template Array transform(const Array &in, const Array &transform, \ const af::dim4 &odims, const af_interp_type method, \ - const bool inverse); + const bool inverse, const bool perspective); INSTANTIATE(float) INSTANTIATE(double) diff --git a/src/backend/cuda/transform.hpp b/src/backend/cuda/transform.hpp index eb3d71d097..316953d614 100644 --- a/src/backend/cuda/transform.hpp +++ b/src/backend/cuda/transform.hpp @@ -14,5 +14,6 @@ namespace cuda { template Array transform(const Array &in, const Array &tf, const af::dim4 &odims, - const af_interp_type method, const bool inverse); + const af_interp_type method, const bool inverse, + const bool perspective); } From 15b9ad6ae46e76bb086a6bf136ffce0bf147a8b0 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 29 Dec 2015 11:30:59 -0500 Subject: [PATCH 103/288] Added perspective transform to OpenCL backend --- src/backend/opencl/kernel/transform.cl | 32 ++++- src/backend/opencl/kernel/transform.hpp | 14 ++- src/backend/opencl/kernel/transform_interp.cl | 69 ++++++++--- src/backend/opencl/transform.cpp | 110 ++++++++++++------ src/backend/opencl/transform.hpp | 2 +- 5 files changed, 161 insertions(+), 66 deletions(-) diff --git a/src/backend/opencl/kernel/transform.cl b/src/backend/opencl/kernel/transform.cl index 824f50cc5d..d746525ed6 100644 --- 
a/src/backend/opencl/kernel/transform.cl +++ b/src/backend/opencl/kernel/transform.cl @@ -11,9 +11,28 @@ #define BILINEAR transform_b #define LOWER transform_l -void calc_affine_inverse(float* txo, __global const float* txi) +void calc_transf_inverse(float* txo, __global const float* txi) { - float det = txi[0]*txi[4] - txi[1]*txi[3]; +#if PERSPECTIVE + txo[0] = txi[4]*txi[8] - txi[5]*txi[7]; + txo[1] = -(txi[1]*txi[8] - txi[2]*txi[7]); + txo[2] = txi[1]*txi[5] - txi[2]*txi[4]; + + txo[3] = -(txi[3]*txi[8] - txi[5]*txi[6]); + txo[4] = txi[0]*txi[8] - txi[2]*txi[6]; + txo[5] = -(txi[0]*txi[5] - txi[2]*txi[3]); + + txo[6] = txi[3]*txi[7] - txi[4]*txi[6]; + txo[7] = -(txi[0]*txi[7] - txi[1]*txi[6]); + txo[8] = txi[0]*txi[4] - txi[1]*txi[3]; + + T det = txi[0]*txo[0] + txi[1]*txo[3] + txi[2]*txo[6]; + + txo[0] /= det; txo[1] /= det; txo[2] /= det; + txo[3] /= det; txo[4] /= det; txo[5] /= det; + txo[6] /= det; txo[7] /= det; txo[8] /= det; +#else + T det = txi[0]*txi[4] - txi[1]*txi[3]; txo[0] = txi[4] / det; txo[1] = txi[3] / det; @@ -22,6 +41,7 @@ void calc_affine_inverse(float* txo, __global const float* txi) txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1]; txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4]; +#endif } __kernel @@ -59,17 +79,17 @@ void transform_kernel(__global T *d_out, const KParam out, // Transform is in global memory. // Needs offset to correct transform being processed. 
- __global const float *tmat_ptr = c_tmat + t_idx * 6; - float tmat[6]; + __global const float *tmat_ptr = c_tmat + t_idx * TRANSF_LEN; + float tmat[TRANSF_LEN]; // We expect a inverse transform matrix by default // If it is an forward transform, then we need its inverse if(INVERSE == 1) { #pragma unroll - for(int i = 0; i < 6; i++) + for(int i = 0; i < TRANSF_LEN; i++) tmat[i] = tmat_ptr[i]; } else { - calc_affine_inverse(tmat, tmat_ptr); + calc_transf_inverse(tmat, tmat_ptr); } if (xido >= out.dims[0] && yido >= out.dims[1]) return; diff --git a/src/backend/opencl/kernel/transform.hpp b/src/backend/opencl/kernel/transform.hpp index 677acc31fe..f78c7b0ebe 100644 --- a/src/backend/opencl/kernel/transform.hpp +++ b/src/backend/opencl/kernel/transform.hpp @@ -50,7 +50,7 @@ namespace opencl >::type; - template + template void transform(Param out, const Param in, const Param tf) { try { @@ -64,11 +64,13 @@ namespace opencl std::call_once( compileFlags[device], [device] () { ToNum toNum; std::ostringstream options; - options << " -D T=" << dtype_traits::getName() - << " -D INVERSE=" << (isInverse ? 1 : 0) - << " -D ZERO=" << toNum(scalar(0)); - options << " -D VT=" << dtype_traits>::getName(); - options << " -D WT=" << dtype_traits>::getName(); + options << " -D T=" << dtype_traits::getName() + << " -D INVERSE=" << (isInverse ? 1 : 0) + << " -D PERSPECTIVE=" << (isPerspective ? 1 : 0) + << " -D TRANSF_LEN=" << (isPerspective ? 
9 : 6) + << " -D ZERO=" << toNum(scalar(0)); + options << " -D VT=" << dtype_traits>::getName(); + options << " -D WT=" << dtype_traits>::getName(); if((af_dtype) dtype_traits::af_type == c32 || (af_dtype) dtype_traits::af_type == c64) { diff --git a/src/backend/opencl/kernel/transform_interp.cl b/src/backend/opencl/kernel/transform_interp.cl index 1d82951b9d..a083df0ff6 100644 --- a/src/backend/opencl/kernel/transform_interp.cl +++ b/src/backend/opencl/kernel/transform_interp.cl @@ -25,12 +25,23 @@ void transform_n(__global T *d_out, const KParam out, __global const T *d_in, co const float *tmat, const int xido, const int yido, const int nimages) { // Compute input index - const int xidi = round(xido * tmat[0] - + yido * tmat[1] - + tmat[2]); - const int yidi = round(xido * tmat[3] - + yido * tmat[4] - + tmat[5]); + int xidi = 0, yidi = 0; +#if PERSPECTIVE + const float W = xido * tmat[6] + yido * tmat[7] + tmat[8]; + xidi = round((xido * tmat[0] + + yido * tmat[1] + + tmat[2]) / W); + yidi = round((xido * tmat[3] + + yido * tmat[4] + + tmat[5]) / W); +#else + xidi = round(xido * tmat[0] + + yido * tmat[1] + + tmat[2]); + yidi = round(xido * tmat[3] + + yido * tmat[4] + + tmat[5]); +#endif // Compute memory location of indices const int loci = yidi * in.strides[1] + xidi; @@ -54,12 +65,23 @@ void transform_b(__global T *d_out, const KParam out, __global const T *d_in, co const int loco = (yido * out.strides[1] + xido); // Compute input index - const float xid = xido * tmat[0] - + yido * tmat[1] - + tmat[2]; - const float yid = xido * tmat[3] - + yido * tmat[4] - + tmat[5]; + float xid = 0.0f, yid = 0.0f; +#if PERSPECTIVE + const float W = xido * tmat[6] + yido * tmat[7] + tmat[8]; + xid = (xido * tmat[0] + + yido * tmat[1] + + tmat[2]) / W; + yid = (xido * tmat[3] + + yido * tmat[4] + + tmat[5]) / W; +#else + xid = xido * tmat[0] + + yido * tmat[1] + + tmat[2]; + yid = xido * tmat[3] + + yido * tmat[4] + + tmat[5]; +#endif T zero = ZERO; if (xid < -0.001 || yid < 
-0.001 || in.dims[0] < xid || in.dims[1] < yid) { @@ -104,12 +126,23 @@ void transform_l(__global T *d_out, const KParam out, __global const T *d_in, co const float *tmat, const int xido, const int yido, const int nimages) { // Compute input index - const int xidi = floor(xido * tmat[0] - + yido * tmat[1] - + tmat[2]); - const int yidi = floor(xido * tmat[3] - + yido * tmat[4] - + tmat[5]); + int xidi = 0, yidi = 0; +#if PERSPECTIVE + const float W = xido * tmat[6] + yido * tmat[7] + tmat[8]; + xidi = floor((xido * tmat[0] + + yido * tmat[1] + + tmat[2]) / W); + yidi = floor((xido * tmat[3] + + yido * tmat[4] + + tmat[5]) / W); +#else + xidi = floor(xido * tmat[0] + + yido * tmat[1] + + tmat[2]); + yidi = floor(xido * tmat[3] + + yido * tmat[4] + + tmat[5]); +#endif // Compute memory location of indices const int loci = yidi * in.strides[1] + xidi; diff --git a/src/backend/opencl/transform.cpp b/src/backend/opencl/transform.cpp index c8e2b69a8b..379fd2a5b7 100644 --- a/src/backend/opencl/transform.cpp +++ b/src/backend/opencl/transform.cpp @@ -18,46 +18,86 @@ namespace opencl { template Array transform(const Array &in, const Array &transform, - const af::dim4 &odims, - const af_interp_type method, const bool inverse) + const af::dim4 &odims, const af_interp_type method, + const bool inverse, const bool perspective) { Array out = createEmptyArray(odims); if(inverse) { - switch(method) { - case AF_INTERP_NEAREST: - kernel::transform - (out, in, transform); - break; - case AF_INTERP_BILINEAR: - kernel::transform - (out, in, transform); - break; - case AF_INTERP_LOWER: - kernel::transform - (out, in, transform); - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; + if (perspective) { + switch(method) { + case AF_INTERP_NEAREST: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_BILINEAR: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_LOWER: + kernel::transform + (out, in, transform); + break; 
+ default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; + } + } else { + switch(method) { + case AF_INTERP_NEAREST: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_BILINEAR: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_LOWER: + kernel::transform + (out, in, transform); + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; + } } } else { - switch(method) { - case AF_INTERP_NEAREST: - kernel::transform - (out, in, transform); - break; - case AF_INTERP_BILINEAR: - kernel::transform - (out, in, transform); - break; - case AF_INTERP_LOWER: - kernel::transform - (out, in, transform); - break; - default: - AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); - break; + if (perspective) { + switch(method) { + case AF_INTERP_NEAREST: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_BILINEAR: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_LOWER: + kernel::transform + (out, in, transform); + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; + } + } else { + switch(method) { + case AF_INTERP_NEAREST: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_BILINEAR: + kernel::transform + (out, in, transform); + break; + case AF_INTERP_LOWER: + kernel::transform + (out, in, transform); + break; + default: + AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); + break; + } } } @@ -68,7 +108,7 @@ namespace opencl #define INSTANTIATE(T) \ template Array transform(const Array &in, const Array &transform, \ const af::dim4 &odims, const af_interp_type method, \ - const bool inverse); + const bool inverse, const bool perspective); INSTANTIATE(float) INSTANTIATE(double) diff --git a/src/backend/opencl/transform.hpp b/src/backend/opencl/transform.hpp index f0b4d4c955..064817a537 100644 --- a/src/backend/opencl/transform.hpp +++ b/src/backend/opencl/transform.hpp @@ -14,5 +14,5 @@ namespace 
opencl { template Array transform(const Array &in, const Array &tf, const af::dim4 &odims, - const af_interp_type method, const bool inverse); + const af_interp_type method, const bool inverse, const bool perspective); } From 81dca062d9691905c8f13291f3ccd6b66186859d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 29 Dec 2015 11:36:31 -0500 Subject: [PATCH 104/288] Updated transform documentation --- docs/details/image.dox | 47 ++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/docs/details/image.dox b/docs/details/image.dox index 234f4f72e9..53ac7616fc 100644 --- a/docs/details/image.dox +++ b/docs/details/image.dox @@ -501,10 +501,12 @@ grad(dx, dy, in); Resize an input image -Resizing an input image can be done using either \ref AF_INTERP_NEAREST or -\ref AF_INTERP_BILINEAR interpolations. Nearest interpolation will pick the -nearest value to the location, whereas bilinear interpolation will do a -weighted interpolation for calculate the new size. +Resizing an input image can be done using either \ref AF_INTERP_NEAREST, +\ref AF_INTERP_BILINEAR or \ref AF_INTERP_LOWER interpolations. Nearest +interpolation will pick the nearest value to the location, bilinear +interpolation will do a weighted interpolation to calculate the new size +and lower interpolation is similar to the nearest, except it will use the +floor function to get the lower neighbor. This function does not differentiate between images and data. As long as the array is defined and the output dimensions are not 0, it will resize any @@ -556,10 +558,10 @@ Rotate an input image The angle theta is in radians. -Rotating an input image can be done using either \ref AF_INTERP_NEAREST or -\ref AF_INTERP_BILINEAR interpolations. Nearest interpolation will pick the -nearest value to the location, whereas bilinear interpolation will do a -weighted interpolation for calculate the new size.
+Rotating an input image can be done using \ref AF_INTERP_NEAREST, +\ref AF_INTERP_BILINEAR or \ref AF_INTERP_LOWER interpolations. Nearest +interpolation will pick the nearest value to the location, whereas bilinear +interpolation will do a weighted interpolation to calculate the new size. This function does not differentiate between images and data. As long as the array is defined, it will rotate any type or size of array. @@ -659,22 +661,35 @@ Skew is a special case of the \ref af::transform function. Transform an input image -The transform function uses an affine transform matrix to tranform an input +The transform function uses an affine or perspective transform matrix to transform an input image into a new one. -The transform matrix \p tf is a 3x2 matrix of type float. The matrix operation -is applied to each location (x, y) that is then transformed to (x', y') of the +If matrix \p tf is a 3x2 matrix, an affine transformation will be performed. The matrix +operation is applied to each location (x, y) that is then transformed to (x', y') of the new array. Hence the transformation is an element-wise operation. -The operation is as below: -tf = [r00 r10 - r01 r11 +The operation is as below:\n +tf = [r00 r10\n + r01 r11\n t0 t1] -x' = x * r00 + y * r01 + t0; +x' = x * r00 + y * r01 + t0;\n y' = x * r10 + y * r11 + t1; -Interpolation types of \ref AF_INTERP_NEAREST and \ref AF_INTERP_BILINEAR are allowed. +If matrix \p tf is a 3x3 matrix, a perspective transformation will be performed. + +The operation is as below:\n +tf = [r00 r10 r20\n + r01 r11 r21\n + t0 t1 t2] + +x' = (x * r00 + y * r01 + t0) / (x * r20 + y * r21 + t2);\n +y' = (x * r10 + y * r11 + t1) / (x * r20 + y * r21 + t2); + +The transformation matrix \p tf should always be of type f32. + +Interpolation types of \ref AF_INTERP_NEAREST, \ref AF_INTERP_BILINEAR and +\ref AF_INTERP_LOWER are allowed. Affine transforms can be used for various purposes.
\ref af::translate, \ref af::scale and \ref af::skew are specializations of the transform function. From 8e4e766b717e85cd7c7b477bf94e9dd249d1e037 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 29 Dec 2015 12:33:46 -0500 Subject: [PATCH 105/288] Added perspective transform unit tests --- test/transform.cpp | 267 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 test/transform.cpp diff --git a/test/transform.cpp b/test/transform.cpp new file mode 100644 index 0000000000..fa0006cbf2 --- /dev/null +++ b/test/transform.cpp @@ -0,0 +1,267 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +using std::vector; +using std::string; +using std::cout; +using std::endl; + +template +class Transform : public ::testing::Test +{ + public: + virtual void SetUp() {} +}; + +template +class TransformInt : public ::testing::Test +{ + public: + virtual void SetUp() { + } +}; + +typedef ::testing::Types TestTypes; +typedef ::testing::Types TestTypesInt; + +TYPED_TEST_CASE(Transform, TestTypes); +TYPED_TEST_CASE(TransformInt, TestTypesInt); + +template +void transformTest(string pTestFile, string pHomographyFile, const af_interp_type method, const bool invert) +{ + if (noDoubleTests()) return; + + vector inNumDims; + vector inFiles; + vector goldNumDims; + vector goldFiles; + + readImageTests(pTestFile, inNumDims, inFiles, goldNumDims, goldFiles); + + inFiles[0].insert(0,string(TEST_DIR"/transform/")); + inFiles[1].insert(0,string(TEST_DIR"/transform/")); + goldFiles[0].insert(0,string(TEST_DIR"/transform/")); + + af::dim4 objDims = inNumDims[0]; + + 
vector HNumDims; + vector > HIn; + vector > HTests; + readTests(pHomographyFile, HNumDims, HIn, HTests); + + af::dim4 HDims = HNumDims[0]; + + af_array sceneArray_f32 = 0; + af_array goldArray_f32 = 0; + af_array outArray_f32 = 0; + af_array sceneArray = 0; + af_array goldArray = 0; + af_array outArray = 0; + af_array HArray = 0; + + ASSERT_EQ(AF_SUCCESS, af_load_image(&sceneArray_f32, inFiles[1].c_str(), false)); + ASSERT_EQ(AF_SUCCESS, af_load_image(&goldArray_f32, goldFiles[0].c_str(), false)); + + ASSERT_EQ(AF_SUCCESS, conv_image(&sceneArray, sceneArray_f32)); + ASSERT_EQ(AF_SUCCESS, conv_image(&goldArray, goldArray_f32)); + + ASSERT_EQ(AF_SUCCESS, af_create_array(&HArray, &(HIn[0].front()), HDims.ndims(), HDims.get(), f32)); + + ASSERT_EQ(AF_SUCCESS, af_transform(&outArray, sceneArray, HArray, objDims[0], objDims[1], method, invert)); + + // Get gold data + dim_t goldEl = 0; + ASSERT_EQ(AF_SUCCESS, af_get_elements(&goldEl, goldArray)); + T* goldData = new T[goldEl]; + ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)goldData, goldArray)); + + // Get result + dim_t outEl = 0; + ASSERT_EQ(AF_SUCCESS, af_get_elements(&outEl, outArray)); + T* outData = new T[outEl]; + ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outData, outArray)); + + const float thr = 1.1f; + + // Maximum number of wrong pixels must be <= 0.01% of number of elements, + // this metric is necessary due to rounding errors between different + // backends for AF_INTERP_NEAREST and AF_INTERP_LOWER + const size_t maxErr = goldEl * 0.0001f; + size_t err = 0; + + for (dim_t elIter = 0; elIter < goldEl; elIter++) { + err += fabs((float)floor(outData[elIter]) - (float)floor(goldData[elIter])) > thr; + if (err > maxErr) + ASSERT_LE(err, maxErr) << "at: " << elIter << std::endl; + } + + delete[] goldData; + delete[] outData; + + if(sceneArray_f32 != 0) af_release_array(sceneArray_f32); + if(goldArray_f32 != 0) af_release_array(goldArray_f32); + if(outArray_f32 != 0) af_release_array(outArray_f32); + 
if(sceneArray != 0) af_release_array(sceneArray); + if(goldArray != 0) af_release_array(goldArray); + if(outArray != 0) af_release_array(outArray); + if(HArray != 0) af_release_array(HArray); +} + +TYPED_TEST(Transform, PerspectiveNearest) +{ + transformTest(string(TEST_DIR"/transform/tux_nearest.test"), + string(TEST_DIR"/transform/tux_tmat.test"), + AF_INTERP_NEAREST, false); +} + +TYPED_TEST(Transform, PerspectiveBilinear) +{ + transformTest(string(TEST_DIR"/transform/tux_bilinear.test"), + string(TEST_DIR"/transform/tux_tmat.test"), + AF_INTERP_BILINEAR, false); +} + +TYPED_TEST(Transform, PerspectiveLower) +{ + transformTest(string(TEST_DIR"/transform/tux_lower.test"), + string(TEST_DIR"/transform/tux_tmat.test"), + AF_INTERP_LOWER, false); +} + +TYPED_TEST(Transform, PerspectiveNearestInvert) +{ + transformTest(string(TEST_DIR"/transform/tux_nearest.test"), + string(TEST_DIR"/transform/tux_tmat_inverse.test"), + AF_INTERP_NEAREST, true); +} + +TYPED_TEST(Transform, PerspectiveBilinearInvert) +{ + transformTest(string(TEST_DIR"/transform/tux_bilinear.test"), + string(TEST_DIR"/transform/tux_tmat_inverse.test"), + AF_INTERP_BILINEAR, true); +} + +TYPED_TEST(Transform, PerspectiveLowerInvert) +{ + transformTest(string(TEST_DIR"/transform/tux_lower.test"), + string(TEST_DIR"/transform/tux_tmat_inverse.test"), + AF_INTERP_LOWER, true); +} + +TYPED_TEST(TransformInt, PerspectiveNearest) +{ + transformTest(string(TEST_DIR"/transform/tux_nearest.test"), + string(TEST_DIR"/transform/tux_tmat.test"), + AF_INTERP_NEAREST, false); +} + +TYPED_TEST(TransformInt, PerspectiveBilinear) +{ + transformTest(string(TEST_DIR"/transform/tux_bilinear.test"), + string(TEST_DIR"/transform/tux_tmat.test"), + AF_INTERP_BILINEAR, false); +} + +TYPED_TEST(TransformInt, PerspectiveLower) +{ + transformTest(string(TEST_DIR"/transform/tux_lower.test"), + string(TEST_DIR"/transform/tux_tmat.test"), + AF_INTERP_LOWER, false); +} + +TYPED_TEST(TransformInt, PerspectiveNearestInvert) +{ + 
transformTest(string(TEST_DIR"/transform/tux_nearest.test"), + string(TEST_DIR"/transform/tux_tmat_inverse.test"), + AF_INTERP_NEAREST, true); +} + +TYPED_TEST(TransformInt, PerspectiveBilinearInvert) +{ + transformTest(string(TEST_DIR"/transform/tux_bilinear.test"), + string(TEST_DIR"/transform/tux_tmat_inverse.test"), + AF_INTERP_BILINEAR, true); +} + +TYPED_TEST(TransformInt, PerspectiveLowerInvert) +{ + transformTest(string(TEST_DIR"/transform/tux_lower.test"), + string(TEST_DIR"/transform/tux_tmat_inverse.test"), + AF_INTERP_LOWER, true); +} + + +///////////////////////////////////// CPP //////////////////////////////// +// +TEST(Transform, CPP) +{ + vector inDims; + vector inFiles; + vector goldDim; + vector goldFiles; + + vector HDims; + vector > HIn; + vector > HTests; + readTests(TEST_DIR"/transform/tux_tmat.test",HDims,HIn,HTests); + + readImageTests(string(TEST_DIR"/transform/tux_nearest.test"), inDims, inFiles, goldDim, goldFiles); + + inFiles[0].insert(0,string(TEST_DIR"/transform/")); + inFiles[1].insert(0,string(TEST_DIR"/transform/")); + + goldFiles[0].insert(0,string(TEST_DIR"/transform/")); + + af::array H = af::array(HDims[0][0], HDims[0][1], &(HIn[0].front())); + af::array IH = af::array(HDims[0][0], HDims[0][1], &(HIn[0].front())); + + af::array scene_img = af::loadImage(inFiles[1].c_str(), false); + + af::array gold_img = af::loadImage(goldFiles[0].c_str(), false); + + af::array out_img = af::transform(scene_img, IH, inDims[0][0], inDims[0][1], AF_INTERP_NEAREST, false); + + af::dim4 outDims = out_img.dims(); + af::dim4 goldDims = gold_img.dims(); + + float* h_out_img = new float[outDims[0] * outDims[1]]; + out_img.host(h_out_img); + float* h_gold_img = new float[goldDims[0] * goldDims[1]]; + gold_img.host(h_gold_img); + + const dim_t n = gold_img.elements(); + + const float thr = 1.0f; + + // Maximum number of wrong pixels must be <= 0.01% of number of elements, + // this metric is necessary due to rounding errors between different + // 
backends for AF_INTERP_NEAREST and AF_INTERP_LOWER + const size_t maxErr = n * 0.0001f; + size_t err = 0; + + for (dim_t elIter = 0; elIter < n; elIter++) { + err += fabs((int)h_out_img[elIter] - h_gold_img[elIter]) > thr; + if (err > maxErr) + ASSERT_LE(err, maxErr) << "at: " << elIter << std::endl; + } + + delete[] h_gold_img; + delete[] h_out_img; +} From 7327fb24b176f4ca3a3d3fe4835c5af0e56f5c95 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 29 Dec 2015 12:34:28 -0500 Subject: [PATCH 106/288] Updated test data --- test/data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data b/test/data index db4f6e8062..4a735db351 160000 --- a/test/data +++ b/test/data @@ -1 +1 @@ -Subproject commit db4f6e80629fb41580ab93208db6b8be958871df +Subproject commit 4a735db3515db3f8f914e0b69fa2e11add9cd50f From 27aeed060aa87e03b69f31e14558c468e2d9dcc9 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 29 Dec 2015 15:04:43 -0500 Subject: [PATCH 107/288] Fixed wrong data type in OpenCL transform --- src/backend/opencl/kernel/transform.cl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/opencl/kernel/transform.cl b/src/backend/opencl/kernel/transform.cl index d746525ed6..c44c18457a 100644 --- a/src/backend/opencl/kernel/transform.cl +++ b/src/backend/opencl/kernel/transform.cl @@ -26,13 +26,13 @@ void calc_transf_inverse(float* txo, __global const float* txi) txo[7] = -(txi[0]*txi[7] - txi[1]*txi[6]); txo[8] = txi[0]*txi[4] - txi[1]*txi[3]; - T det = txi[0]*txo[0] + txi[1]*txo[3] + txi[2]*txo[6]; + float det = txi[0]*txo[0] + txi[1]*txo[3] + txi[2]*txo[6]; txo[0] /= det; txo[1] /= det; txo[2] /= det; txo[3] /= det; txo[4] /= det; txo[5] /= det; txo[6] /= det; txo[7] /= det; txo[8] /= det; #else - T det = txi[0]*txi[4] - txi[1]*txi[3]; + float det = txi[0]*txi[4] - txi[1]*txi[3]; txo[0] = txi[4] / det; txo[1] = txi[3] / det; @@ -85,7 +85,7 @@ void transform_kernel(__global T *d_out, const KParam out, 
// We expect a inverse transform matrix by default // If it is an forward transform, then we need its inverse if(INVERSE == 1) { - #pragma unroll + #pragma unroll 3 for(int i = 0; i < TRANSF_LEN; i++) tmat[i] = tmat_ptr[i]; } else { From bdc31d04b810958cf714ae6fefd21ac76edaa861 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 29 Dec 2015 15:08:44 -0500 Subject: [PATCH 108/288] Simplified test for perspective transform in API --- src/api/c/transform.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api/c/transform.cpp b/src/api/c/transform.cpp index ffd86dcd58..785a05438e 100644 --- a/src/api/c/transform.cpp +++ b/src/api/c/transform.cpp @@ -45,7 +45,7 @@ af_err af_transform(af_array *out, const af_array in, const af_array tf, DIM_ASSERT(1, idims.elements() > 0); DIM_ASSERT(1, (idims.ndims() == 2 || idims.ndims() == 3)); - const bool perspective = (tdims[1] == 3) ? true : false; + const bool perspective = (tdims[1] == 3); dim_t o0 = odim0, o1 = odim1; dim_t o2 = idims[2] * tdims[2]; From 1cbffbbbca2aa94b6c01daed167b36534831273a Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Wed, 30 Dec 2015 10:09:15 -0500 Subject: [PATCH 109/288] Synchronize when AF_SYNCHRONOUS_CALLS is set to 1 --- .gitignore | 1 + CMakeLists.txt | 1 + src/api/unified/CMakeLists.txt | 1 + src/api/unified/symbol_manager.cpp | 19 ------------ src/api/unified/symbol_manager.hpp | 2 ++ src/backend/cpu/Array.hpp | 2 +- src/backend/cpu/CMakeLists.txt | 1 + src/backend/cpu/debug_cpu.hpp | 2 +- src/backend/cpu/platform.cpp | 6 ++-- src/backend/cpu/platform.hpp | 6 ++-- src/backend/cpu/queue.hpp | 42 ++++++++++++++++++++++++++ src/backend/cuda/CMakeLists.txt | 1 + src/backend/cuda/debug_cuda.hpp | 10 ++++-- src/backend/cuda/platform.cpp | 6 ++++ src/backend/cuda/platform.hpp | 3 ++ src/backend/opencl/CMakeLists.txt | 1 + src/backend/opencl/debug_opencl.hpp | 7 ++++- src/backend/opencl/kernel/convolve.hpp | 1 + src/backend/opencl/platform.cpp | 6 ++++ 
src/backend/opencl/platform.hpp | 2 ++ src/util.cpp | 40 ++++++++++++++++++++++++ src/util.hpp | 16 ++++++++++ 22 files changed, 146 insertions(+), 30 deletions(-) create mode 100644 src/backend/cpu/queue.hpp create mode 100644 src/util.cpp create mode 100644 src/util.hpp diff --git a/.gitignore b/.gitignore index 948b5962eb..d032d3d5dd 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ GPATH include/af/version.h src/backend/version.hpp docs/details/examples.dox +/TAGS diff --git a/CMakeLists.txt b/CMakeLists.txt index ea92cbec5d..fda27036fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,6 +110,7 @@ IF(BUILD_SIFT) ENDIF(BUILD_SIFT) INCLUDE_DIRECTORIES( + "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/src/backend" "${CMAKE_CURRENT_SOURCE_DIR}/src/api/c" diff --git a/src/api/unified/CMakeLists.txt b/src/api/unified/CMakeLists.txt index a4843bb49c..b6980d6bb3 100644 --- a/src/api/unified/CMakeLists.txt +++ b/src/api/unified/CMakeLists.txt @@ -15,6 +15,7 @@ FILE(GLOB cpp_sources SOURCE_GROUP(api\\cpp\\Sources FILES ${cpp_sources}) FILE(GLOB common_sources + "../../util.cpp" "../c/util.cpp" "../c/err_common.cpp" "../c/type_util.cpp" diff --git a/src/api/unified/symbol_manager.cpp b/src/api/unified/symbol_manager.cpp index 1139f99b3e..bc1f14b459 100644 --- a/src/api/unified/symbol_manager.cpp +++ b/src/api/unified/symbol_manager.cpp @@ -42,25 +42,6 @@ inline string getBkndLibName(const int backend_index) return LIB_AF_BKND_PREFIX + LIB_AF_BKND_NAME[i] + LIB_AF_BKND_SUFFIX; } -inline std::string getEnvVar(const std::string &key) -{ -#if defined(OS_WIN) - DWORD bufSize = 32767; // limit according to GetEnvironment Variable documentation - string retVal; - retVal.resize(bufSize); - bufSize = GetEnvironmentVariable(key.c_str(), &retVal[0], bufSize); - if (!bufSize) { - return string(""); - } else { - retVal.resize(bufSize); - return retVal; - } -#else - char * str = getenv(key.c_str()); - return 
str==NULL ? string("") : string(str); -#endif -} - /*flag parameter is not used on windows platform */ LibHandle openDynLibrary(const int bknd_idx, int flag=RTLD_LAZY) { diff --git a/src/api/unified/symbol_manager.hpp b/src/api/unified/symbol_manager.hpp index f4cf913ac6..eb33c20995 100644 --- a/src/api/unified/symbol_manager.hpp +++ b/src/api/unified/symbol_manager.hpp @@ -11,6 +11,8 @@ #include #include #include +#include + #if defined(OS_WIN) #include typedef HMODULE LibHandle; diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp index adb72dc6c5..e0709d36d3 100644 --- a/src/backend/cpu/Array.hpp +++ b/src/backend/cpu/Array.hpp @@ -21,7 +21,7 @@ #include #include #include -#include +#include // cpu::Array class forward declaration namespace cpu diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt index b0ab17a616..bf72a8a6fa 100644 --- a/src/backend/cpu/CMakeLists.txt +++ b/src/backend/cpu/CMakeLists.txt @@ -107,6 +107,7 @@ source_group(api\\c\\Headers FILES ${c_headers}) source_group(api\\c\\Sources FILES ${c_sources}) FILE(GLOB cpp_sources + "../../util.cpp" "../../api/cpp/*.cpp" ) diff --git a/src/backend/cpu/debug_cpu.hpp b/src/backend/cpu/debug_cpu.hpp index b1d8e17484..cbcdc2230a 100644 --- a/src/backend/cpu/debug_cpu.hpp +++ b/src/backend/cpu/debug_cpu.hpp @@ -9,7 +9,7 @@ #pragma once #include -#include +#include #include #ifndef NDEBUG diff --git a/src/backend/cpu/platform.cpp b/src/backend/cpu/platform.cpp index 98cfad4b53..6ae63a919e 100644 --- a/src/backend/cpu/platform.cpp +++ b/src/backend/cpu/platform.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -249,8 +250,9 @@ int getActiveDeviceId() static const int MAX_QUEUES = 1; -async_queue& getQueue(int idx) { - static std::array queues; + +queue& getQueue(int idx) { + static std::array queues; return queues[idx]; } diff --git a/src/backend/cpu/platform.hpp b/src/backend/cpu/platform.hpp index 10575520b5..0cd42ae068 100644 --- 
a/src/backend/cpu/platform.hpp +++ b/src/backend/cpu/platform.hpp @@ -11,9 +11,9 @@ #include -class async_queue; - namespace cpu { + class queue; + int getBackend(); std::string getInfo(); @@ -30,5 +30,5 @@ namespace cpu { void sync(int device); - async_queue& getQueue(int idx = 0); + queue& getQueue(int idx = 0); } diff --git a/src/backend/cpu/queue.hpp b/src/backend/cpu/queue.hpp new file mode 100644 index 0000000000..6e5cd71f33 --- /dev/null +++ b/src/backend/cpu/queue.hpp @@ -0,0 +1,42 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include + +#pragma once + +namespace cpu { + +/// Wraps the async_queue class +class queue { +public: + queue() + : sync_calls( getEnvVar("AF_SYNCHRONOUS_CALLS") == "1") {} + + template + void enqueue(const F func, Args... args) { + + if(sync_calls) { func( args... ); } + else { aQueue.enqueue( func, args... ); } + } + void sync() { + if(!sync_calls) aQueue.sync(); + } + + bool is_worker() const { + return (!sync_calls) ? 
aQueue.is_worker() : false; + } + +private: + const bool sync_calls; + async_queue aQueue; +}; + +} diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index bb8fca013c..ee7b86ff2c 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -178,6 +178,7 @@ FILE(GLOB c_headers ) FILE(GLOB c_sources + "../../util.cpp" "../../api/c/*.cpp" ) diff --git a/src/backend/cuda/debug_cuda.hpp b/src/backend/cuda/debug_cuda.hpp index 084d12f804..f5424950dc 100644 --- a/src/backend/cuda/debug_cuda.hpp +++ b/src/backend/cuda/debug_cuda.hpp @@ -51,8 +51,12 @@ #else -#define POST_LAUNCH_CHECK() do { \ - CUDA_CHECK(cudaPeekAtLastError()); \ - } while(0) \ +#define POST_LAUNCH_CHECK() do { \ + if(cuda::synchronize_calls()) { \ + CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId()))); \ + } else { \ + CUDA_CHECK(cudaPeekAtLastError()); \ + } \ + } while(0) \ #endif diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp index 76b336c5ad..a263bea2ca 100644 --- a/src/backend/cuda/platform.cpp +++ b/src/backend/cuda/platform.cpp @@ -21,6 +21,7 @@ #include #include #include +#include using namespace std; @@ -393,6 +394,11 @@ void sync(int device) setDevice(currDevice); } +bool synchronize_calls() { + static bool sync = getEnvVar("AF_SYNCHRONOUS_CALLS") == "1"; + return sync; +} + } af_err afcu_get_stream(cudaStream_t* stream, int id) diff --git a/src/backend/cuda/platform.hpp b/src/backend/cuda/platform.hpp index 7b649686dc..20862fb886 100644 --- a/src/backend/cuda/platform.hpp +++ b/src/backend/cuda/platform.hpp @@ -50,6 +50,9 @@ int setDevice(int device); void sync(int device); +// Returns true if the AF_SYNCHRONOUS_CALLS environment variable is set to 1 +bool synchronize_calls(); + cudaDeviceProp getDeviceProp(int device); struct cudaDevice_t { diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt index 86ba1b2aad..223752cc28 100644 --- 
a/src/backend/opencl/CMakeLists.txt +++ b/src/backend/opencl/CMakeLists.txt @@ -152,6 +152,7 @@ FILE(GLOB backend_headers ) FILE(GLOB backend_sources + "../../util.cpp" "../*.cpp" ) source_group(backend\\Headers FILES ${backend_headers}) diff --git a/src/backend/opencl/debug_opencl.hpp b/src/backend/opencl/debug_opencl.hpp index 74b3f7cf59..b4126f9abe 100644 --- a/src/backend/opencl/debug_opencl.hpp +++ b/src/backend/opencl/debug_opencl.hpp @@ -16,5 +16,10 @@ #include #define CL_DEBUG_FINISH(Q) Q.finish() #else -#define CL_DEBUG_FINISH(Q) +#define CL_DEBUG_FINISH(Q) \ + do { \ + if(synchronize_calls()) { \ + Q.finish(); \ + } \ + } while (false); #endif diff --git a/src/backend/opencl/kernel/convolve.hpp b/src/backend/opencl/kernel/convolve.hpp index 035f4c23aa..6d1d7de7ee 100644 --- a/src/backend/opencl/kernel/convolve.hpp +++ b/src/backend/opencl/kernel/convolve.hpp @@ -52,6 +52,7 @@ void convolve_nd(Param out, const Param signal, const Param filter, ConvolveBatc case 3: conv3(param, out, signal, filter); break; } + CL_DEBUG_FINISH(getQueue()); bufferFree(param.impulse); } diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 1301af9459..57726d2e87 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -41,6 +41,7 @@ #include #include #include +#include using std::string; using std::vector; @@ -556,6 +557,11 @@ void removeDeviceContext(cl_device_id dev, cl_context ctx) } } +bool synchronize_calls() { + static bool sync = getEnvVar("AF_SYNCHRONOUS_CALLS") == "1"; + return sync; +} + } using namespace opencl; diff --git a/src/backend/opencl/platform.hpp b/src/backend/opencl/platform.hpp index 154d84bc8e..84cb7b854c 100644 --- a/src/backend/opencl/platform.hpp +++ b/src/backend/opencl/platform.hpp @@ -115,4 +115,6 @@ void removeDeviceContext(cl_device_id dev, cl_context ctx); void sync(int device); +bool synchronize_calls(); + } diff --git a/src/util.cpp b/src/util.cpp new file mode 100644 index 
0000000000..5607292c0d --- /dev/null +++ b/src/util.cpp @@ -0,0 +1,40 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +/// This file contains platform independent utility functions +#include +#include + +#if defined(OS_WIN) +#include +typedef HMODULE LibHandle; +#else +#include +#endif + +using std::string; + +string getEnvVar(const std::string &key) +{ +#if defined(OS_WIN) + DWORD bufSize = 32767; // limit according to GetEnvironment Variable documentation + string retVal; + retVal.resize(bufSize); + bufSize = GetEnvironmentVariable(key.c_str(), &retVal[0], bufSize); + if (!bufSize) { + return string(""); + } else { + retVal.resize(bufSize); + return retVal; + } +#else + char * str = getenv(key.c_str()); + return str==NULL ? string("") : string(str); +#endif +} diff --git a/src/util.hpp b/src/util.hpp new file mode 100644 index 0000000000..e1cd85a69c --- /dev/null +++ b/src/util.hpp @@ -0,0 +1,16 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +/// This file contains platform independent utility functions + +#include + +#pragma once + +std::string getEnvVar(const std::string &key); From 6058dd283ea132cef41d834ef068afad3a719200 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Wed, 30 Dec 2015 10:32:00 -0500 Subject: [PATCH 110/288] Revert "Added ENQUEUE macro in cpu backend" This reverts commit 95d934613425559fa9048433bfe77bb8f151c18f. 
* Not necessary with the new queue class (see queue.hpp) * Macros bad --- src/backend/cpu/Array.cpp | 5 +++-- src/backend/cpu/approx.cpp | 19 ++++++++-------- src/backend/cpu/assign.cpp | 5 +++-- src/backend/cpu/bilateral.cpp | 5 +++-- src/backend/cpu/blas.cpp | 13 +++++------ src/backend/cpu/cholesky.cpp | 5 +++-- src/backend/cpu/convolve.cpp | 7 +++--- src/backend/cpu/copy.cpp | 9 ++++---- src/backend/cpu/debug_cpu.hpp | 31 --------------------------- src/backend/cpu/diagonal.cpp | 7 +++--- src/backend/cpu/diff.cpp | 7 +++--- src/backend/cpu/fast.cpp | 3 ++- src/backend/cpu/fft.cpp | 9 ++++---- src/backend/cpu/fftconvolve.cpp | 15 +++++++------ src/backend/cpu/gradient.cpp | 5 +++-- src/backend/cpu/harris.cpp | 13 +++++------ src/backend/cpu/hist_graphics.cpp | 3 ++- src/backend/cpu/histogram.cpp | 5 +++-- src/backend/cpu/homography.cpp | 3 ++- src/backend/cpu/hsv_rgb.cpp | 7 +++--- src/backend/cpu/identity.cpp | 5 +++-- src/backend/cpu/iir.cpp | 5 +++-- src/backend/cpu/image.cpp | 3 ++- src/backend/cpu/index.cpp | 5 +++-- src/backend/cpu/inverse.cpp | 5 +++-- src/backend/cpu/iota.cpp | 5 +++-- src/backend/cpu/ireduce.cpp | 5 +++-- src/backend/cpu/join.cpp | 25 ++++++++++----------- src/backend/cpu/lookup.cpp | 5 +++-- src/backend/cpu/lu.cpp | 9 ++++---- src/backend/cpu/match_template.cpp | 5 +++-- src/backend/cpu/meanshift.cpp | 5 +++-- src/backend/cpu/medfilt.cpp | 5 +++-- src/backend/cpu/memory.cpp | 3 ++- src/backend/cpu/morph.cpp | 7 +++--- src/backend/cpu/nearest_neighbour.cpp | 9 ++++---- src/backend/cpu/orb.cpp | 3 ++- src/backend/cpu/platform.cpp | 3 ++- src/backend/cpu/plot.cpp | 3 ++- src/backend/cpu/plot3.cpp | 3 ++- src/backend/cpu/qr.cpp | 7 +++--- src/backend/cpu/queue.hpp | 4 ++++ src/backend/cpu/random.cpp | 11 +++++----- src/backend/cpu/range.cpp | 11 +++++----- src/backend/cpu/reduce.cpp | 5 +++-- src/backend/cpu/regions.cpp | 5 +++-- src/backend/cpu/reorder.cpp | 5 +++-- src/backend/cpu/resize.cpp | 9 ++++---- src/backend/cpu/rotate.cpp | 9 
++++---- src/backend/cpu/scan.cpp | 11 +++++----- src/backend/cpu/select.cpp | 7 +++--- src/backend/cpu/set.cpp | 3 ++- src/backend/cpu/shift.cpp | 5 +++-- src/backend/cpu/sobel.cpp | 7 +++--- src/backend/cpu/solve.cpp | 11 +++++----- src/backend/cpu/sort.cpp | 5 +++-- src/backend/cpu/sort_by_key.cpp | 5 +++-- src/backend/cpu/sort_index.cpp | 5 +++-- src/backend/cpu/surface.cpp | 3 ++- src/backend/cpu/susan.cpp | 7 +++--- src/backend/cpu/svd.cpp | 5 +++-- src/backend/cpu/tile.cpp | 5 +++-- src/backend/cpu/transform.cpp | 9 ++++---- src/backend/cpu/transpose.cpp | 7 +++--- src/backend/cpu/triangle.cpp | 5 +++-- src/backend/cpu/unwrap.cpp | 7 +++--- src/backend/cpu/where.cpp | 3 ++- src/backend/cpu/wrap.cpp | 7 +++--- 68 files changed, 258 insertions(+), 219 deletions(-) delete mode 100644 src/backend/cpu/debug_cpu.hpp diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 34c99e4566..862c576afe 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include #include #include @@ -77,7 +78,7 @@ void Array::eval() data = std::shared_ptr(memAlloc(elements()), memFree); - ENQUEUE(kernel::evalArray, *this); + getQueue().enqueue(kernel::evalArray, *this); ready = true; Node_ptr prev = node; diff --git a/src/backend/cpu/approx.cpp b/src/backend/cpu/approx.cpp index 57d3cc4c45..b817b840b4 100644 --- a/src/backend/cpu/approx.cpp +++ b/src/backend/cpu/approx.cpp @@ -11,7 +11,8 @@ #include #include #include -#include +#include +#include namespace cpu { @@ -30,12 +31,12 @@ Array approx1(const Array &in, const Array &pos, switch(method) { case AF_INTERP_NEAREST: - ENQUEUE(kernel::approx1, - out, in, pos, offGrid); + getQueue().enqueue(kernel::approx1, + out, in, pos, offGrid); break; case AF_INTERP_LINEAR: - ENQUEUE(kernel::approx1, - out, in, pos, offGrid); + getQueue().enqueue(kernel::approx1, + out, in, pos, offGrid); break; default: break; @@ -60,12 +61,12 @@ Array 
approx2(const Array &in, const Array &pos0, const Array &p switch(method) { case AF_INTERP_NEAREST: - ENQUEUE(kernel::approx2, - out, in, pos0, pos1, offGrid); + getQueue().enqueue(kernel::approx2, + out, in, pos0, pos1, offGrid); break; case AF_INTERP_LINEAR: - ENQUEUE(kernel::approx2, - out, in, pos0, pos1, offGrid); + getQueue().enqueue(kernel::approx2, + out, in, pos0, pos1, offGrid); break; default: break; diff --git a/src/backend/cpu/assign.cpp b/src/backend/cpu/assign.cpp index df903449a0..463b30c733 100644 --- a/src/backend/cpu/assign.cpp +++ b/src/backend/cpu/assign.cpp @@ -14,7 +14,8 @@ #include #include #include -#include +#include +#include namespace cpu { @@ -47,7 +48,7 @@ void assign(Array& out, const af_index_t idxrs[], const Array& rhs) } } - ENQUEUE(kernel::assign, out, rhs, std::move(isSeq), + getQueue().enqueue(kernel::assign, out, rhs, std::move(isSeq), std::move(seqs), std::move(idxArrs)); } diff --git a/src/backend/cpu/bilateral.cpp b/src/backend/cpu/bilateral.cpp index ceb8be95d9..abd985768d 100644 --- a/src/backend/cpu/bilateral.cpp +++ b/src/backend/cpu/bilateral.cpp @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include using af::dim4; @@ -28,7 +29,7 @@ Array bilateral(const Array &in, const float &s_sigma, const fl in.eval(); const dim4 dims = in.dims(); Array out = createEmptyArray(dims); - ENQUEUE(kernel::bilateral, out, in, s_sigma, c_sigma); + getQueue().enqueue(kernel::bilateral, out, in, s_sigma, c_sigma); return out; } diff --git a/src/backend/cpu/blas.cpp b/src/backend/cpu/blas.cpp index 70c8d9ca77..3ecb502ffa 100644 --- a/src/backend/cpu/blas.cpp +++ b/src/backend/cpu/blas.cpp @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include namespace cpu { @@ -193,7 +194,7 @@ Array matmul(const Array &lhs, const Array &rhs, reinterpret_cast(output.get()), output.dims()[0]); } }; - ENQUEUE(func, out, lhs, rhs); + getQueue().enqueue(func, out, lhs, rhs); return out; } @@ -207,13 +208,13 @@ Array 
dot(const Array &lhs, const Array &rhs, Array out = createEmptyArray(af::dim4(1)); if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) { - ENQUEUE(kernel::dot, out, lhs, rhs, optLhs, optRhs); + getQueue().enqueue(kernel::dot, out, lhs, rhs, optLhs, optRhs); } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) { - ENQUEUE(kernel::dot,out, lhs, rhs, optLhs, optRhs); + getQueue().enqueue(kernel::dot,out, lhs, rhs, optLhs, optRhs); } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) { - ENQUEUE(kernel::dot,out, rhs, lhs, optRhs, optLhs); + getQueue().enqueue(kernel::dot,out, rhs, lhs, optRhs, optLhs); } else { - ENQUEUE(kernel::dot,out, lhs, rhs, optLhs, optRhs); + getQueue().enqueue(kernel::dot,out, lhs, rhs, optLhs, optRhs); } return out; } diff --git a/src/backend/cpu/cholesky.cpp b/src/backend/cpu/cholesky.cpp index b21d9c8fd0..5e393f0082 100644 --- a/src/backend/cpu/cholesky.cpp +++ b/src/backend/cpu/cholesky.cpp @@ -19,7 +19,8 @@ #include #include #include -#include +#include +#include namespace cpu { @@ -74,7 +75,7 @@ int cholesky_inplace(Array &in, const bool is_upper) info = potrf_func()(AF_LAPACK_COL_MAJOR, uplo, N, in.get(), in.strides()[1]); }; - ENQUEUE(func, info, in); + getQueue().enqueue(func, info, in); getQueue().sync(); return info; diff --git a/src/backend/cpu/convolve.cpp b/src/backend/cpu/convolve.cpp index cf241c3eaa..8218a3f9a3 100644 --- a/src/backend/cpu/convolve.cpp +++ b/src/backend/cpu/convolve.cpp @@ -14,7 +14,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -50,7 +51,7 @@ Array convolve(Array const& signal, Array const& filter, ConvolveBat Array out = createEmptyArray(oDims); - ENQUEUE(kernel::convolve_nd,out, signal, filter, kind); + getQueue().enqueue(kernel::convolve_nd,out, signal, filter, kind); return out; } @@ -80,7 +81,7 @@ Array convolve2(Array const& signal, Array const& c_filter, Array out = createEmptyArray(oDims); - ENQUEUE(kernel::convolve2, out, signal, c_filter, 
r_filter, tDims); + getQueue().enqueue(kernel::convolve2, out, signal, c_filter, r_filter, tDims); return out; } diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index 91a1513fd9..f844d959a2 100644 --- a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp @@ -18,7 +18,8 @@ #include #include #include -#include +#include +#include #include namespace cpu @@ -50,7 +51,7 @@ template void multiply_inplace(Array &in, double val) { in.eval(); - ENQUEUE(kernel::copy, in, in, 0, val); + getQueue().enqueue(kernel::copy, in, in, 0, val); } template @@ -60,7 +61,7 @@ Array padArray(Array const &in, dim4 const &dims, Array ret = createValueArray(dims, default_value); ret.eval(); in.eval(); - ENQUEUE(kernel::copy, ret, in, outType(default_value), factor); + getQueue().enqueue(kernel::copy, ret, in, outType(default_value), factor); return ret; } @@ -69,7 +70,7 @@ void copyArray(Array &out, Array const &in) { out.eval(); in.eval(); - ENQUEUE(kernel::copy, out, in, scalar(0), 1.0); + getQueue().enqueue(kernel::copy, out, in, scalar(0), 1.0); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/debug_cpu.hpp b/src/backend/cpu/debug_cpu.hpp deleted file mode 100644 index cbcdc2230a..0000000000 --- a/src/backend/cpu/debug_cpu.hpp +++ /dev/null @@ -1,31 +0,0 @@ -/******************************************************* - * Copyright (c) 2015, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#pragma once -#include -#include -#include - -#ifndef NDEBUG - -#define POST_LAUNCH_CHECK() do { \ - getQueue().sync(); \ - } while(0) \ - -#else - -#define POST_LAUNCH_CHECK() //no-op - -#endif - -#define ENQUEUE(...) 
\ - do { \ - getQueue().enqueue(__VA_ARGS__); \ - POST_LAUNCH_CHECK(); \ - } while(0) diff --git a/src/backend/cpu/diagonal.cpp b/src/backend/cpu/diagonal.cpp index 6fd918d66d..c818f82795 100644 --- a/src/backend/cpu/diagonal.cpp +++ b/src/backend/cpu/diagonal.cpp @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include #include namespace cpu @@ -30,7 +31,7 @@ Array diagCreate(const Array &in, const int num) int batch = in.dims()[1]; Array out = createEmptyArray(dim4(size, size, batch)); - ENQUEUE(kernel::diagCreate, out, in, num); + getQueue().enqueue(kernel::diagCreate, out, in, num); return out; } @@ -44,7 +45,7 @@ Array diagExtract(const Array &in, const int num) dim_t size = std::max(idims[0], idims[1]) - std::abs(num); Array out = createEmptyArray(dim4(size, 1, idims[2], idims[3])); - ENQUEUE(kernel::diagExtract, out, in, num); + getQueue().enqueue(kernel::diagExtract, out, in, num); return out; } diff --git a/src/backend/cpu/diff.cpp b/src/backend/cpu/diff.cpp index efab130cc6..1e374e95da 100644 --- a/src/backend/cpu/diff.cpp +++ b/src/backend/cpu/diff.cpp @@ -9,7 +9,8 @@ #include #include -#include +#include +#include #include namespace cpu @@ -26,7 +27,7 @@ Array diff1(const Array &in, const int dim) Array outArray = createEmptyArray(dims); - ENQUEUE(kernel::diff1, outArray, in, dim); + getQueue().enqueue(kernel::diff1, outArray, in, dim); return outArray; } @@ -42,7 +43,7 @@ Array diff2(const Array &in, const int dim) Array outArray = createEmptyArray(dims); - ENQUEUE(kernel::diff2, outArray, in, dim); + getQueue().enqueue(kernel::diff2, outArray, in, dim); return outArray; } diff --git a/src/backend/cpu/fast.cpp b/src/backend/cpu/fast.cpp index 1b3a7aa973..954f457cf4 100644 --- a/src/backend/cpu/fast.cpp +++ b/src/backend/cpu/fast.cpp @@ -14,7 +14,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; diff --git a/src/backend/cpu/fft.cpp b/src/backend/cpu/fft.cpp index 1282963003..3c1d10a4f3 100644 --- 
a/src/backend/cpu/fft.cpp +++ b/src/backend/cpu/fft.cpp @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include using af::dim4; @@ -26,7 +27,7 @@ template void fft_inplace(Array &in) { in.eval(); - ENQUEUE(kernel::fft_inplace, in); + getQueue().enqueue(kernel::fft_inplace, in); } template @@ -38,7 +39,7 @@ Array fft_r2c(const Array &in) odims[0] = odims[0] / 2 + 1; Array out = createEmptyArray(odims); - ENQUEUE(kernel::fft_r2c, out, in); + getQueue().enqueue(kernel::fft_r2c, out, in); return out; } @@ -49,7 +50,7 @@ Array fft_c2r(const Array &in, const dim4 &odims) in.eval(); Array out = createEmptyArray(odims); - ENQUEUE(kernel::fft_c2r, out, in, odims); + getQueue().enqueue(kernel::fft_c2r, out, in, odims); return out; } diff --git a/src/backend/cpu/fftconvolve.cpp b/src/backend/cpu/fftconvolve.cpp index aac66cdbe4..3b4b864452 100644 --- a/src/backend/cpu/fftconvolve.cpp +++ b/src/backend/cpu/fftconvolve.cpp @@ -17,7 +17,8 @@ #include #include #include -#include +#include +#include #include namespace cpu @@ -83,11 +84,11 @@ Array fftconvolve(Array const& signal, Array const& filter, // Pack signal in a complex matrix where first dimension is half the input // (allows faster FFT computation) and pad array to a power of 2 with 0s - ENQUEUE(kernel::packData, packed, sig_tmp_dims, sig_tmp_strides, signal); + getQueue().enqueue(kernel::packData, packed, sig_tmp_dims, sig_tmp_strides, signal); // Pad filter array with 0s const dim_t offset = sig_tmp_strides[3]*sig_tmp_dims[3]; - ENQUEUE(kernel::padArray, packed, filter_tmp_dims, filter_tmp_strides, + getQueue().enqueue(kernel::padArray, packed, filter_tmp_dims, filter_tmp_strides, filter, offset); dim4 fftDims(1, 1, 1, 1); @@ -137,10 +138,10 @@ Array fftconvolve(Array const& signal, Array const& filter, fftwf_destroy_plan(plan); } }; - ENQUEUE(upstream_dft, packed, fftDims); + getQueue().enqueue(upstream_dft, packed, fftDims); // Multiply filter and signal FFT arrays - 
ENQUEUE(kernel::complexMultiply, packed, + getQueue().enqueue(kernel::complexMultiply, packed, sig_tmp_dims, sig_tmp_strides, filter_tmp_dims, filter_tmp_strides, kind, offset); @@ -188,7 +189,7 @@ Array fftconvolve(Array const& signal, Array const& filter, fftwf_destroy_plan(plan); } }; - ENQUEUE(upstream_idft, packed, fftDims); + getQueue().enqueue(upstream_idft, packed, fftDims); // Compute output dimensions dim4 oDims(1); @@ -210,7 +211,7 @@ Array fftconvolve(Array const& signal, Array const& filter, Array out = createEmptyArray(oDims); - ENQUEUE(kernel::reorder, out, packed, filter, + getQueue().enqueue(kernel::reorder, out, packed, filter, sig_half_d0, fftScale, sig_tmp_dims, sig_tmp_strides, filter_tmp_dims, filter_tmp_strides, expand, kind); diff --git a/src/backend/cpu/gradient.cpp b/src/backend/cpu/gradient.cpp index 57776e5750..aa417f49e1 100644 --- a/src/backend/cpu/gradient.cpp +++ b/src/backend/cpu/gradient.cpp @@ -12,7 +12,8 @@ #include #include #include -#include +#include +#include #include namespace cpu @@ -25,7 +26,7 @@ void gradient(Array &grad0, Array &grad1, const Array &in) grad1.eval(); in.eval(); - ENQUEUE(kernel::gradient, grad0, grad1, in); + getQueue().enqueue(kernel::gradient, grad0, grad1, in); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/harris.cpp b/src/backend/cpu/harris.cpp index 07b9bed516..b5ea0ca20e 100644 --- a/src/backend/cpu/harris.cpp +++ b/src/backend/cpu/harris.cpp @@ -18,7 +18,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -52,14 +53,14 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out Array iy = createEmptyArray(idims); // Compute first order derivatives - ENQUEUE(gradient, iy, ix, in); + getQueue().enqueue(gradient, iy, ix, in); Array ixx = createEmptyArray(idims); Array ixy = createEmptyArray(idims); Array iyy = createEmptyArray(idims); // Compute second-order derivatives - ENQUEUE(kernel::second_order_deriv, ixx, ixy, iyy, in.elements(), ix, iy); + 
getQueue().enqueue(kernel::second_order_deriv, ixx, ixy, iyy, in.elements(), ix, iy); // Convolve second-order derivatives with proper window filter ixx = convolve2(ixx, filter, filter); @@ -70,7 +71,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out Array responses = createEmptyArray(dim4(in.elements())); - ENQUEUE(kernel::harris_responses, responses, idims[0], idims[1], + getQueue().enqueue(kernel::harris_responses, responses, idims[0], idims[1], ixx, ixy, iyy, k_thr, border_len); Array xCorners = createEmptyArray(dim4(corner_lim)); @@ -104,7 +105,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out resp_out = createEmptyArray(dim4(corners_out)); // Keep only the corners with higher Harris responses - ENQUEUE(kernel::keep_corners, x_out, y_out, resp_out, xCorners, yCorners, + getQueue().enqueue(kernel::keep_corners, x_out, y_out, resp_out, xCorners, yCorners, harris_sorted, harris_idx, corners_out); } else if (max_corners == 0 && corners_found < corner_lim) { x_out = createEmptyArray(dim4(corners_out)); @@ -119,7 +120,7 @@ unsigned harris(Array &x_out, Array &y_out, Array &resp_out memcpy(y_out.get(), y_crnrs.get(), corners_out * sizeof(float)); memcpy(outResponses.get(), inResponses.get(), corners_out * sizeof(float)); }; - ENQUEUE(copyFunc, x_out, y_out, resp_out, + getQueue().enqueue(copyFunc, x_out, y_out, resp_out, xCorners, yCorners, respCorners, corners_out); } else { x_out = xCorners; diff --git a/src/backend/cpu/hist_graphics.cpp b/src/backend/cpu/hist_graphics.cpp index c58f5c687e..ad7d69067d 100644 --- a/src/backend/cpu/hist_graphics.cpp +++ b/src/backend/cpu/hist_graphics.cpp @@ -11,7 +11,8 @@ #include #include -#include +#include +#include namespace cpu { diff --git a/src/backend/cpu/histogram.cpp b/src/backend/cpu/histogram.cpp index 2571f3e4d0..6aa60e59e4 100644 --- a/src/backend/cpu/histogram.cpp +++ b/src/backend/cpu/histogram.cpp @@ -12,7 +12,8 @@ #include #include #include -#include +#include +#include #include using 
af::dim4; @@ -31,7 +32,7 @@ Array histogram(const Array &in, Array out = createValueArray(outDims, outType(0)); out.eval(); - ENQUEUE(kernel::histogram, + getQueue().enqueue(kernel::histogram, out, in, nbins, minval, maxval); return out; diff --git a/src/backend/cpu/homography.cpp b/src/backend/cpu/homography.cpp index 147f5e8751..4d131cf695 100644 --- a/src/backend/cpu/homography.cpp +++ b/src/backend/cpu/homography.cpp @@ -18,7 +18,8 @@ #include #include #include -#include +#include +#include using af::dim4; diff --git a/src/backend/cpu/hsv_rgb.cpp b/src/backend/cpu/hsv_rgb.cpp index da5dbe0594..404491766c 100644 --- a/src/backend/cpu/hsv_rgb.cpp +++ b/src/backend/cpu/hsv_rgb.cpp @@ -11,7 +11,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -26,7 +27,7 @@ Array hsv2rgb(const Array& in) Array out = createEmptyArray(in.dims()); - ENQUEUE(kernel::hsv2rgb, out, in); + getQueue().enqueue(kernel::hsv2rgb, out, in); return out; } @@ -38,7 +39,7 @@ Array rgb2hsv(const Array& in) Array out = createEmptyArray(in.dims()); - ENQUEUE(kernel::rgb2hsv, out, in); + getQueue().enqueue(kernel::rgb2hsv, out, in); return out; } diff --git a/src/backend/cpu/identity.cpp b/src/backend/cpu/identity.cpp index 071bb04642..c5e11029fc 100644 --- a/src/backend/cpu/identity.cpp +++ b/src/backend/cpu/identity.cpp @@ -10,7 +10,8 @@ #include #include #include -#include +#include +#include #include namespace cpu @@ -21,7 +22,7 @@ Array identity(const dim4& dims) { Array out = createEmptyArray(dims); - ENQUEUE(kernel::identity, out); + getQueue().enqueue(kernel::identity, out); return out; } diff --git a/src/backend/cpu/iir.cpp b/src/backend/cpu/iir.cpp index cb390b3018..049212ad69 100644 --- a/src/backend/cpu/iir.cpp +++ b/src/backend/cpu/iir.cpp @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -41,7 +42,7 @@ Array iir(const Array &b, const Array &a, const Array &x) Array y = 
createEmptyArray(c.dims()); - ENQUEUE(kernel::iir, y, c, a); + getQueue().enqueue(kernel::iir, y, c, a); return y; } diff --git a/src/backend/cpu/image.cpp b/src/backend/cpu/image.cpp index d23ba80ba8..b71ba23c12 100644 --- a/src/backend/cpu/image.cpp +++ b/src/backend/cpu/image.cpp @@ -16,7 +16,8 @@ #include #include #include -#include +#include +#include using af::dim4; diff --git a/src/backend/cpu/index.cpp b/src/backend/cpu/index.cpp index 9c951ff0d3..a2cdac888f 100644 --- a/src/backend/cpu/index.cpp +++ b/src/backend/cpu/index.cpp @@ -14,7 +14,8 @@ #include #include #include -#include +#include +#include #include #include @@ -57,7 +58,7 @@ Array index(const Array& in, const af_index_t idxrs[]) Array out = createEmptyArray(oDims); - ENQUEUE(kernel::index, out, in, std::move(isSeq), std::move(seqs), std::move(idxArrs)); + getQueue().enqueue(kernel::index, out, in, std::move(isSeq), std::move(seqs), std::move(idxArrs)); return out; } diff --git a/src/backend/cpu/inverse.cpp b/src/backend/cpu/inverse.cpp index 71cc9fefca..ea7d7ee828 100644 --- a/src/backend/cpu/inverse.cpp +++ b/src/backend/cpu/inverse.cpp @@ -23,7 +23,8 @@ #include #include #include -#include +#include +#include namespace cpu { @@ -67,7 +68,7 @@ Array inverse(const Array &in) A.get(), A.strides()[1], pivot.get()); }; - ENQUEUE(func, A, pivot, M); + getQueue().enqueue(func, A, pivot, M); return A; } diff --git a/src/backend/cpu/iota.cpp b/src/backend/cpu/iota.cpp index 124ec5c48a..db19708b46 100644 --- a/src/backend/cpu/iota.cpp +++ b/src/backend/cpu/iota.cpp @@ -10,7 +10,8 @@ #include #include #include -#include +#include +#include #include using namespace std; @@ -25,7 +26,7 @@ Array iota(const dim4 &dims, const dim4 &tile_dims) Array out = createEmptyArray(outdims); - ENQUEUE(kernel::iota, out, dims, tile_dims); + getQueue().enqueue(kernel::iota, out, dims, tile_dims); return out; } diff --git a/src/backend/cpu/ireduce.cpp b/src/backend/cpu/ireduce.cpp index 9de4a781b3..a40fbdf958 100644 --- 
a/src/backend/cpu/ireduce.cpp +++ b/src/backend/cpu/ireduce.cpp @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -39,7 +40,7 @@ void ireduce(Array &out, Array &loc, const Array &in, const int dim) , kernel::ireduce_dim() , kernel::ireduce_dim()}; - ENQUEUE(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim); + getQueue().enqueue(ireduce_funcs[in.ndims() - 1], out, loc, 0, in, 0, dim); } template diff --git a/src/backend/cpu/join.cpp b/src/backend/cpu/join.cpp index 6c9ba8ff9b..0a5b99cd13 100644 --- a/src/backend/cpu/join.cpp +++ b/src/backend/cpu/join.cpp @@ -9,7 +9,8 @@ #include #include -#include +#include +#include #include namespace cpu @@ -37,7 +38,7 @@ Array join(const int dim, const Array &first, const Array &second) Array out = createEmptyArray(odims); - ENQUEUE(kernel::join, out, dim, first, second); + getQueue().enqueue(kernel::join, out, dim, first, second); return out; } @@ -71,34 +72,34 @@ Array join(const int dim, const std::vector> &inputs) switch(n_arrays) { case 1: - ENQUEUE(kernel::join, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 2: - ENQUEUE(kernel::join, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 3: - ENQUEUE(kernel::join, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 4: - ENQUEUE(kernel::join, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 5: - ENQUEUE(kernel::join, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 6: - ENQUEUE(kernel::join, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 7: - ENQUEUE(kernel::join, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 8: - ENQUEUE(kernel::join, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; case 9: - ENQUEUE(kernel::join, dim, out, inputs); + 
getQueue().enqueue(kernel::join, dim, out, inputs); break; case 10: - ENQUEUE(kernel::join, dim, out, inputs); + getQueue().enqueue(kernel::join, dim, out, inputs); break; } diff --git a/src/backend/cpu/lookup.cpp b/src/backend/cpu/lookup.cpp index 4cc5359002..1e09f4dd48 100644 --- a/src/backend/cpu/lookup.cpp +++ b/src/backend/cpu/lookup.cpp @@ -9,7 +9,8 @@ #include #include -#include +#include +#include #include namespace cpu @@ -29,7 +30,7 @@ Array lookup(const Array &input, const Array &indices, const Array out = createEmptyArray(oDims); - ENQUEUE(kernel::lookup, out, input, indices, dim); + getQueue().enqueue(kernel::lookup, out, input, indices, dim); return out; } diff --git a/src/backend/cpu/lu.cpp b/src/backend/cpu/lu.cpp index 551c9c98e2..93862f24c0 100644 --- a/src/backend/cpu/lu.cpp +++ b/src/backend/cpu/lu.cpp @@ -17,7 +17,8 @@ #include #include #include -#include +#include +#include #include namespace cpu @@ -58,7 +59,7 @@ void lu(Array &lower, Array &upper, Array &pivot, const Array &in) lower = createEmptyArray(ldims); upper = createEmptyArray(udims); - ENQUEUE(kernel::lu_split, lower, upper, in_copy); + getQueue().enqueue(kernel::lu_split, lower, upper, in_copy); } template @@ -73,11 +74,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) dim4 iDims = in.dims(); getrf_func()(AF_LAPACK_COL_MAJOR, iDims[0], iDims[1], in.get(), in.strides()[1], pivot.get()); }; - ENQUEUE(func, in, pivot); + getQueue().enqueue(func, in, pivot); if(convert_pivot) { Array p = range(dim4(iDims[0]), 0); - ENQUEUE(kernel::convertPivot, p, pivot); + getQueue().enqueue(kernel::convertPivot, p, pivot); return p; } else { return pivot; diff --git a/src/backend/cpu/match_template.cpp b/src/backend/cpu/match_template.cpp index 724b773638..58091a1f49 100644 --- a/src/backend/cpu/match_template.cpp +++ b/src/backend/cpu/match_template.cpp @@ -12,7 +12,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -28,7 +29,7 @@ Array 
match_template(const Array &sImg, const Array &tImg) Array out = createEmptyArray(sImg.dims()); - ENQUEUE(kernel::matchTemplate, out, sImg, tImg); + getQueue().enqueue(kernel::matchTemplate, out, sImg, tImg); return out; } diff --git a/src/backend/cpu/meanshift.cpp b/src/backend/cpu/meanshift.cpp index f4a0b29e86..b5bbf758a1 100644 --- a/src/backend/cpu/meanshift.cpp +++ b/src/backend/cpu/meanshift.cpp @@ -16,7 +16,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -32,7 +33,7 @@ Array meanshift(const Array &in, const float &s_sigma, const float &c_sig Array out = createEmptyArray(in.dims()); - ENQUEUE(kernel::meanShift, out, in, s_sigma, c_sigma, iter); + getQueue().enqueue(kernel::meanShift, out, in, s_sigma, c_sigma, iter); return out; } diff --git a/src/backend/cpu/medfilt.cpp b/src/backend/cpu/medfilt.cpp index 9e761c6cc0..8ae4e33921 100644 --- a/src/backend/cpu/medfilt.cpp +++ b/src/backend/cpu/medfilt.cpp @@ -12,7 +12,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -27,7 +28,7 @@ Array medfilt(const Array &in, dim_t w_len, dim_t w_wid) Array out = createEmptyArray(in.dims()); - ENQUEUE(kernel::medfilt, out, in, w_len, w_wid); + getQueue().enqueue(kernel::medfilt, out, in, w_len, w_wid); return out; } diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 79f2e57a0c..85ba4f27fb 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -14,7 +14,8 @@ #include #include #include -#include +#include +#include namespace cpu { diff --git a/src/backend/cpu/morph.cpp b/src/backend/cpu/morph.cpp index 337e8a9574..1ae4680b9d 100644 --- a/src/backend/cpu/morph.cpp +++ b/src/backend/cpu/morph.cpp @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -29,7 +30,7 @@ Array morph(const Array &in, const Array &mask) Array out = createEmptyArray(in.dims()); - ENQUEUE(kernel::morph, out, in, mask); + 
getQueue().enqueue(kernel::morph, out, in, mask); return out; } @@ -42,7 +43,7 @@ Array morph3d(const Array &in, const Array &mask) Array out = createEmptyArray(in.dims()); - ENQUEUE(kernel::morph3d, out, in, mask); + getQueue().enqueue(kernel::morph3d, out, in, mask); return out; } diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp index a3c2bb1ea9..f1daba7526 100644 --- a/src/backend/cpu/nearest_neighbour.cpp +++ b/src/backend/cpu/nearest_neighbour.cpp @@ -12,7 +12,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -42,13 +43,13 @@ void nearest_neighbour(Array& idx, Array& dist, switch(dist_type) { case AF_SAD: - ENQUEUE(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); + getQueue().enqueue(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); break; case AF_SSD: - ENQUEUE(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); + getQueue().enqueue(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); break; case AF_SHD: - ENQUEUE(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); + getQueue().enqueue(kernel::nearest_neighbour, idx, dist, query, train, dist_dim, n_dist); break; default: AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED); diff --git a/src/backend/cpu/orb.cpp b/src/backend/cpu/orb.cpp index 649619e143..8bbfd41932 100644 --- a/src/backend/cpu/orb.cpp +++ b/src/backend/cpu/orb.cpp @@ -18,7 +18,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; diff --git a/src/backend/cpu/platform.cpp b/src/backend/cpu/platform.cpp index 6ae63a919e..19942f0312 100644 --- a/src/backend/cpu/platform.cpp +++ b/src/backend/cpu/platform.cpp @@ -9,8 +9,9 @@ #include #include -#include +#include #include +#include #include #include #include diff --git a/src/backend/cpu/plot.cpp b/src/backend/cpu/plot.cpp index 8afdea288f..2ab69643c8 100644 --- 
a/src/backend/cpu/plot.cpp +++ b/src/backend/cpu/plot.cpp @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include using af::dim4; diff --git a/src/backend/cpu/plot3.cpp b/src/backend/cpu/plot3.cpp index c7beed69d6..515fe0336c 100644 --- a/src/backend/cpu/plot3.cpp +++ b/src/backend/cpu/plot3.cpp @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include using af::dim4; diff --git a/src/backend/cpu/qr.cpp b/src/backend/cpu/qr.cpp index ca04ec9c20..34a39f64b8 100644 --- a/src/backend/cpu/qr.cpp +++ b/src/backend/cpu/qr.cpp @@ -17,7 +17,8 @@ #include #include #include -#include +#include +#include namespace cpu { @@ -78,7 +79,7 @@ void qr(Array &q, Array &r, Array &t, const Array &in) gqr_func()(AF_LAPACK_COL_MAJOR, M, M, min(M, N), q.get(), q.strides()[1], t.get()); }; q.resetDims(dim4(M, M)); - ENQUEUE(func, q, t, M, N); + getQueue().enqueue(func, q, t, M, N); } template @@ -94,7 +95,7 @@ Array qr_inplace(Array &in) auto func = [=] (Array in, Array t, int M, int N) { geqrf_func()(AF_LAPACK_COL_MAJOR, M, N, in.get(), in.strides()[1], t.get()); }; - ENQUEUE(func, in, t, M, N); + getQueue().enqueue(func, in, t, M, N); return t; } diff --git a/src/backend/cpu/queue.hpp b/src/backend/cpu/queue.hpp index 6e5cd71f33..942ae259b1 100644 --- a/src/backend/cpu/queue.hpp +++ b/src/backend/cpu/queue.hpp @@ -25,6 +25,10 @@ class queue { if(sync_calls) { func( args... ); } else { aQueue.enqueue( func, args... 
); } +#ifndef NDEBUG + sync(); +#endif + } void sync() { if(!sync_calls) aQueue.sync(); diff --git a/src/backend/cpu/random.cpp b/src/backend/cpu/random.cpp index f49420f13d..89d86c3848 100644 --- a/src/backend/cpu/random.cpp +++ b/src/backend/cpu/random.cpp @@ -12,7 +12,8 @@ #include #include #include -#include +#include +#include #include namespace cpu @@ -22,7 +23,7 @@ template Array randu(const af::dim4 &dims) { Array outArray = createEmptyArray(dims); - ENQUEUE(kernel::randu, outArray); + getQueue().enqueue(kernel::randu, outArray); return outArray; } @@ -45,7 +46,7 @@ template Array randn(const af::dim4 &dims) { Array outArray = createEmptyArray(dims); - ENQUEUE(kernel::randn, outArray); + getQueue().enqueue(kernel::randn, outArray); return outArray; } @@ -80,7 +81,7 @@ Array randu(const af::dim4 &dims) outPtr[i] = gen() > 0.5; } }; - ENQUEUE(func, outArray); + getQueue().enqueue(func, outArray); return outArray; } @@ -92,7 +93,7 @@ void setSeed(const uintl seed) kernel::is_first = false; kernel::gen_seed = seed; }; - ENQUEUE(f, seed); + getQueue().enqueue(f, seed); } uintl getSeed() diff --git a/src/backend/cpu/range.cpp b/src/backend/cpu/range.cpp index 6be78d5d0e..e91ba1e241 100644 --- a/src/backend/cpu/range.cpp +++ b/src/backend/cpu/range.cpp @@ -14,7 +14,8 @@ #include #include #include -#include +#include +#include #include namespace cpu @@ -32,10 +33,10 @@ Array range(const dim4& dims, const int seq_dim) Array out = createEmptyArray(dims); switch(_seq_dim) { - case 0: ENQUEUE(kernel::range, out); break; - case 1: ENQUEUE(kernel::range, out); break; - case 2: ENQUEUE(kernel::range, out); break; - case 3: ENQUEUE(kernel::range, out); break; + case 0: getQueue().enqueue(kernel::range, out); break; + case 1: getQueue().enqueue(kernel::range, out); break; + case 2: getQueue().enqueue(kernel::range, out); break; + case 3: getQueue().enqueue(kernel::range, out); break; default : AF_ERROR("Invalid rep selection", AF_ERR_ARG); } diff --git 
a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp index 90ad1f9023..2d4d18e682 100644 --- a/src/backend/cpu/reduce.cpp +++ b/src/backend/cpu/reduce.cpp @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -55,7 +56,7 @@ Array reduce(const Array &in, const int dim, bool change_nan, double nan , kernel::reduce_dim() , kernel::reduce_dim()}; - ENQUEUE(reduce_funcs[in.ndims() - 1], out, 0, in, 0, dim, change_nan, nanval); + getQueue().enqueue(reduce_funcs[in.ndims() - 1], out, 0, in, 0, dim, change_nan, nanval); return out; } diff --git a/src/backend/cpu/regions.cpp b/src/backend/cpu/regions.cpp index eafc161ff5..2384dd3341 100644 --- a/src/backend/cpu/regions.cpp +++ b/src/backend/cpu/regions.cpp @@ -17,7 +17,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -33,7 +34,7 @@ Array regions(const Array &in, af_connectivity connectivity) Array out = createValueArray(in.dims(), (T)0); out.eval(); - ENQUEUE(kernel::regions, out, in, connectivity); + getQueue().enqueue(kernel::regions, out, in, connectivity); return out; } diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp index 237e5d687a..bd156585ee 100644 --- a/src/backend/cpu/reorder.cpp +++ b/src/backend/cpu/reorder.cpp @@ -9,7 +9,8 @@ #include #include -#include +#include +#include #include namespace cpu @@ -26,7 +27,7 @@ Array reorder(const Array &in, const af::dim4 &rdims) oDims[i] = iDims[rdims[i]]; Array out = createEmptyArray(oDims); - ENQUEUE(kernel::reorder, out, in, oDims, rdims); + getQueue().enqueue(kernel::reorder, out, in, oDims, rdims); return out; } diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/resize.cpp index d6349a9c0b..eaeb5d4e3d 100644 --- a/src/backend/cpu/resize.cpp +++ b/src/backend/cpu/resize.cpp @@ -12,7 +12,8 @@ #include #include #include -#include +#include +#include #include namespace cpu @@ -31,11 +32,11 @@ Array resize(const Array &in, const dim_t odim0, const 
dim_t odim1, switch(method) { case AF_INTERP_NEAREST: - ENQUEUE(kernel::resize, out, in); break; + getQueue().enqueue(kernel::resize, out, in); break; case AF_INTERP_BILINEAR: - ENQUEUE(kernel::resize, out, in); break; + getQueue().enqueue(kernel::resize, out, in); break; case AF_INTERP_LOWER: - ENQUEUE(kernel::resize, out, in); break; + getQueue().enqueue(kernel::resize, out, in); break; default: break; } return out; diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/rotate.cpp index 289f3697a0..0fb9b17674 100644 --- a/src/backend/cpu/rotate.cpp +++ b/src/backend/cpu/rotate.cpp @@ -9,7 +9,8 @@ #include #include -#include +#include +#include #include "transform_interp.hpp" #include @@ -26,13 +27,13 @@ Array rotate(const Array &in, const float theta, const af::dim4 &odims, switch(method) { case AF_INTERP_NEAREST: - ENQUEUE(kernel::rotate, out, in, theta); + getQueue().enqueue(kernel::rotate, out, in, theta); break; case AF_INTERP_BILINEAR: - ENQUEUE(kernel::rotate, out, in, theta); + getQueue().enqueue(kernel::rotate, out, in, theta); break; case AF_INTERP_LOWER: - ENQUEUE(kernel::rotate, out, in, theta); + getQueue().enqueue(kernel::rotate, out, in, theta); break; default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); diff --git a/src/backend/cpu/scan.cpp b/src/backend/cpu/scan.cpp index adeb3d23b7..08431f8baa 100644 --- a/src/backend/cpu/scan.cpp +++ b/src/backend/cpu/scan.cpp @@ -14,7 +14,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -33,19 +34,19 @@ Array scan(const Array& in, const int dim) switch (in.ndims()) { case 1: kernel::scan_dim func1; - ENQUEUE(func1, out, 0, in, 0, dim); + getQueue().enqueue(func1, out, 0, in, 0, dim); break; case 2: kernel::scan_dim func2; - ENQUEUE(func2, out, 0, in, 0, dim); + getQueue().enqueue(func2, out, 0, in, 0, dim); break; case 3: kernel::scan_dim func3; - ENQUEUE(func3, out, 0, in, 0, dim); + getQueue().enqueue(func3, out, 0, in, 0, dim); break; case 4: 
kernel::scan_dim func4; - ENQUEUE(func4, out, 0, in, 0, dim); + getQueue().enqueue(func4, out, 0, in, 0, dim); break; } diff --git a/src/backend/cpu/select.cpp b/src/backend/cpu/select.cpp index 4f845bc084..1545a81f46 100644 --- a/src/backend/cpu/select.cpp +++ b/src/backend/cpu/select.cpp @@ -10,7 +10,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -25,7 +26,7 @@ void select(Array &out, const Array &cond, const Array &a, const Arr cond.eval(); a.eval(); b.eval(); - ENQUEUE(kernel::select, out, cond, a, b); + getQueue().enqueue(kernel::select, out, cond, a, b); } template @@ -34,7 +35,7 @@ void select_scalar(Array &out, const Array &cond, const Array &a, co out.eval(); cond.eval(); a.eval(); - ENQUEUE(kernel::select_scalar, out, cond, a, b); + getQueue().enqueue(kernel::select_scalar, out, cond, a, b); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/set.cpp b/src/backend/cpu/set.cpp index 49ce186412..d6c2a611e0 100644 --- a/src/backend/cpu/set.cpp +++ b/src/backend/cpu/set.cpp @@ -18,7 +18,8 @@ #include #include #include -#include +#include +#include namespace cpu { diff --git a/src/backend/cpu/shift.cpp b/src/backend/cpu/shift.cpp index fd56e4ce2e..041f1ab8ba 100644 --- a/src/backend/cpu/shift.cpp +++ b/src/backend/cpu/shift.cpp @@ -9,7 +9,8 @@ #include #include -#include +#include +#include #include namespace cpu @@ -23,7 +24,7 @@ Array shift(const Array &in, const int sdims[4]) Array out = createEmptyArray(in.dims()); const af::dim4 temp(sdims[0], sdims[1], sdims[2], sdims[3]); - ENQUEUE(kernel::shift, out, in, temp); + getQueue().enqueue(kernel::shift, out, in, temp); return out; } diff --git a/src/backend/cpu/sobel.cpp b/src/backend/cpu/sobel.cpp index 86c7363c6d..5ece9bf65e 100644 --- a/src/backend/cpu/sobel.cpp +++ b/src/backend/cpu/sobel.cpp @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include #include using af::dim4; @@ -31,8 +32,8 @@ sobelDerivatives(const Array &img, const unsigned 
&ker_size) Array dx = createEmptyArray(img.dims()); Array dy = createEmptyArray(img.dims()); - ENQUEUE(kernel::derivative, dx, img); - ENQUEUE(kernel::derivative, dy, img); + getQueue().enqueue(kernel::derivative, dx, img); + getQueue().enqueue(kernel::derivative, dy, img); return std::make_pair(dx, dy); } diff --git a/src/backend/cpu/solve.cpp b/src/backend/cpu/solve.cpp index 5d1ec3bba3..48ea4de3c5 100644 --- a/src/backend/cpu/solve.cpp +++ b/src/backend/cpu/solve.cpp @@ -16,7 +16,8 @@ #include #include #include -#include +#include +#include namespace cpu { @@ -87,7 +88,7 @@ Array solveLU(const Array &A, const Array &pivot, N, NRHS, A.get(), A.strides()[1], pivot.get(), B.get(), B.strides()[1]); }; - ENQUEUE(func, A, B, pivot, N, NRHS); + getQueue().enqueue(func, A, B, pivot, N, NRHS); return B; } @@ -108,7 +109,7 @@ Array triangleSolve(const Array &A, const Array &b, const af_mat_prop o A.get(), A.strides()[1], B.get(), B.strides()[1]); }; - ENQUEUE(func, A, B, N, NRHS, options); + getQueue().enqueue(func, A, B, N, NRHS, options); return B; } @@ -138,7 +139,7 @@ Array solve(const Array &a, const Array &b, const af_mat_prop options) gesv_func()(AF_LAPACK_COL_MAJOR, N, K, A.get(), A.strides()[1], pivot.get(), B.get(), B.strides()[1]); }; - ENQUEUE(func, A, B, pivot, N, K); + getQueue().enqueue(func, A, B, pivot, N, K); } else { auto func = [=] (Array A, Array B, int M, int N, int K) { int sM = A.strides()[1]; @@ -150,7 +151,7 @@ Array solve(const Array &a, const Array &b, const af_mat_prop options) B.get(), max(sM, sN)); }; B.resetDims(dim4(N, K)); - ENQUEUE(func, A, B, M, N, K); + getQueue().enqueue(func, A, B, M, N, K); } return B; diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp index 104a3df2eb..bc6396b258 100644 --- a/src/backend/cpu/sort.cpp +++ b/src/backend/cpu/sort.cpp @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include #include namespace cpu @@ -26,7 +27,7 @@ Array sort(const Array &in, const unsigned dim) Array out 
= copyArray(in); switch(dim) { - case 0: ENQUEUE(kernel::sort0, out); break; + case 0: getQueue().enqueue(kernel::sort0, out); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } return out; diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp index c6832881d8..5a99257033 100644 --- a/src/backend/cpu/sort_by_key.cpp +++ b/src/backend/cpu/sort_by_key.cpp @@ -9,7 +9,8 @@ #include #include -#include +#include +#include #include namespace cpu @@ -28,7 +29,7 @@ void sort_by_key(Array &okey, Array &oval, oidx.eval(); switch(dim) { - case 0: ENQUEUE(kernel::sort0_by_key, + case 0: getQueue().enqueue(kernel::sort0_by_key, okey, oval, oidx, ikey, ival); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp index c8c6d6e08f..77860ede18 100644 --- a/src/backend/cpu/sort_index.cpp +++ b/src/backend/cpu/sort_index.cpp @@ -12,7 +12,8 @@ #include #include #include -#include +#include +#include #include namespace cpu @@ -26,7 +27,7 @@ void sort_index(Array &val, Array &idx, const Array &in, const uint val = createEmptyArray(in.dims()); idx = createEmptyArray(in.dims()); switch(dim) { - case 0: ENQUEUE(kernel::sort0_index, val, idx, in); break; + case 0: getQueue().enqueue(kernel::sort0_index, val, idx, in); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } } diff --git a/src/backend/cpu/surface.cpp b/src/backend/cpu/surface.cpp index 00d2b00c0f..24c945c20b 100644 --- a/src/backend/cpu/surface.cpp +++ b/src/backend/cpu/surface.cpp @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include using af::dim4; diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp index 4f1c327dd3..55a2357206 100644 --- a/src/backend/cpu/susan.cpp +++ b/src/backend/cpu/susan.cpp @@ -12,7 +12,8 @@ #include #include #include -#include +#include +#include #include using af::features; @@ -39,9 +40,9 @@ unsigned susan(Array &x_out, 
Array &y_out, Array &resp_out, auto corners_found= std::shared_ptr(memAlloc(1), memFree); corners_found.get()[0] = 0; - ENQUEUE(kernel::susan_responses, response, in, idims[0], idims[1], + getQueue().enqueue(kernel::susan_responses, response, in, idims[0], idims[1], radius, diff_thr, geom_thr, edge); - ENQUEUE(kernel::non_maximal, x_corners, y_corners, resp_corners, corners_found, + getQueue().enqueue(kernel::non_maximal, x_corners, y_corners, resp_corners, corners_found, idims[0], idims[1], response, edge, corner_lim); getQueue().sync(); diff --git a/src/backend/cpu/svd.cpp b/src/backend/cpu/svd.cpp index 3ce627c5f9..2ac58aab3f 100644 --- a/src/backend/cpu/svd.cpp +++ b/src/backend/cpu/svd.cpp @@ -15,7 +15,8 @@ #if defined(WITH_CPU_LINEAR_ALGEBRA) #include #include -#include +#include +#include namespace cpu { @@ -86,7 +87,7 @@ void svdInPlace(Array &s, Array &u, Array &vt, Array &in) s.get(), u.get(), u.strides()[1], vt.get(), vt.strides()[1], &superb[0]); #endif }; - ENQUEUE(func, s, u, vt, in); + getQueue().enqueue(func, s, u, vt, in); } template diff --git a/src/backend/cpu/tile.cpp b/src/backend/cpu/tile.cpp index 9237a79eb9..6526917d3a 100644 --- a/src/backend/cpu/tile.cpp +++ b/src/backend/cpu/tile.cpp @@ -9,7 +9,8 @@ #include #include -#include +#include +#include #include namespace cpu @@ -30,7 +31,7 @@ Array tile(const Array &in, const af::dim4 &tileDims) Array out = createEmptyArray(oDims); - ENQUEUE(kernel::tile, out, in); + getQueue().enqueue(kernel::tile, out, in); return out; } diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp index 5874e7abd0..fc7145854b 100644 --- a/src/backend/cpu/transform.cpp +++ b/src/backend/cpu/transform.cpp @@ -10,7 +10,8 @@ #include #include #include -#include +#include +#include #include "transform_interp.hpp" #include @@ -28,13 +29,13 @@ Array transform(const Array &in, const Array &transform, const af:: switch(method) { case AF_INTERP_NEAREST : - ENQUEUE(kernel::transform, out, in, transform, 
inverse); + getQueue().enqueue(kernel::transform, out, in, transform, inverse); break; case AF_INTERP_BILINEAR: - ENQUEUE(kernel::transform, out, in, transform, inverse); + getQueue().enqueue(kernel::transform, out, in, transform, inverse); break; case AF_INTERP_LOWER : - ENQUEUE(kernel::transform, out, in, transform, inverse); + getQueue().enqueue(kernel::transform, out, in, transform, inverse); break; default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); break; } diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp index c1d5d1d236..32663e1f94 100644 --- a/src/backend/cpu/transpose.cpp +++ b/src/backend/cpu/transpose.cpp @@ -12,7 +12,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -32,7 +33,7 @@ Array transpose(const Array &in, const bool conjugate) // create an array with first two dimensions swapped Array out = createEmptyArray(outDims); - ENQUEUE(kernel::transpose, out, in, conjugate); + getQueue().enqueue(kernel::transpose, out, in, conjugate); return out; } @@ -41,7 +42,7 @@ template void transpose_inplace(Array &in, const bool conjugate) { in.eval(); - ENQUEUE(kernel::transpose_inplace, in, conjugate); + getQueue().enqueue(kernel::transpose_inplace, in, conjugate); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp index fbc7f658d0..2a9553c83a 100644 --- a/src/backend/cpu/triangle.cpp +++ b/src/backend/cpu/triangle.cpp @@ -12,7 +12,8 @@ #include #include #include -#include +#include +#include #include namespace cpu @@ -21,7 +22,7 @@ namespace cpu template void triangle(Array &out, const Array &in) { - ENQUEUE(kernel::triangle, out, in); + getQueue().enqueue(kernel::triangle, out, in); } template diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp index d40acde555..1aa37a4762 100644 --- a/src/backend/cpu/unwrap.cpp +++ b/src/backend/cpu/unwrap.cpp @@ -11,7 +11,8 @@ #include #include #include -#include +#include 
+#include #include namespace cpu @@ -36,9 +37,9 @@ Array unwrap(const Array &in, const dim_t wx, const dim_t wy, Array outArray = createEmptyArray(odims); if (is_column) { - ENQUEUE(kernel::unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); + getQueue().enqueue(kernel::unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); } else { - ENQUEUE(kernel::unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); + getQueue().enqueue(kernel::unwrap_dim, outArray, in, wx, wy, sx, sy, px, py); } return outArray; diff --git a/src/backend/cpu/where.cpp b/src/backend/cpu/where.cpp index 734b768385..018cbdfc36 100644 --- a/src/backend/cpu/where.cpp +++ b/src/backend/cpu/where.cpp @@ -16,7 +16,8 @@ #include #include #include -#include +#include +#include using af::dim4; diff --git a/src/backend/cpu/wrap.cpp b/src/backend/cpu/wrap.cpp index 87de234d36..07487e0d68 100644 --- a/src/backend/cpu/wrap.cpp +++ b/src/backend/cpu/wrap.cpp @@ -11,7 +11,8 @@ #include #include #include -#include +#include +#include #include namespace cpu @@ -33,9 +34,9 @@ Array wrap(const Array &in, in.eval(); if (is_column) { - ENQUEUE(kernel::wrap_dim, out, in, wx, wy, sx, sy, px, py); + getQueue().enqueue(kernel::wrap_dim, out, in, wx, wy, sx, sy, px, py); } else { - ENQUEUE(kernel::wrap_dim, out, in, wx, wy, sx, sy, px, py); + getQueue().enqueue(kernel::wrap_dim, out, in, wx, wy, sx, sy, px, py); } return out; From 1dd21957148047897d387cb154c3856ad90a8d32 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 10:25:26 -0500 Subject: [PATCH 111/288] Cleanup util.cpp --- CMakeLists.txt | 1 - src/api/unified/CMakeLists.txt | 2 +- src/backend/cpu/CMakeLists.txt | 1 - src/backend/cuda/CMakeLists.txt | 1 - src/backend/opencl/CMakeLists.txt | 1 - src/{ => backend}/util.cpp | 25 +++++++++++-------------- src/{ => backend}/util.hpp | 0 7 files changed, 12 insertions(+), 19 deletions(-) rename src/{ => backend}/util.cpp (59%) rename src/{ => backend}/util.hpp (100%) diff --git a/CMakeLists.txt 
b/CMakeLists.txt index bc00e1542a..c79fbcaab0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -113,7 +113,6 @@ IF(BUILD_SIFT) ENDIF(BUILD_SIFT) INCLUDE_DIRECTORIES( - "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/src/backend" "${CMAKE_CURRENT_SOURCE_DIR}/src/api/c" diff --git a/src/api/unified/CMakeLists.txt b/src/api/unified/CMakeLists.txt index b6980d6bb3..21c9aebf97 100644 --- a/src/api/unified/CMakeLists.txt +++ b/src/api/unified/CMakeLists.txt @@ -15,12 +15,12 @@ FILE(GLOB cpp_sources SOURCE_GROUP(api\\cpp\\Sources FILES ${cpp_sources}) FILE(GLOB common_sources - "../../util.cpp" "../c/util.cpp" "../c/err_common.cpp" "../c/type_util.cpp" "../c/version.cpp" "../../backend/dim4.cpp" + "../../backend/util.cpp" ) SOURCE_GROUP(common FILES ${common_sources}) diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt index bf72a8a6fa..b0ab17a616 100644 --- a/src/backend/cpu/CMakeLists.txt +++ b/src/backend/cpu/CMakeLists.txt @@ -107,7 +107,6 @@ source_group(api\\c\\Headers FILES ${c_headers}) source_group(api\\c\\Sources FILES ${c_sources}) FILE(GLOB cpp_sources - "../../util.cpp" "../../api/cpp/*.cpp" ) diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index ee7b86ff2c..bb8fca013c 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -178,7 +178,6 @@ FILE(GLOB c_headers ) FILE(GLOB c_sources - "../../util.cpp" "../../api/c/*.cpp" ) diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt index 223752cc28..86ba1b2aad 100644 --- a/src/backend/opencl/CMakeLists.txt +++ b/src/backend/opencl/CMakeLists.txt @@ -152,7 +152,6 @@ FILE(GLOB backend_headers ) FILE(GLOB backend_sources - "../../util.cpp" "../*.cpp" ) source_group(backend\\Headers FILES ${backend_headers}) diff --git a/src/util.cpp b/src/backend/util.cpp similarity index 59% rename from src/util.cpp rename to src/backend/util.cpp index 
5607292c0d..7c4cd2e614 100644 --- a/src/util.cpp +++ b/src/backend/util.cpp @@ -13,9 +13,6 @@ #if defined(OS_WIN) #include -typedef HMODULE LibHandle; -#else -#include #endif using std::string; @@ -23,18 +20,18 @@ using std::string; string getEnvVar(const std::string &key) { #if defined(OS_WIN) - DWORD bufSize = 32767; // limit according to GetEnvironment Variable documentation - string retVal; - retVal.resize(bufSize); - bufSize = GetEnvironmentVariable(key.c_str(), &retVal[0], bufSize); - if (!bufSize) { - return string(""); - } else { + DWORD bufSize = 32767; // limit according to GetEnvironment Variable documentation + string retVal; retVal.resize(bufSize); - return retVal; - } + bufSize = GetEnvironmentVariable(key.c_str(), &retVal[0], bufSize); + if (!bufSize) { + return string(""); + } else { + retVal.resize(bufSize); + return retVal; + } #else - char * str = getenv(key.c_str()); - return str==NULL ? string("") : string(str); + char * str = getenv(key.c_str()); + return str==NULL ? 
string("") : string(str); #endif } diff --git a/src/util.hpp b/src/backend/util.hpp similarity index 100% rename from src/util.hpp rename to src/backend/util.hpp From e19a6bef84ae8c2740385167ea0f8aef2dda7c86 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 11:02:15 -0500 Subject: [PATCH 112/288] Using getEnvVar instead of getenv --- src/api/c/err_common.cpp | 7 ++++--- src/api/c/graphics_common.cpp | 5 +++-- src/backend/cuda/interopManager.cu | 9 +++++---- src/backend/cuda/memory.cpp | 8 +++++--- src/backend/cuda/platform.cpp | 5 +++-- src/backend/opencl/platform.cpp | 9 +++++---- src/backend/opencl/program.hpp | 5 +++-- 7 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/api/c/err_common.cpp b/src/api/c/err_common.cpp index 9ff731f79d..b9fa49221c 100644 --- a/src/api/c/err_common.cpp +++ b/src/api/c/err_common.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -155,9 +156,9 @@ static std::string global_err_string; void print_error(const string &msg) { - const char* perr = getenv("AF_PRINT_ERRORS"); - if(perr != nullptr) { - if(std::strncmp(perr, "0", 1) != 0) + std::string perr = getEnvVar("AF_PRINT_ERRORS"); + if(!perr.empty()) { + if(perr != "0") fprintf(stderr, "%s\n", msg.c_str()); } global_err_string = msg; diff --git a/src/api/c/graphics_common.cpp b/src/api/c/graphics_common.cpp index a4132b55dd..291bf84275 100644 --- a/src/api/c/graphics_common.cpp +++ b/src/api/c/graphics_common.cpp @@ -13,6 +13,7 @@ #include #include #include +#include using namespace std; @@ -145,8 +146,8 @@ fg::Window* ForgeManager::getMainWindow(const bool dontCreate) static fg::Window* wnd = NULL; // Define AF_DISABLE_GRAPHICS with any value to disable initialization - const char* noGraphicsENV = getenv("AF_DISABLE_GRAPHICS"); - if(!noGraphicsENV) { // If AF_DISABLE_GRAPHICS is not defined + std::string noGraphicsENV = getEnvVar("AF_DISABLE_GRAPHICS"); + if(!noGraphicsENV.empty()) { // If AF_DISABLE_GRAPHICS 
is not defined if (flag && !dontCreate) { wnd = new fg::Window(WIDTH, HEIGHT, "ArrayFire", NULL, true); CheckGL("End ForgeManager::getMainWindow"); diff --git a/src/backend/cuda/interopManager.cu b/src/backend/cuda/interopManager.cu index b492a5ee1d..a6e2fcf9bd 100644 --- a/src/backend/cuda/interopManager.cu +++ b/src/backend/cuda/interopManager.cu @@ -14,6 +14,7 @@ #include #include +#include #include namespace cuda @@ -36,10 +37,10 @@ InteropManager::~InteropManager() } } catch (AfError &ex) { - const char* perr = getenv("AF_PRINT_ERRORS"); - - if(perr && perr[0] != '0') { - fprintf(stderr, "%s\n", ex.what()); + std::string perr = getEnvVar("AF_PRINT_ERRORS"); + if(!perr.empty()) { + if(perr != "0") + fprintf(stderr, "%s\n", ex.what()); } } } diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index 9b3d731b4b..2632a0a3b4 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -123,9 +124,10 @@ namespace cuda } catch (AfError &ex) { - const char* perr = getenv("AF_PRINT_ERRORS"); - if(perr && perr[0] != '0') { - fprintf(stderr, "%s\n", ex.what()); + std::string perr = getEnvVar("AF_PRINT_ERRORS"); + if(!perr.empty()) { + if(perr != "0") + fprintf(stderr, "%s\n", ex.what()); } } } diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp index 6854535deb..f5f6599419 100644 --- a/src/backend/cuda/platform.cpp +++ b/src/backend/cuda/platform.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -348,8 +349,8 @@ DeviceManager::DeviceManager() for(int i = 0; i < (int)MAX_DEVICES; i++) streams[i] = (cudaStream_t)0; - const char* deviceENV = getenv("AF_CUDA_DEFAULT_DEVICE"); - if(!deviceENV) { + std::string deviceENV = getEnvVar("AF_CUDA_DEFAULT_DEVICE"); + if(deviceENV.empty()) { setActiveDevice(0, cuDevices[0].nativeId); } else { stringstream s(deviceENV); diff --git 
a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 57726d2e87..8d77e24cbd 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -153,8 +154,8 @@ DeviceManager::DeviceManager() } } - const char* deviceENV = getenv("AF_OPENCL_DEFAULT_DEVICE"); - if(deviceENV) { + std::string deviceENV = getEnvVar("AF_OPENCL_DEFAULT_DEVICE"); + if(!deviceENV.empty()) { std::stringstream s(deviceENV); int def_device = -1; s >> def_device; @@ -172,8 +173,8 @@ DeviceManager::DeviceManager() * OpenGL shared contexts whereever applicable */ #if defined(WITH_GRAPHICS) // Define AF_DISABLE_GRAPHICS with any value to disable initialization - const char* noGraphicsENV = getenv("AF_DISABLE_GRAPHICS"); - if(!noGraphicsENV) { // If AF_DISABLE_GRAPHICS is not defined + std::string noGraphicsENV = getEnvVar("AF_DISABLE_GRAPHICS"); + if(!noGraphicsENV.empty()) { // If AF_DISABLE_GRAPHICS is not defined try { int devCount = mDevices.size(); fg::Window* wHandle = graphics::ForgeManager::getInstance().getMainWindow(); diff --git a/src/backend/opencl/program.hpp b/src/backend/opencl/program.hpp index 1b76a75ce8..6a2af45131 100644 --- a/src/backend/opencl/program.hpp +++ b/src/backend/opencl/program.hpp @@ -9,6 +9,7 @@ #pragma once #include +#include #include #include @@ -35,8 +36,8 @@ using std::string; #if defined(NDEBUG) #define SHOW_BUILD_INFO(PROG) do { \ - const char *info = getenv("AF_OPENCL_SHOW_BUILD_INFO"); \ - if (info != nullptr && std::strncmp(info,"0", 1) != 0) { \ + std::string info = getEnvVar("AF_OPENCL_SHOW_BUILD_INFO"); \ + if (!info.empty() && info != "0") { \ SHOW_DEBUG_BUILD_INFO(prog); \ } \ } while(0) From b260abf1703e3adea569f477335ad4020e72e7da Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 11:12:24 -0500 Subject: [PATCH 113/288] Cleanup/improve backend test --- test/backend.cpp | 5 +++++ 1 file changed, 5 insertions(+) 
diff --git a/test/backend.cpp b/test/backend.cpp index 59b8fd5129..7b8dbddade 100644 --- a/test/backend.cpp +++ b/test/backend.cpp @@ -37,10 +37,15 @@ void backendTest() { int backends = af::getAvailableBackends(); + ASSERT_NE(backends, 0); + bool cpu = backends & AF_BACKEND_CPU; bool cuda = backends & AF_BACKEND_CUDA; bool opencl = backends & AF_BACKEND_OPENCL; + printf("\nRunning Default Backend...\n"); + testFunction(); + if(cpu) { printf("\nRunning CPU Backend...\n"); af::setBackend(AF_BACKEND_CPU); From de4851d06784984ece5c476ad84e4856da4b70c2 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 11:12:47 -0500 Subject: [PATCH 114/288] Not building info for unified. backend does the same as info --- test/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d341164e82..3b7b42c87e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -120,8 +120,8 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) FILE(GLOB FILES "*.cpp" "*.c") LIST(SORT FILES) # Tests execute in alphabetical order -# We only build info.cpp and backend.cpp for Unified backend -SET(UNIFIED_FILES "backend.cpp;info.cpp") +# We only build backend.cpp for Unified backend +SET(UNIFIED_FILES "backend.cpp") LIST(SORT UNIFIED_FILES) # Tests execute in alphabetical order # Next we build each example using every backend. 
From 4d06c748f98cde6840b4cbfdc52dcb50ac53f8d3 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 12:11:51 -0500 Subject: [PATCH 115/288] FEAT Added isImageIOAvailable function to check support --- docs/details/image.dox | 6 ++++++ include/af/image.h | 24 ++++++++++++++++++++++++ src/api/c/imageio2.cpp | 12 ++++++++++++ src/api/cpp/imageio.cpp | 7 +++++++ src/api/unified/image.cpp | 5 +++++ 5 files changed, 54 insertions(+) diff --git a/docs/details/image.dox b/docs/details/image.dox index 53ac7616fc..288e4f6b0f 100644 --- a/docs/details/image.dox +++ b/docs/details/image.dox @@ -430,6 +430,12 @@ Save an array to disk as an image Supported formats include JPG, PNG, PPM and other formats supported by freeimage +\defgroup imageio_func_available isImageIoAvailable +\ingroup imageio_mat + +Returns true if ArrayFire was compiled with ImageIO (FreeImage) support + + \defgroup imagemem_func_load loadImageMem \ingroup imageio_mat diff --git a/include/af/image.h b/include/af/image.h index f38bb41694..ad56cfc081 100644 --- a/include/af/image.h +++ b/include/af/image.h @@ -147,6 +147,16 @@ AFAPI array loadImageNative(const char* filename); AFAPI void saveImageNative(const char* filename, const array& in); #endif +#if AF_API_VERSION >= 33 +/** + Function to check if Image IO is available + + \returns true if ArrayFire was commpiled with ImageIO support, false otherwise. + \ingroup imageio_func_available +*/ +AFAPI bool isImageIOAvailable(); +#endif + /** C++ Interface for resizing an image to specified dimensions @@ -794,6 +804,20 @@ extern "C" { AFAPI af_err af_save_image_native(const char* filename, const af_array in); #endif +#if AF_API_VERSION >= 33 + /** + Function to check if Image IO is available + + \param[out] out is true if ArrayFire was commpiled with ImageIO support, + false otherwise. 
+ + \return \ref AF_SUCCESS if successful + + \ingroup imageio_func_available + */ + AFAPI af_err af_is_image_io_available(bool *out); +#endif + /** C Interface for resizing an image to specified dimensions diff --git a/src/api/c/imageio2.cpp b/src/api/c/imageio2.cpp index d50afefb92..adc4244953 100644 --- a/src/api/c/imageio2.cpp +++ b/src/api/c/imageio2.cpp @@ -373,6 +373,12 @@ af_err af_save_image_native(const char* filename, const af_array in) return AF_SUCCESS; } +af_err af_is_image_io_available(bool *out) +{ + *out = true; + return AF_SUCCESS; +} + #else // WITH_FREEIMAGE #include #include @@ -386,4 +392,10 @@ af_err af_save_image_native(const char* filename, const af_array in) { AF_RETURN_ERROR("ArrayFire compiled without Image IO (FreeImage) support", AF_ERR_NOT_CONFIGURED); } + +af_err af_is_image_io_available(bool *out) +{ + *out = false; + return AF_SUCCESS; +} #endif // WITH_FREEIMAGE diff --git a/src/api/cpp/imageio.cpp b/src/api/cpp/imageio.cpp index e70b26d1d2..75ef5fe9c4 100644 --- a/src/api/cpp/imageio.cpp +++ b/src/api/cpp/imageio.cpp @@ -68,4 +68,11 @@ void saveImageNative(const char* filename, const array& in) AF_THROW(af_save_image_native(filename, in.get())); } +bool isImageIOAvailable() +{ + bool out = false; + AF_THROW(af_is_image_io_available(&out)); + return out; +} + } diff --git a/src/api/unified/image.cpp b/src/api/unified/image.cpp index d0f9aa6200..7b1159516c 100644 --- a/src/api/unified/image.cpp +++ b/src/api/unified/image.cpp @@ -55,6 +55,11 @@ af_err af_save_image_native(const char* filename, const af_array in) return CALL(filename, in); } +af_err af_is_image_io_available(bool *out) +{ + return CALL(out); +} + af_err af_resize(af_array *out, const af_array in, const dim_t odim0, const dim_t odim1, const af_interp_type method) { CHECK_ARRAYS(in); From 1b85d6d1acf795193080593e970888a01f6d0e85 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 12:12:09 -0500 Subject: [PATCH 116/288] FEAT Added isLAPACKAvailable 
function to check support --- docs/details/lapack.dox | 8 ++++++++ include/af/lapack.h | 25 +++++++++++++++++++++++++ include/arrayfire.h | 2 ++ src/api/c/lu.cpp | 10 ++++++++++ src/api/cpp/lapack.cpp | 7 +++++++ src/api/unified/lapack.cpp | 5 +++++ src/backend/cpu/lu.cpp | 5 +++++ src/backend/cpu/lu.hpp | 2 ++ src/backend/cuda/lu.cu | 15 +++++++++++++++ src/backend/cuda/lu.hpp | 2 ++ src/backend/opencl/lu.cpp | 10 ++++++++++ src/backend/opencl/lu.hpp | 2 ++ 12 files changed, 93 insertions(+) diff --git a/docs/details/lapack.dox b/docs/details/lapack.dox index c0d8aae5b9..522dbe544f 100644 --- a/docs/details/lapack.dox +++ b/docs/details/lapack.dox @@ -287,5 +287,13 @@ This function can return the norm using various metrics based on the type paramt =============================================================================== +\defgroup lapack_helper_func_available isLAPACKAvailable + +\ingroup lapack_helper + +\brief Returns true is ArrayFire is compiled with LAPACK support + +=============================================================================== + @} */ diff --git a/include/af/lapack.h b/include/af/lapack.h index f1cf87ad82..bb54069550 100644 --- a/include/af/lapack.h +++ b/include/af/lapack.h @@ -237,6 +237,18 @@ namespace af */ AFAPI double norm(const array &in, const normType type=AF_NORM_EUCLID, const double p=1, const double q=1); + +#if AF_API_VERSION >= 33 + /** + Returns true is ArrayFire is compiled with LAPACK support + + \returns true is LAPACK support is available, false otherwise + + \ingroup lapack_ops_func_norm + */ + AFAPI bool isLAPACKAvailable(); +#endif + } #endif @@ -425,6 +437,19 @@ extern "C" { */ AFAPI af_err af_norm(double *out, const af_array in, const af_norm_type type, const double p, const double q); +#if AF_API_VERSION >= 33 + /** + Returns true is ArrayFire is compiled with LAPACK support + + \param[out] out is true if LAPACK support is available, false otherwise + + \returns AF_SUCCESS if successful (does not depend on the 
value of out) + + \ingroup lapack_ops_func_norm + */ + AFAPI af_err af_is_lapack_available(bool *out); +#endif + #ifdef __cplusplus } diff --git a/include/arrayfire.h b/include/arrayfire.h index 7d9e75a7b4..73b417b3ad 100644 --- a/include/arrayfire.h +++ b/include/arrayfire.h @@ -113,6 +113,8 @@ @defgroup lapack_ops_mat Matrix operations inverse, det, rank, norm etc. + + @defgroup lapack_helper LAPACK Helper functions @} @defgroup image_mat Image Processing diff --git a/src/api/c/lu.cpp b/src/api/c/lu.cpp index c6004bc6cf..1d98e02490 100644 --- a/src/api/c/lu.cpp +++ b/src/api/c/lu.cpp @@ -95,3 +95,13 @@ af_err af_lu_inplace(af_array *pivot, af_array in, const bool is_lapack_piv) return AF_SUCCESS; } + +af_err af_is_lapack_available(bool *out) +{ + try { + *out = isLAPACKAvailable(); + } + CATCHALL; + + return AF_SUCCESS; +} diff --git a/src/api/cpp/lapack.cpp b/src/api/cpp/lapack.cpp index cf9b3ecfd2..091c807612 100644 --- a/src/api/cpp/lapack.cpp +++ b/src/api/cpp/lapack.cpp @@ -153,4 +153,11 @@ namespace af AF_THROW(af_norm(&out, in.get(), type, p, q)); return out; } + + bool isLAPACKAvailable() + { + bool out = false; + AF_THROW(af_is_lapack_available(&out)); + return out; + } } diff --git a/src/api/unified/lapack.cpp b/src/api/unified/lapack.cpp index b2364ac858..8a367017cf 100644 --- a/src/api/unified/lapack.cpp +++ b/src/api/unified/lapack.cpp @@ -96,3 +96,8 @@ af_err af_norm(double *out, const af_array in, const af_norm_type type, const do CHECK_ARRAYS(in); return CALL(out, in, type, p, q); } + +af_err af_is_lapack_available(bool *out) +{ + return CALL(out); +} diff --git a/src/backend/cpu/lu.cpp b/src/backend/cpu/lu.cpp index 93862f24c0..f8fc92de8d 100644 --- a/src/backend/cpu/lu.cpp +++ b/src/backend/cpu/lu.cpp @@ -85,6 +85,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) } } +bool isLAPACKAvailable() +{ + return true; +} + } #else diff --git a/src/backend/cpu/lu.hpp b/src/backend/cpu/lu.hpp index c25dcaaa16..3fef461067 100644 --- 
a/src/backend/cpu/lu.hpp +++ b/src/backend/cpu/lu.hpp @@ -17,4 +17,6 @@ namespace cpu template Array lu_inplace(Array &in, const bool convert_pivot = true); + + bool isLAPACKAvailable(); } diff --git a/src/backend/cuda/lu.cu b/src/backend/cuda/lu.cu index 2a45d4b9f5..ce0b545a84 100644 --- a/src/backend/cuda/lu.cu +++ b/src/backend/cuda/lu.cu @@ -156,6 +156,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) return pivot; } +bool isLAPACKAvailable() +{ + return true; +} + #define INSTANTIATE_LU(T) \ template Array lu_inplace(Array &in, const bool convert_pivot); \ template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); @@ -186,6 +191,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) return cpu::lu_inplace(in, convert_pivot); } +bool isLAPACKAvailable() +{ + return true; +} + #define INSTANTIATE_LU(T) \ template Array lu_inplace(Array &in, const bool convert_pivot); \ template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); @@ -213,6 +223,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) AF_ERR_NOT_CONFIGURED); } +bool isLAPACKAvailable() +{ + return false; +} + #define INSTANTIATE_LU(T) \ template Array lu_inplace(Array &in, const bool convert_pivot); \ template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); diff --git a/src/backend/cuda/lu.hpp b/src/backend/cuda/lu.hpp index 0753129d6b..acf9dbaad7 100644 --- a/src/backend/cuda/lu.hpp +++ b/src/backend/cuda/lu.hpp @@ -17,4 +17,6 @@ namespace cuda template Array lu_inplace(Array &in, const bool convert_pivot = true); + + bool isLAPACKAvailable(); } diff --git a/src/backend/opencl/lu.cpp b/src/backend/opencl/lu.cpp index ee76f47201..2d94d4d326 100644 --- a/src/backend/opencl/lu.cpp +++ b/src/backend/opencl/lu.cpp @@ -88,6 +88,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) } } +bool isLAPACKAvailable() +{ + return true; +} + #define INSTANTIATE_LU(T) \ template Array lu_inplace(Array &in, const bool convert_pivot); \ 
template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); @@ -116,6 +121,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) AF_ERROR("Linear Algebra is disabled on OpenCL", AF_ERR_NOT_CONFIGURED); } +bool isLAPACKAvailable() +{ + return false; +} + #define INSTANTIATE_LU(T) \ template Array lu_inplace(Array &in, const bool convert_pivot); \ template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); diff --git a/src/backend/opencl/lu.hpp b/src/backend/opencl/lu.hpp index af43f24614..b44eca8c60 100644 --- a/src/backend/opencl/lu.hpp +++ b/src/backend/opencl/lu.hpp @@ -17,4 +17,6 @@ namespace opencl template Array lu_inplace(Array &in, const bool convert_pivot = true); + + bool isLAPACKAvailable(); } From 7747ee6bf0ba1719a21faf86e142201d92b75b5f Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 12:31:33 -0500 Subject: [PATCH 117/288] Use isImageIOAvailable in testHelper --- test/testHelpers.hpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/test/testHelpers.hpp b/test/testHelpers.hpp index 758bf98e14..0b22cef283 100644 --- a/test/testHelpers.hpp +++ b/test/testHelpers.hpp @@ -390,15 +390,9 @@ bool noDoubleTests() bool noImageIOTests() { - af_array arr = 0; - const af_err err = af_load_image(&arr, TEST_DIR"/imageio/color_small.png", true); - - if(arr != 0) af_release_array(arr); - - if(err == AF_ERR_NOT_CONFIGURED) - return true; // Yes, disable test - else - return false; // No, let test continue + bool ret = !af::isImageIOAvailable(); + if(ret) printf("Image IO Not Configured. 
Test will exit\n"); + return ret; } bool noLAPACKTests() From fe3fa66c5cdd70cca6c53c17454402d03d057a1d Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 12:48:45 -0500 Subject: [PATCH 118/288] Use isLAPACKAvailable in testHelper --- test/testHelpers.hpp | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/test/testHelpers.hpp b/test/testHelpers.hpp index 0b22cef283..2744a8d67e 100644 --- a/test/testHelpers.hpp +++ b/test/testHelpers.hpp @@ -397,22 +397,9 @@ bool noImageIOTests() bool noLAPACKTests() { - // Run LU - af::dim4 dims(5, 5); - af_array in = 0, l = 0, u = 0, p= 0; - af_randu(&in, dims.ndims(), dims.get(), (af_dtype) af::dtype_traits::af_type); - - af_err err = af_lu(&l, &u, &p, in); - - if(in != 0) af_release_array(in); - if(l != 0) af_release_array(l); - if(u != 0) af_release_array(u); - if(p != 0) af_release_array(p); - - if(err == AF_ERR_NOT_CONFIGURED) - return true; // Yes, disable test - else - return false; // No, let test continue + bool ret = !af::isLAPACKAvailable(); + if(ret) printf("LAPACK Not Configured. 
Test will exit\n"); + return ret; } // TODO: perform conversion on device for CUDA and OpenCL From b89ab5dba7487e0b30191371d7111845fa75cdde Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 13:54:04 -0500 Subject: [PATCH 119/288] Add missing af_err to string --- src/api/c/err_common.cpp | 43 ++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/api/c/err_common.cpp b/src/api/c/err_common.cpp index b9fa49221c..886e43ba6f 100644 --- a/src/api/c/err_common.cpp +++ b/src/api/c/err_common.cpp @@ -182,26 +182,31 @@ void af_get_last_error(char **str, dim_t *len) const char *af_err_to_string(const af_err err) { switch (err) { - case AF_SUCCESS: return "Success"; - case AF_ERR_INTERNAL: return "Internal error"; - case AF_ERR_NO_MEM: return "Device out of memory"; - case AF_ERR_DRIVER: return "Driver not available or incompatible"; - case AF_ERR_RUNTIME: return "Runtime error "; - case AF_ERR_INVALID_ARRAY: return "Invalid array"; - case AF_ERR_ARG: return "Invalid input argument"; - case AF_ERR_SIZE: return "Invalid input size"; - case AF_ERR_DIFF_TYPE: return "Input types are not the same"; - case AF_ERR_NOT_SUPPORTED: return "Function not supported"; - case AF_ERR_NOT_CONFIGURED: return "Function not configured to build"; - case AF_ERR_TYPE: return "Function does not support this data type"; - case AF_ERR_NO_DBL: return "Double precision not supported for this device"; - case AF_ERR_LOAD_LIB: return "Failed to load dynamic library. 
See http://www.arrayfire.com/docs/unifiedbackend.htm for instructions to set up environment for Unified backend"; - case AF_ERR_LOAD_SYM: return "Failed to load symbol"; - case AF_ERR_ARR_BKND_MISMATCH : - return "There was a mismatch between an array and the current backend"; + case AF_SUCCESS: return "Success"; + case AF_ERR_NO_MEM: return "Device out of memory"; + case AF_ERR_DRIVER: return "Driver not available or incompatible"; + case AF_ERR_RUNTIME: return "Runtime error "; + case AF_ERR_INVALID_ARRAY: return "Invalid array"; + case AF_ERR_ARG: return "Invalid input argument"; + case AF_ERR_SIZE: return "Invalid input size"; + case AF_ERR_TYPE: return "Function does not support this data type"; + case AF_ERR_DIFF_TYPE: return "Input types are not the same"; + case AF_ERR_BATCH: return "Invalid batch configuration"; + case AF_ERR_NOT_SUPPORTED: return "Function not supported"; + case AF_ERR_NOT_CONFIGURED: return "Function not configured to build"; + case AF_ERR_NONFREE: return "Function unavailable." + "ArrayFire compiled without Non-Free algorithms support"; + case AF_ERR_NO_DBL: return "Double precision not supported for this device"; + case AF_ERR_NO_GFX: return "Graphics functionality unavailable." + "ArrayFire compiled without Graphics support"; + case AF_ERR_LOAD_LIB: return "Failed to load dynamic library." 
+ "See http://www.arrayfire.com/docs/unifiedbackend.htm" + "for instructions to set up environment for Unified backend"; + case AF_ERR_LOAD_SYM: return "Failed to load symbol"; + case AF_ERR_ARR_BKND_MISMATCH: return "There was a mismatch between an array and the current backend"; + case AF_ERR_INTERNAL: return "Internal error"; case AF_ERR_UNKNOWN: - default: - return "Unknown error"; + default: return "Unknown error"; } } From 8813a2eff1f75f056e8dc6865595100f1fd9d16a Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 16:05:12 -0500 Subject: [PATCH 120/288] af_get_last_error supports NULL as valid argument for len --- src/api/c/err_common.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/api/c/err_common.cpp b/src/api/c/err_common.cpp index 886e43ba6f..382dac1af1 100644 --- a/src/api/c/err_common.cpp +++ b/src/api/c/err_common.cpp @@ -166,17 +166,21 @@ print_error(const string &msg) void af_get_last_error(char **str, dim_t *len) { - *len = std::min(MAX_ERR_SIZE, (int)global_err_string.size()); + dim_t slen = std::min(MAX_ERR_SIZE, (int)global_err_string.size()); - if (*len == 0) { + if (len && slen == 0) { + *len = 0; *str = NULL; + return; } - af_alloc_host((void**)str, sizeof(char) * (*len + 1)); - global_err_string.copy(*str, *len); + af_alloc_host((void**)str, sizeof(char) * (slen + 1)); + global_err_string.copy(*str, slen); - (*str)[*len] = '\0'; + (*str)[slen] = '\0'; global_err_string = std::string(""); + + if(len) *len = slen; } const char *af_err_to_string(const af_err err) From b7af25a1b7a5b61b9ba7a3aacaac084a24867d93 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 16:06:15 -0500 Subject: [PATCH 121/288] Improvements to af::exception messages * Now prints enum string * Prints functions * Prints last error --- include/af/exception.h | 3 +++ src/api/cpp/error.hpp | 12 +++++++++--- src/api/cpp/exception.cpp | 16 ++++++++++++---- 3 files changed, 24 insertions(+), 7 
deletions(-) diff --git a/include/af/exception.h b/include/af/exception.h index ee10c5db7b..a43d26dbaa 100644 --- a/include/af/exception.h +++ b/include/af/exception.h @@ -27,6 +27,9 @@ class AFAPI exception : public std::exception exception(const char *msg); exception(const char *file, unsigned line, af_err err); exception(const char *msg, const char *file, unsigned line, af_err err); +#if AF_API_VERSION >= 33 + exception(const char *msg, const char *func, const char *file, unsigned line, af_err err); +#endif virtual ~exception() throw() {} virtual const char *what() const throw() { return m_msg; } friend inline std::ostream& operator<<(std::ostream &s, const exception &e) diff --git a/src/api/cpp/error.hpp b/src/api/cpp/error.hpp index 157f8193ab..c888db8646 100644 --- a/src/api/cpp/error.hpp +++ b/src/api/cpp/error.hpp @@ -8,14 +8,20 @@ ********************************************************/ #include +#include #include #define AF_THROW(fn) do { \ af_err __err = fn; \ if (__err == AF_SUCCESS) break; \ - throw af::exception(__AF_FILENAME__, __LINE__, __err); \ + char *msg = NULL; af_get_last_error(&msg, NULL);\ + af::exception ex(msg, __PRETTY_FUNCTION__, \ + __AF_FILENAME__, __LINE__, __err); \ + af_free_host(msg); \ + throw ex; \ } while(0) -#define AF_THROW_ERR(__msg, __err) do { \ - throw af::exception(__msg, __AF_FILENAME__, __LINE__, __err); \ +#define AF_THROW_ERR(__msg, __err) do { \ + throw af::exception(__msg, __PRETTY_FUNCTION__, \ + __AF_FILENAME__, __LINE__, __err); \ } while(0) diff --git a/src/api/cpp/exception.cpp b/src/api/cpp/exception.cpp index 373ae29c55..f88f98b0f2 100644 --- a/src/api/cpp/exception.cpp +++ b/src/api/cpp/exception.cpp @@ -32,8 +32,8 @@ exception::exception(const char *msg): m_err(AF_ERR_UNKNOWN) exception::exception(const char *file, unsigned line, af_err err): m_err(err) { snprintf(m_msg, sizeof(m_msg) - 1, - "ArrayFire Exception(%d): %s\nIn %s:%u", - (int)err, af_err_to_string(err), file, line); + "ArrayFire Exception 
(%s:%d):\nIn %s:%u", + af_err_to_string(err), (int)err, file, line); m_msg[sizeof(m_msg)-1] = '\0'; } @@ -41,11 +41,19 @@ exception::exception(const char *file, unsigned line, af_err err): m_err(err) exception::exception(const char *msg, const char *file, unsigned line, af_err err): m_err(err) { snprintf(m_msg, sizeof(m_msg) - 1, - "ArrayFire Exception(%d): %s\nIn %s:%u", - (int)(err), msg, file, line); + "ArrayFire Exception (%s:%d):\n%s\nIn %s:%u", + af_err_to_string(err), (int)(err), msg, file, line); m_msg[sizeof(m_msg)-1] = '\0'; } +exception::exception(const char *msg, const char *func, const char *file, unsigned line, af_err err): m_err(err) +{ + snprintf(m_msg, sizeof(m_msg) - 1, + "ArrayFire Exception (%s:%d):\n%s\nIn function %s\nIn file %s:%u", + af_err_to_string(err), (int)(err), msg, func, file, line); + + m_msg[sizeof(m_msg)-1] = '\0'; +} } From 7b6eee1385bc9b275e7eb3518ff454ac6faf825c Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 18:17:14 -0500 Subject: [PATCH 122/288] Add version guards around allocHost and freeHost --- include/af/device.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/af/device.h b/include/af/device.h index d3585c619c..ff33b3327f 100644 --- a/include/af/device.h +++ b/include/af/device.h @@ -152,6 +152,7 @@ namespace af /// \param[in] ptr the memory to free AFAPI void freePinned(const void *ptr); +#if AF_API_VERSION >= 33 /// \brief Allocate memory on host /// /// \copydoc device_func_alloc_host @@ -162,7 +163,9 @@ namespace af /// /// \ingroup device_func_alloc_host AFAPI void *allocHost(const size_t elements, const dtype type); +#endif +#if AF_API_VERSION >= 33 /// \brief Allocate memory on host /// /// \copydoc device_func_alloc_host @@ -176,7 +179,9 @@ namespace af /// \ingroup device_func_alloc_host template AFAPI T* allocHost(const size_t elements); +#endif +#if AF_API_VERSION >= 33 /// \brief Free memory allocated internally by ArrayFire // /// \copydoc device_func_free_host @@ 
-185,6 +190,7 @@ namespace af /// /// \ingroup device_func_free_host AFAPI void freeHost(const void *ptr); +#endif /// \ingroup device_func_mem /// @{ @@ -291,15 +297,19 @@ extern "C" { */ AFAPI af_err af_free_pinned(void *ptr); +#if AF_API_VERSION >= 33 /** \ingroup device_func_alloc_host */ AFAPI af_err af_alloc_host(void **ptr, const dim_t bytes); +#endif +#if AF_API_VERSION >= 33 /** \ingroup device_func_free_host */ AFAPI af_err af_free_host(void *ptr); +#endif /** Create array from device memory From 960574050303c363ef26be8c8a7c05c15974904f Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 18:19:08 -0500 Subject: [PATCH 123/288] Deprecate af_(lock/unlock)_device_ptr. Use af_(lock/unlock)_array --- include/af/device.h | 28 ++++++++++++++++- src/api/c/device.cpp | 62 ++++++++++++++++++++++---------------- src/api/cpp/array.cpp | 4 +-- src/api/unified/device.cpp | 12 ++++++++ 4 files changed, 77 insertions(+), 29 deletions(-) diff --git a/include/af/device.h b/include/af/device.h index ff33b3327f..394170c624 100644 --- a/include/af/device.h +++ b/include/af/device.h @@ -346,9 +346,12 @@ extern "C" { /** Lock the device buffer in the memory manager. - Locked buffers are not freed by memory manager until \ref af_unlock_device_ptr is called. + Locked buffers are not freed by memory manager until \ref af_unlock_array is called. \ingroup device_func_mem */ +#if AF_API_VERSION >= 33 + DEPRECATED("Use af_lock_array instead") +#endif AFAPI af_err af_lock_device_ptr(const af_array arr); #endif @@ -359,9 +362,32 @@ extern "C" { This function will give back the control over the device pointer to the memory manager. \ingroup device_func_mem */ +#if AF_API_VERSION >= 33 + DEPRECATED("Use af_unlock_array instead") +#endif AFAPI af_err af_unlock_device_ptr(const af_array arr); #endif +#if AF_API_VERSION >= 33 + /** + Lock the device buffer in the memory manager. + + Locked buffers are not freed by memory manager until \ref af_unlock_array is called. 
+ \ingroup device_func_mem + */ + AFAPI af_err af_lock_array(const af_array arr); +#endif + +#if AF_API_VERSION >= 33 + /** + Unlock device buffer in the memory manager. + + This function will give back the control over the device pointer to the memory manager. + \ingroup device_func_mem + */ + AFAPI af_err af_unlock_array(const af_array arr); +#endif + /** Get the device pointer and lock the buffer in memory manager. diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index 84cd246a60..51eb613bfa 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -214,29 +214,34 @@ af_err af_get_device_ptr(void **data, const af_array arr) } template -inline void lockDevicePtr(const af_array arr) +inline void lockArray(const af_array arr) { memPop((const T *)getArray(arr).get()); } af_err af_lock_device_ptr(const af_array arr) +{ + return af_lock_array(arr); +} + +af_err af_lock_array(const af_array arr) { try { af_dtype type = getInfo(arr).getType(); switch (type) { - case f32: lockDevicePtr(arr); break; - case f64: lockDevicePtr(arr); break; - case c32: lockDevicePtr(arr); break; - case c64: lockDevicePtr(arr); break; - case s32: lockDevicePtr(arr); break; - case u32: lockDevicePtr(arr); break; - case s64: lockDevicePtr(arr); break; - case u64: lockDevicePtr(arr); break; - case s16: lockDevicePtr(arr); break; - case u16: lockDevicePtr(arr); break; - case u8 : lockDevicePtr(arr); break; - case b8 : lockDevicePtr(arr); break; + case f32: lockArray(arr); break; + case f64: lockArray(arr); break; + case c32: lockArray(arr); break; + case c64: lockArray(arr); break; + case s32: lockArray(arr); break; + case u32: lockArray(arr); break; + case s64: lockArray(arr); break; + case u64: lockArray(arr); break; + case s16: lockArray(arr); break; + case u16: lockArray(arr); break; + case u8 : lockArray(arr); break; + case b8 : lockArray(arr); break; default: TYPE_ERROR(4, type); } @@ -246,29 +251,34 @@ af_err af_lock_device_ptr(const af_array arr) } template -inline void 
unlockDevicePtr(const af_array arr) +inline void unlockArray(const af_array arr) { memPush((const T *)getArray(arr).get()); } af_err af_unlock_device_ptr(const af_array arr) +{ + return af_unlock_array(arr); +} + +af_err af_unlock_array(const af_array arr) { try { af_dtype type = getInfo(arr).getType(); switch (type) { - case f32: unlockDevicePtr(arr); break; - case f64: unlockDevicePtr(arr); break; - case c32: unlockDevicePtr(arr); break; - case c64: unlockDevicePtr(arr); break; - case s32: unlockDevicePtr(arr); break; - case u32: unlockDevicePtr(arr); break; - case s64: unlockDevicePtr(arr); break; - case u64: unlockDevicePtr(arr); break; - case s16: unlockDevicePtr(arr); break; - case u16: unlockDevicePtr(arr); break; - case u8 : unlockDevicePtr(arr); break; - case b8 : unlockDevicePtr(arr); break; + case f32: unlockArray(arr); break; + case f64: unlockArray(arr); break; + case c32: unlockArray(arr); break; + case c64: unlockArray(arr); break; + case s32: unlockArray(arr); break; + case u32: unlockArray(arr); break; + case s64: unlockArray(arr); break; + case u64: unlockArray(arr); break; + case s16: unlockArray(arr); break; + case u16: unlockArray(arr); break; + case u8 : unlockArray(arr); break; + case b8 : unlockArray(arr); break; default: TYPE_ERROR(4, type); } diff --git a/src/api/cpp/array.cpp b/src/api/cpp/array.cpp index f7931cfa9f..b993e2f7e8 100644 --- a/src/api/cpp/array.cpp +++ b/src/api/cpp/array.cpp @@ -1057,11 +1057,11 @@ af::dtype implicit_dtype(af::dtype scalar_type, af::dtype array_type) void array::lock() const { - AF_THROW(af_lock_device_ptr(get())); + AF_THROW(af_lock_array(get())); } void array::unlock() const { - AF_THROW(af_unlock_device_ptr(get())); + AF_THROW(af_unlock_array(get())); } } diff --git a/src/api/unified/device.cpp b/src/api/unified/device.cpp index 8f04bf6ea1..1d5979ad13 100644 --- a/src/api/unified/device.cpp +++ b/src/api/unified/device.cpp @@ -148,6 +148,18 @@ af_err af_unlock_device_ptr(const af_array arr) return 
CALL(arr); } +af_err af_lock_array(const af_array arr) +{ + CHECK_ARRAYS(arr); + return CALL(arr); +} + +af_err af_unlock_array(const af_array arr) +{ + CHECK_ARRAYS(arr); + return CALL(arr); +} + af_err af_get_device_ptr(void **ptr, const af_array arr) { CHECK_ARRAYS(arr); From d02636a4570b17b75a78b7adca6cb83199f57e9f Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 18:21:36 -0500 Subject: [PATCH 124/288] Add memFreeUnlinked to free locked device ptrs --- src/api/c/device.cpp | 2 +- src/backend/cpu/memory.cpp | 26 +++++++++++++++++--------- src/backend/cpu/memory.hpp | 1 + src/backend/cuda/memory.cpp | 33 ++++++++++++++++++++++++--------- src/backend/cuda/memory.hpp | 1 + src/backend/opencl/memory.cpp | 32 +++++++++++++++++++++++--------- src/backend/opencl/memory.hpp | 4 +++- 7 files changed, 70 insertions(+), 29 deletions(-) diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index 51eb613bfa..c2f21a273b 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -309,7 +309,7 @@ af_err af_alloc_pinned(void **ptr, const dim_t bytes) af_err af_free_device(void *ptr) { try { - memFree((char *)ptr); + memFreeUnlinked((char *)ptr, true); } CATCHALL; return AF_SUCCESS; } diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 85ba4f27fb..0e14450fad 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -156,7 +156,7 @@ T* memAlloc(const size_t &elements) } template -void memFree(T *ptr) +void memFreeUnlinked(T *ptr, bool free_unlinked) { std::lock_guard lock(memory_map_mutex); @@ -165,8 +165,9 @@ void memFree(T *ptr) if (iter != memory_map.end()) { iter->second.is_free = true; - if ((iter->second).is_unlinked) return; + if ((iter->second).is_unlinked && !free_unlinked) return; + iter->second.is_unlinked = false; used_bytes -= iter->second.bytes; used_buffers--; @@ -175,6 +176,12 @@ void memFree(T *ptr) } } +template +void memFree(T *ptr) +{ + memFreeUnlinked(ptr, false); +} + template void 
memPop(const T *ptr) { @@ -226,13 +233,14 @@ void pinnedFree(T* ptr) memFree(ptr); } -#define INSTANTIATE(T) \ - template T* memAlloc(const size_t &elements); \ - template void memFree(T* ptr); \ - template void memPop(const T* ptr); \ - template void memPush(const T* ptr); \ - template T* pinnedAlloc(const size_t &elements); \ - template void pinnedFree(T* ptr); \ +#define INSTANTIATE(T) \ + template T* memAlloc(const size_t &elements); \ + template void memFree(T* ptr); \ + template void memFreeUnlinked(T* ptr, bool free_unlinked); \ + template void memPop(const T* ptr); \ + template void memPush(const T* ptr); \ + template T* pinnedAlloc(const size_t &elements); \ + template void pinnedFree(T* ptr); \ INSTANTIATE(float) INSTANTIATE(cfloat) diff --git a/src/backend/cpu/memory.hpp b/src/backend/cpu/memory.hpp index 0b1c960ed4..1fb8c64bbc 100644 --- a/src/backend/cpu/memory.hpp +++ b/src/backend/cpu/memory.hpp @@ -13,6 +13,7 @@ namespace cpu { template T* memAlloc(const size_t &elements); template void memFree(T* ptr); + template void memFreeUnlinked(T* ptr, bool free_unlinked); template void memPop(const T *ptr); template void memPush(const T *ptr); diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index 2632a0a3b4..e7ed8ac90e 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -64,6 +64,12 @@ namespace cuda cudaFreeWrapper(ptr); // Free it because we are not sure what the size is } + template + void memFreeUnlinked(T *ptr, bool free_unlinked) + { + cudaFreeWrapper(ptr); // Free it because we are not sure what the size is + } + template void memPop(const T *ptr) { @@ -232,7 +238,7 @@ namespace cuda } template - void memFree(T *ptr) + void memFreeUnlinked(T *ptr, bool free_unlinked) { int n = getActiveDeviceId(); mem_iter iter = memory_maps[n].find((void *)ptr); @@ -240,7 +246,9 @@ namespace cuda if (iter != memory_maps[n].end()) { iter->second.is_free = true; - if ((iter->second).is_unlinked) return; + if 
((iter->second).is_unlinked && !free_unlinked) return; + + iter->second.is_unlinked = false; used_bytes[n] -= iter->second.bytes; used_buffers[n]--; @@ -250,6 +258,12 @@ namespace cuda } } + template + void memFree(T *ptr) + { + memFreeUnlinked(ptr, false); + } + template void memPop(const T *ptr) { @@ -368,13 +382,14 @@ namespace cuda #endif -#define INSTANTIATE(T) \ - template T* memAlloc(const size_t &elements); \ - template void memFree(T* ptr); \ - template void memPop(const T* ptr); \ - template void memPush(const T* ptr); \ - template T* pinnedAlloc(const size_t &elements); \ - template void pinnedFree(T* ptr); \ +#define INSTANTIATE(T) \ + template T* memAlloc(const size_t &elements); \ + template void memFree(T* ptr); \ + template void memFreeUnlinked(T* ptr, bool free_unlinked); \ + template void memPop(const T* ptr); \ + template void memPush(const T* ptr); \ + template T* pinnedAlloc(const size_t &elements); \ + template void pinnedFree(T* ptr); \ INSTANTIATE(float) INSTANTIATE(cfloat) diff --git a/src/backend/cuda/memory.hpp b/src/backend/cuda/memory.hpp index 2e5fef2593..a4450f3ccf 100644 --- a/src/backend/cuda/memory.hpp +++ b/src/backend/cuda/memory.hpp @@ -13,6 +13,7 @@ namespace cuda { template T* memAlloc(const size_t &elements); template void memFree(T* ptr); + template void memFreeUnlinked(T* ptr, bool free_unlinked); template void memPop(const T *ptr); template void memPush(const T *ptr); diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index f4c740482e..7475710176 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -149,6 +149,11 @@ namespace opencl } void bufferFree(cl::Buffer *ptr) + { + bufferFreeUnlinked(ptr, false); + } + + void bufferFreeUnlinked(cl::Buffer *ptr, bool free_unlinked) { int n = getActiveDeviceId(); mem_iter iter = memory_maps[n].find(ptr); @@ -156,7 +161,9 @@ namespace opencl if (iter != memory_maps[n].end()) { iter->second.is_free = true; - if 
((iter->second).is_unlinked) return; + if ((iter->second).is_unlinked && !free_unlinked) return; + + iter->second.is_unlinked = false; used_bytes[n] -= iter->second.bytes; used_buffers[n]--; @@ -212,7 +219,13 @@ namespace opencl template void memFree(T *ptr) { - return bufferFree((cl::Buffer *)ptr); + return bufferFreeUnlinked((cl::Buffer *)ptr, false); + } + + template + void memFreeUnlinked(T *ptr, bool free_unlinked) + { + return bufferFreeUnlinked((cl::Buffer *)ptr, free_unlinked); } template @@ -341,13 +354,14 @@ namespace opencl return pinnedBufferFree((void *) ptr); } -#define INSTANTIATE(T) \ - template T* memAlloc(const size_t &elements); \ - template void memFree(T* ptr); \ - template void memPop(const T* ptr); \ - template void memPush(const T* ptr); \ - template T* pinnedAlloc(const size_t &elements); \ - template void pinnedFree(T* ptr); \ +#define INSTANTIATE(T) \ + template T* memAlloc(const size_t &elements); \ + template void memFree(T* ptr); \ + template void memFreeUnlinked(T* ptr, bool free_unlinked); \ + template void memPop(const T* ptr); \ + template void memPush(const T* ptr); \ + template T* pinnedAlloc(const size_t &elements); \ + template void pinnedFree(T* ptr); \ INSTANTIATE(float) INSTANTIATE(cfloat) diff --git a/src/backend/opencl/memory.hpp b/src/backend/opencl/memory.hpp index c315a9a2f6..40e30ebce7 100644 --- a/src/backend/opencl/memory.hpp +++ b/src/backend/opencl/memory.hpp @@ -16,9 +16,11 @@ namespace opencl cl::Buffer *bufferAlloc(const size_t &bytes); void bufferFree(cl::Buffer *buf); + void bufferFreeUnlinked(cl::Buffer *buf, bool free_unlinked); template T *memAlloc(const size_t &elements); - template void memFree(T *ptr); + template void memFree(T* ptr); + template void memFreeUnlinked(T* ptr, bool free_unlinked); template void memPop(const T *ptr); template void memPush(const T *ptr); From 330f4f87df56263375f904660ad7d800d94161f5 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Dec 2015 18:22:44 -0500 Subject: 
[PATCH 125/288] FEAT Add printMemInfo to print memory information --- include/af/device.h | 28 ++++++++++++++++++++++ src/api/c/device.cpp | 17 +++++++++++++ src/api/cpp/device.cpp | 5 ++++ src/api/unified/device.cpp | 5 ++++ src/backend/cpu/memory.cpp | 40 +++++++++++++++++++++++++++++++ src/backend/cpu/memory.hpp | 2 ++ src/backend/cuda/memory.cpp | 45 +++++++++++++++++++++++++++++++++++ src/backend/cuda/memory.hpp | 2 ++ src/backend/opencl/memory.cpp | 41 +++++++++++++++++++++++++++++++ src/backend/opencl/memory.hpp | 2 ++ 10 files changed, 187 insertions(+) diff --git a/include/af/device.h b/include/af/device.h index 394170c624..4a3006ffc7 100644 --- a/include/af/device.h +++ b/include/af/device.h @@ -205,6 +205,19 @@ namespace af AFAPI void deviceMemInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); +#if AF_API_VERSION >= 33 + /// + /// Prints buffer details from the ArrayFire Device Manager + // + /// \param [in] msg A message to print before the table + /// \param [in] device_id print the memory info of the specified device. + /// -1 signifies active device. + // + /// \ingroup device_func_mem + /// + AFAPI void printMemInfo(const char *msg = NULL, const int device_id = -1); +#endif + /// \brief Call the garbage collection function in the memory manager /// /// \ingroup device_func_mem @@ -324,6 +337,21 @@ extern "C" { AFAPI af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); +#if AF_API_VERSION >= 33 + /// + /// Prints buffer details from the ArrayFire Device Manager + // + /// \param [in] msg A message to print before the table + /// \param [in] device_id print the memory info of the specified device. + /// -1 signifies active device. 
+ /// + /// return AF_SUCCESS if successful + /// + /// \ingroup device_func_mem + /// + AFAPI af_err af_print_mem_info(const char *msg, const int device_id); +#endif + /** Call the garbage collection routine \ingroup device_func_mem diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index c2f21a273b..007e0ab7f2 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -17,6 +17,7 @@ #include #include #include "err_common.hpp" +#include using namespace detail; @@ -340,6 +341,22 @@ af_err af_free_host(void *ptr) return AF_SUCCESS; } +af_err af_print_mem_info(const char *msg, const int device_id) +{ + try { + int device = device_id; + if(device == -1) { + device = getActiveDeviceId(); + } + + if(msg != NULL) ARG_ASSERT(0, strlen(msg) < 256); // 256 character limit on msg + ARG_ASSERT(1, device >= 0 && device < getDeviceCount()); + + printMemInfo(msg ? msg : "", device); + } CATCHALL; + return AF_SUCCESS; +} + af_err af_device_gc() { try { diff --git a/src/api/cpp/device.cpp b/src/api/cpp/device.cpp index 3f2441732d..3b1609b9d4 100644 --- a/src/api/cpp/device.cpp +++ b/src/api/cpp/device.cpp @@ -159,6 +159,11 @@ namespace af AF_THROW(af_free_host((void *)ptr)); } + void printMemInfo(const char *msg, const int device_id) + { + AF_THROW(af_print_mem_info(msg, device_id)); + } + void deviceGC() { AF_THROW(af_device_gc()); diff --git a/src/api/unified/device.cpp b/src/api/unified/device.cpp index 1d5979ad13..f7e95569c9 100644 --- a/src/api/unified/device.cpp +++ b/src/api/unified/device.cpp @@ -121,6 +121,11 @@ af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers, return CALL(alloc_bytes, alloc_buffers, lock_bytes, lock_buffers); } +af_err af_print_mem_info(const char *msg, const int device_id) +{ + return CALL(msg, device_id); +} + af_err af_device_gc() { return CALL_NO_PARAMS(); diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 0e14450fad..8718c1e30e 100644 --- a/src/backend/cpu/memory.cpp +++ 
b/src/backend/cpu/memory.cpp @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include #include @@ -103,6 +106,43 @@ void garbageCollect() } } +void printMemInfo(const char *msg, const int device) +{ + std::cout << msg << std::endl; + + static const std::string head("| POINTER | SIZE | AF LOCK | USER LOCK |"); + static const std::string line(head.size(), '-'); + std::cout << line << std::endl << head << std::endl << line << std::endl; + + for(mem_iter iter = memory_map.begin(); + iter != memory_map.end(); ++iter) { + + std::string status_af("Unknown"); + std::string status_us("Unknown"); + + if(!(iter->second.is_free)) status_af = "Yes"; + else status_af = " No"; + + if((iter->second.is_unlinked)) status_us = "Yes"; + else status_us = " No"; + + std::string unit = "KB"; + double size = (double)(iter->second.bytes) / 1024; + if(size >= 1024) { + size = size / 1024; + unit = "MB"; + } + + std::cout << "| " << std::right << std::setw(14) << iter->first << " " + << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit + << " | " << std::setw(9) << status_af + << " | " << std::setw(9) << status_us + << " |" << std::endl; + } + + std::cout << line << std::endl; +} + template T* memAlloc(const size_t &elements) { diff --git a/src/backend/cpu/memory.hpp b/src/backend/cpu/memory.hpp index 1fb8c64bbc..41a156f174 100644 --- a/src/backend/cpu/memory.hpp +++ b/src/backend/cpu/memory.hpp @@ -28,6 +28,8 @@ namespace cpu void garbageCollect(); void pinnedGarbageCollect(); + void printMemInfo(const char *msg, const int device); + void setMemStepSize(size_t step_bytes); size_t getMemStepSize(void); } diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index e7ed8ac90e..50b13019c6 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -105,6 +108,10 @@ namespace cuda { } + void printMemInfo(const char 
*msg, const int device) + { + std::cout << "printMemInfo() disabled in AF_CUDA_MEM_DEBUG Mode" << std::endl; + } #else // Manager Class @@ -190,6 +197,44 @@ namespace cuda } } + void printMemInfo(const char *msg, const int device) + { + std::cout << msg << std::endl; + std::cout << "Memory Map for Device: " << device << std::endl; + + static const std::string head("| POINTER | SIZE | AF LOCK | USER LOCK |"); + static const std::string line(head.size(), '-'); + std::cout << line << std::endl << head << std::endl << line << std::endl; + + for(mem_iter iter = memory_maps[device].begin(); + iter != memory_maps[device].end(); ++iter) { + + std::string status_af("Unknown"); + std::string status_us("Unknown"); + + if(!(iter->second.is_free)) status_af = "Yes"; + else status_af = " No"; + + if((iter->second.is_unlinked)) status_us = "Yes"; + else status_us = " No"; + + std::string unit = "KB"; + double size = (double)(iter->second.bytes) / 1024; + if(size >= 1024) { + size = size / 1024; + unit = "MB"; + } + + std::cout << "| " << std::right << std::setw(14) << iter->first << " " + << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit + << " | " << std::setw(9) << status_af + << " | " << std::setw(9) << status_us + << " |" << std::endl; + } + + std::cout << line << std::endl; + } + template T* memAlloc(const size_t &elements) { diff --git a/src/backend/cuda/memory.hpp b/src/backend/cuda/memory.hpp index a4450f3ccf..2d419f2a2c 100644 --- a/src/backend/cuda/memory.hpp +++ b/src/backend/cuda/memory.hpp @@ -28,6 +28,8 @@ namespace cuda void garbageCollect(); void pinnedGarbageCollect(); + void printMemInfo(const char *msg, const int device); + void setMemStepSize(size_t step_bytes); size_t getMemStepSize(void); } diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index 7475710176..2c9a613754 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -10,6 +10,9 @@ #include #include #include +#include +#include 
+#include #include namespace opencl @@ -102,6 +105,44 @@ namespace opencl } } + void printMemInfo(const char *msg, const int device) + { + std::cout << msg << std::endl; + std::cout << "Memory Map for Device: " << device << std::endl; + + static const std::string head("| POINTER | SIZE | AF LOCK | USER LOCK |"); + static const std::string line(head.size(), '-'); + std::cout << line << std::endl << head << std::endl << line << std::endl; + + for(mem_iter iter = memory_maps[device].begin(); + iter != memory_maps[device].end(); ++iter) { + + std::string status_af("Unknown"); + std::string status_us("Unknown"); + + if(!(iter->second.is_free)) status_af = "Yes"; + else status_af = " No"; + + if((iter->second.is_unlinked)) status_us = "Yes"; + else status_us = " No"; + + std::string unit = "KB"; + double size = (double)(iter->second.bytes) / 1024; + if(size >= 1024) { + size = size / 1024; + unit = "MB"; + } + + std::cout << "| " << std::right << std::setw(14) << iter->first << " " + << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit + << " | " << std::setw(9) << status_af + << " | " << std::setw(9) << status_us + << " |" << std::endl; + } + + std::cout << line << std::endl; + } + cl::Buffer *bufferAlloc(const size_t &bytes) { int n = getActiveDeviceId(); diff --git a/src/backend/opencl/memory.hpp b/src/backend/opencl/memory.hpp index 40e30ebce7..625bd10343 100644 --- a/src/backend/opencl/memory.hpp +++ b/src/backend/opencl/memory.hpp @@ -35,6 +35,8 @@ namespace opencl void garbageCollect(); void pinnedGarbageCollect(); + void printMemInfo(const char *msg, const int device); + void setMemStepSize(size_t step_bytes); size_t getMemStepSize(void); } From ed5556c8d8d480ef42190e439e45ee2fb0165751 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 1 Jan 2016 11:59:10 -0500 Subject: [PATCH 126/288] Renamed is_free -> mngr_lock and is_unlinked -> user_lock in cpu memory mngr --- src/backend/cpu/memory.cpp | 46 +++++++++++++++++++------------------- 1 
file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 8718c1e30e..046e897455 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -60,8 +60,8 @@ static void managerInit() typedef struct { - bool is_free; - bool is_unlinked; + bool mngr_lock; // True if locked by memory manager, false if free + bool user_lock; // True if locked by user, false if free size_t bytes; } mem_info; @@ -85,9 +85,9 @@ void garbageCollect() for(mem_iter iter = memory_map.begin(); iter != memory_map.end(); ++iter) { - if ((iter->second).is_free) { + if (!(iter->second).mngr_lock) { - if (!(iter->second).is_unlinked) { + if (!(iter->second).user_lock) { freeWrapper(iter->first); total_bytes -= iter->second.bytes; } @@ -98,7 +98,7 @@ void garbageCollect() mem_iter memory_end = memory_map.end(); while(memory_curr != memory_end) { - if (memory_curr->second.is_free && !memory_curr->second.is_unlinked) { + if (!(memory_curr->second.mngr_lock) && !memory_curr->second.user_lock) { memory_map.erase(memory_curr++); } else { ++memory_curr; @@ -117,14 +117,14 @@ void printMemInfo(const char *msg, const int device) for(mem_iter iter = memory_map.begin(); iter != memory_map.end(); ++iter) { - std::string status_af("Unknown"); - std::string status_us("Unknown"); + std::string status_mngr("Unknown"); + std::string status_user("Unknown"); - if(!(iter->second.is_free)) status_af = "Yes"; - else status_af = " No"; + if(iter->second.mngr_lock) status_mngr = "Yes"; + else status_mngr = " No"; - if((iter->second.is_unlinked)) status_us = "Yes"; - else status_us = " No"; + if(iter->second.user_lock) status_user = "Yes"; + else status_user = " No"; std::string unit = "KB"; double size = (double)(iter->second.bytes) / 1024; @@ -135,8 +135,8 @@ void printMemInfo(const char *msg, const int device) std::cout << "| " << std::right << std::setw(14) << iter->first << " " << " | " << std::setw(7) << std::setprecision(4) << size << " 
" << unit - << " | " << std::setw(9) << status_af - << " | " << std::setw(9) << status_us + << " | " << std::setw(9) << status_mngr + << " | " << std::setw(9) << status_user << " |" << std::endl; } @@ -167,11 +167,11 @@ T* memAlloc(const size_t &elements) mem_info info = iter->second; - if ( info.is_free && - !info.is_unlinked && + if (!info.mngr_lock && + !info.user_lock && info.bytes == alloc_bytes) { - iter->second.is_free = false; + iter->second.mngr_lock = true; used_bytes += alloc_bytes; used_buffers++; return (T *)iter->first; @@ -185,7 +185,7 @@ T* memAlloc(const size_t &elements) AF_ERROR("Can not allocate memory", AF_ERR_NO_MEM); } - mem_info info = {false, false, alloc_bytes}; + mem_info info = {true, false, alloc_bytes}; memory_map[ptr] = info; used_bytes += alloc_bytes; @@ -204,10 +204,10 @@ void memFreeUnlinked(T *ptr, bool free_unlinked) if (iter != memory_map.end()) { - iter->second.is_free = true; - if ((iter->second).is_unlinked && !free_unlinked) return; + iter->second.mngr_lock = false; + if ((iter->second).user_lock && !free_unlinked) return; - iter->second.is_unlinked = false; + iter->second.user_lock = false; used_bytes -= iter->second.bytes; used_buffers--; @@ -230,9 +230,9 @@ void memPop(const T *ptr) mem_iter iter = memory_map.find((void *)ptr); if (iter != memory_map.end()) { - iter->second.is_unlinked = true; + iter->second.user_lock = true; } else { - mem_info info = { false, + mem_info info = { true, true, 100 }; //This number is not relevant @@ -246,7 +246,7 @@ void memPush(const T *ptr) std::lock_guard lock(memory_map_mutex); mem_iter iter = memory_map.find((void *)ptr); if (iter != memory_map.end()) { - iter->second.is_unlinked = false; + iter->second.user_lock = false; } } From aa25b17796b046b3c85e45e367fc57d1ddad25b9 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 1 Jan 2016 12:39:17 -0500 Subject: [PATCH 127/288] Renamed is_free -> mngr_lock and is_unlinked -> user_lock in cuda memory mngr --- 
src/backend/cuda/memory.cpp | 60 ++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index 50b13019c6..4937ddd196 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -156,8 +156,8 @@ namespace cuda typedef struct { - bool is_free; - bool is_unlinked; + bool mngr_lock; + bool user_lock; size_t bytes; } mem_info; @@ -176,9 +176,9 @@ namespace cuda for(mem_iter iter = memory_maps[n].begin(); iter != memory_maps[n].end(); ++iter) { - if ((iter->second).is_free) { + if (!(iter->second.mngr_lock)) { - if (!(iter->second).is_unlinked) { + if (!(iter->second.user_lock)) { cudaFreeWrapper(iter->first); total_bytes[n] -= iter->second.bytes; } @@ -189,7 +189,7 @@ namespace cuda mem_iter memory_end = memory_maps[n].end(); while(memory_curr != memory_end) { - if (memory_curr->second.is_free && !memory_curr->second.is_unlinked) { + if (!(memory_curr->second.mngr_lock) && !(memory_curr->second.user_lock)) { memory_maps[n].erase(memory_curr++); } else { ++memory_curr; @@ -209,14 +209,14 @@ namespace cuda for(mem_iter iter = memory_maps[device].begin(); iter != memory_maps[device].end(); ++iter) { - std::string status_af("Unknown"); - std::string status_us("Unknown"); + std::string status_mngr("Unknown"); + std::string status_user("Unknown"); - if(!(iter->second.is_free)) status_af = "Yes"; - else status_af = " No"; + if(iter->second.mngr_lock) status_mngr = "Yes"; + else status_mngr = " No"; - if((iter->second.is_unlinked)) status_us = "Yes"; - else status_us = " No"; + if(iter->second.user_lock) status_user = "Yes"; + else status_user = " No"; std::string unit = "KB"; double size = (double)(iter->second.bytes) / 1024; @@ -227,8 +227,8 @@ namespace cuda std::cout << "| " << std::right << std::setw(14) << iter->first << " " << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit - << " | " << std::setw(9) << status_af - << " | " << 
std::setw(9) << status_us + << " | " << std::setw(9) << status_mngr + << " | " << std::setw(9) << status_user << " |" << std::endl; } @@ -256,11 +256,11 @@ namespace cuda mem_info info = iter->second; - if ( info.is_free && - !info.is_unlinked && - info.bytes == alloc_bytes) { + if (!info.mngr_lock && + !info.user_lock && + info.bytes == alloc_bytes) { - iter->second.is_free = false; + iter->second.mngr_lock = true; used_bytes[n] += alloc_bytes; used_buffers[n]++; return (T *)iter->first; @@ -273,7 +273,7 @@ namespace cuda CUDA_CHECK(cudaMalloc((void **)(&ptr), alloc_bytes)); } - mem_info info = {false, false, alloc_bytes}; + mem_info info = {true, false, alloc_bytes}; memory_maps[n][ptr] = info; used_bytes[n] += alloc_bytes; used_buffers[n]++; @@ -290,10 +290,10 @@ namespace cuda if (iter != memory_maps[n].end()) { - iter->second.is_free = true; - if ((iter->second).is_unlinked && !free_unlinked) return; + iter->second.mngr_lock = false; + if ((iter->second.user_lock) && !free_unlinked) return; - iter->second.is_unlinked = false; + iter->second.user_lock = false; used_bytes[n] -= iter->second.bytes; used_buffers[n]--; @@ -316,10 +316,10 @@ namespace cuda mem_iter iter = memory_maps[n].find((void *)ptr); if (iter != memory_maps[n].end()) { - iter->second.is_unlinked = true; + iter->second.user_lock = true; } else { - mem_info info = { false, + mem_info info = { true, true, 100 }; //This number is not relevant @@ -333,7 +333,7 @@ namespace cuda int n = getActiveDeviceId(); mem_iter iter = memory_maps[n].find((void *)ptr); if (iter != memory_maps[n].end()) { - iter->second.is_unlinked = false; + iter->second.user_lock = false; } } @@ -354,7 +354,7 @@ namespace cuda void pinnedGarbageCollect() { for(mem_iter iter = pinned_maps.begin(); iter != pinned_maps.end(); ++iter) { - if ((iter->second).is_free) { + if (!(iter->second.mngr_lock)) { pinnedFreeWrapper(iter->first); } } @@ -363,7 +363,7 @@ namespace cuda mem_iter memory_end = pinned_maps.end(); while(memory_curr != 
memory_end) { - if (memory_curr->second.is_free) { + if (!(memory_curr->second.mngr_lock)) { pinned_maps.erase(memory_curr++); } else { ++memory_curr; @@ -392,8 +392,8 @@ namespace cuda iter != pinned_maps.end(); ++iter) { mem_info info = iter->second; - if (info.is_free && info.bytes == alloc_bytes) { - iter->second.is_free = false; + if (!info.mngr_lock && info.bytes == alloc_bytes) { + iter->second.mngr_lock = true; pinned_used_bytes += alloc_bytes; return (T *)iter->first; } @@ -405,7 +405,7 @@ namespace cuda CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes)); } - mem_info info = {false, false, alloc_bytes}; + mem_info info = {true, false, alloc_bytes}; pinned_maps[ptr] = info; pinned_used_bytes += alloc_bytes; } @@ -418,7 +418,7 @@ namespace cuda mem_iter iter = pinned_maps.find((void *)ptr); if (iter != pinned_maps.end()) { - iter->second.is_free = true; + iter->second.mngr_lock = false; pinned_used_bytes -= iter->second.bytes; } else { pinnedFreeWrapper(ptr); // Free it because we are not sure what the size is From cef8559e520276e08b38b38861a697fbcfde2a37 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 1 Jan 2016 12:40:21 -0500 Subject: [PATCH 128/288] Renamed is_free -> mngr_lock and is_unlinked -> user_lock in opencl memory mngr --- src/backend/opencl/memory.cpp | 60 +++++++++++++++++------------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index 2c9a613754..1ba4ce1cbb 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -60,8 +60,8 @@ namespace opencl typedef struct { - bool is_free; - bool is_unlinked; + bool mngr_lock; + bool user_lock; size_t bytes; } mem_info; @@ -84,9 +84,9 @@ namespace opencl for(mem_iter iter = memory_maps[n].begin(); iter != memory_maps[n].end(); ++iter) { - if ((iter->second).is_free) { + if (!(iter->second).mngr_lock) { - if (!(iter->second).is_unlinked) { + if (!(iter->second).user_lock) { 
destroy(iter->first); total_bytes[n] -= iter->second.bytes; } @@ -97,7 +97,7 @@ namespace opencl mem_iter memory_end = memory_maps[n].end(); while(memory_curr != memory_end) { - if (memory_curr->second.is_free && !memory_curr->second.is_unlinked) { + if (!memory_curr->second.mngr_lock && !memory_curr->second.user_lock) { memory_curr = memory_maps[n].erase(memory_curr); } else { ++memory_curr; @@ -117,14 +117,14 @@ namespace opencl for(mem_iter iter = memory_maps[device].begin(); iter != memory_maps[device].end(); ++iter) { - std::string status_af("Unknown"); - std::string status_us("Unknown"); + std::string status_mngr("Unknown"); + std::string status_user("Unknown"); - if(!(iter->second.is_free)) status_af = "Yes"; - else status_af = " No"; + if(iter->second.mngr_lock) status_mngr = "Yes"; + else status_mngr = " No"; - if((iter->second.is_unlinked)) status_us = "Yes"; - else status_us = " No"; + if(iter->second.user_lock) status_user = "Yes"; + else status_user = " No"; std::string unit = "KB"; double size = (double)(iter->second.bytes) / 1024; @@ -135,8 +135,8 @@ namespace opencl std::cout << "| " << std::right << std::setw(14) << iter->first << " " << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit - << " | " << std::setw(9) << status_af - << " | " << std::setw(9) << status_us + << " | " << std::setw(9) << status_mngr + << " | " << std::setw(9) << status_user << " |" << std::endl; } @@ -162,11 +162,11 @@ namespace opencl mem_info info = iter->second; - if ( info.is_free && - !info.is_unlinked && + if (!info.mngr_lock && + !info.user_lock && info.bytes == alloc_bytes) { - iter->second.is_free = false; + iter->second.mngr_lock = true; used_bytes[n] += alloc_bytes; used_buffers[n]++; return iter->first; @@ -180,7 +180,7 @@ namespace opencl ptr = new cl::Buffer(getContext(), CL_MEM_READ_WRITE, alloc_bytes); } - mem_info info = {false, false, alloc_bytes}; + mem_info info = {true, false, alloc_bytes}; memory_maps[n][ptr] = info; used_bytes[n] += 
alloc_bytes; used_buffers[n]++; @@ -201,10 +201,10 @@ namespace opencl if (iter != memory_maps[n].end()) { - iter->second.is_free = true; - if ((iter->second).is_unlinked && !free_unlinked) return; + iter->second.mngr_lock = false; + if ((iter->second).user_lock && !free_unlinked) return; - iter->second.is_unlinked = false; + iter->second.user_lock = false; used_bytes[n] -= iter->second.bytes; used_buffers[n]--; @@ -219,11 +219,11 @@ namespace opencl mem_iter iter = memory_maps[n].find(ptr); if (iter != memory_maps[n].end()) { - iter->second.is_unlinked = true; + iter->second.user_lock = true; } else { - mem_info info = { false, - false, + mem_info info = { true, + true, 100 }; //This number is not relevant memory_maps[n][ptr] = info; @@ -236,7 +236,7 @@ namespace opencl mem_iter iter = memory_maps[n].find(ptr); if (iter != memory_maps[n].end()) { - iter->second.is_unlinked = false; + iter->second.user_lock = false; } } @@ -302,7 +302,7 @@ namespace opencl { int n = getActiveDeviceId(); for(auto &iter : pinned_maps[n]) { - if ((iter.second).info.is_free) { + if (!(iter.second).info.mngr_lock) { pinnedDestroy(iter.second.buf, iter.first); } } @@ -311,7 +311,7 @@ namespace opencl pinned_iter memory_end = pinned_maps[n].end(); while(memory_curr != memory_end) { - if (memory_curr->second.info.is_free) { + if (!memory_curr->second.info.mngr_lock) { memory_curr = pinned_maps[n].erase(memory_curr); } else { ++memory_curr; @@ -341,8 +341,8 @@ namespace opencl iter != pinned_maps[n].end(); ++iter) { mem_info info = iter->second.info; - if (info.is_free && info.bytes == alloc_bytes) { - iter->second.info.is_free = false; + if (!info.mngr_lock && info.bytes == alloc_bytes) { + iter->second.info.mngr_lock = true; pinned_used_bytes += alloc_bytes; return iter->first; } @@ -360,7 +360,7 @@ namespace opencl ptr = getQueue().enqueueMapBuffer(*buf, true, CL_MAP_READ|CL_MAP_WRITE, 0, alloc_bytes); } - mem_info info = {false, false, alloc_bytes}; + mem_info info = {true, false, 
alloc_bytes}; pinned_info pt = {buf, info}; pinned_maps[n][ptr] = pt; pinned_used_bytes += alloc_bytes; @@ -374,7 +374,7 @@ namespace opencl pinned_iter iter = pinned_maps[n].find(ptr); if (iter != pinned_maps[n].end()) { - iter->second.info.is_free = true; + iter->second.info.mngr_lock = false; pinned_used_bytes -= iter->second.info.bytes; } else { pinnedDestroy(iter->second.buf, ptr); // Free it because we are not sure what the size is From dbe861ebea0ad871349d5659e8bbc890efacd151 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 1 Jan 2016 12:45:54 -0500 Subject: [PATCH 129/288] Reverse conditions for freeing in memory managers --- src/backend/cpu/memory.cpp | 6 +++--- src/backend/cuda/memory.cpp | 12 ++++++------ src/backend/opencl/memory.cpp | 12 ++++++------ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 046e897455..e2204eccd5 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -98,10 +98,10 @@ void garbageCollect() mem_iter memory_end = memory_map.end(); while(memory_curr != memory_end) { - if (!(memory_curr->second.mngr_lock) && !memory_curr->second.user_lock) { - memory_map.erase(memory_curr++); - } else { + if (memory_curr->second.mngr_lock || memory_curr->second.user_lock) { ++memory_curr; + } else { + memory_map.erase(memory_curr++); } } } diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index 4937ddd196..52609506c0 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -189,10 +189,10 @@ namespace cuda mem_iter memory_end = memory_maps[n].end(); while(memory_curr != memory_end) { - if (!(memory_curr->second.mngr_lock) && !(memory_curr->second.user_lock)) { - memory_maps[n].erase(memory_curr++); - } else { + if (memory_curr->second.mngr_lock || memory_curr->second.user_lock) { ++memory_curr; + } else { + memory_maps[n].erase(memory_curr++); } } } @@ -363,10 +363,10 @@ namespace cuda mem_iter 
memory_end = pinned_maps.end(); while(memory_curr != memory_end) { - if (!(memory_curr->second.mngr_lock)) { - pinned_maps.erase(memory_curr++); - } else { + if (memory_curr->second.mngr_lock) { ++memory_curr; + } else { + pinned_maps.erase(memory_curr++); } } } diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index 1ba4ce1cbb..c37ae2a4e1 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -97,10 +97,10 @@ namespace opencl mem_iter memory_end = memory_maps[n].end(); while(memory_curr != memory_end) { - if (!memory_curr->second.mngr_lock && !memory_curr->second.user_lock) { - memory_curr = memory_maps[n].erase(memory_curr); - } else { + if (memory_curr->second.mngr_lock || memory_curr->second.user_lock) { ++memory_curr; + } else { + memory_maps[n].erase(memory_curr++); } } } @@ -311,10 +311,10 @@ namespace opencl pinned_iter memory_end = pinned_maps[n].end(); while(memory_curr != memory_end) { - if (!memory_curr->second.info.mngr_lock) { - memory_curr = pinned_maps[n].erase(memory_curr); - } else { + if (memory_curr->second.info.mngr_lock) { ++memory_curr; + } else { + memory_curr = pinned_maps[n].erase(memory_curr); } } From 33fbf33e4383b610f01efe0294a246574f260b56 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 1 Jan 2016 16:01:26 -0500 Subject: [PATCH 130/288] Renamed internal memFree functions * FreeUnlinked -> FreeLocked --- src/api/c/device.cpp | 2 +- src/backend/cpu/memory.cpp | 8 ++++---- src/backend/cpu/memory.hpp | 7 ++++++- src/backend/cuda/memory.cpp | 10 +++++----- src/backend/cuda/memory.hpp | 6 +++++- src/backend/opencl/memory.cpp | 14 +++++++------- src/backend/opencl/memory.hpp | 12 ++++++++++-- 7 files changed, 38 insertions(+), 21 deletions(-) diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index 007e0ab7f2..8f332994e7 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -310,7 +310,7 @@ af_err af_alloc_pinned(void **ptr, const dim_t bytes) af_err 
af_free_device(void *ptr) { try { - memFreeUnlinked((char *)ptr, true); + memFreeLocked((char *)ptr, true); } CATCHALL; return AF_SUCCESS; } diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index e2204eccd5..625f9b2416 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -196,7 +196,7 @@ T* memAlloc(const size_t &elements) } template -void memFreeUnlinked(T *ptr, bool free_unlinked) +void memFreeLocked(T *ptr, bool freeLocked) { std::lock_guard lock(memory_map_mutex); @@ -205,7 +205,7 @@ void memFreeUnlinked(T *ptr, bool free_unlinked) if (iter != memory_map.end()) { iter->second.mngr_lock = false; - if ((iter->second).user_lock && !free_unlinked) return; + if ((iter->second).user_lock && !freeLocked) return; iter->second.user_lock = false; used_bytes -= iter->second.bytes; @@ -219,7 +219,7 @@ void memFreeUnlinked(T *ptr, bool free_unlinked) template void memFree(T *ptr) { - memFreeUnlinked(ptr, false); + memFreeLocked(ptr, false); } template @@ -276,7 +276,7 @@ void pinnedFree(T* ptr) #define INSTANTIATE(T) \ template T* memAlloc(const size_t &elements); \ template void memFree(T* ptr); \ - template void memFreeUnlinked(T* ptr, bool free_unlinked); \ + template void memFreeLocked(T* ptr, bool freeLocked); \ template void memPop(const T* ptr); \ template void memPush(const T* ptr); \ template T* pinnedAlloc(const size_t &elements); \ diff --git a/src/backend/cpu/memory.hpp b/src/backend/cpu/memory.hpp index 41a156f174..19846c46bf 100644 --- a/src/backend/cpu/memory.hpp +++ b/src/backend/cpu/memory.hpp @@ -12,8 +12,13 @@ namespace cpu { template T* memAlloc(const size_t &elements); + + // Need these as 2 separate function and not a default argument + // This is because it is used as the deleter in shared pointer + // which cannot support default arguments template void memFree(T* ptr); - template void memFreeUnlinked(T* ptr, bool free_unlinked); + template void memFreeLocked(T* ptr, bool freeLocked); + template void 
memPop(const T *ptr); template void memPush(const T *ptr); diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index 52609506c0..8152c8a25d 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -68,7 +68,7 @@ namespace cuda } template - void memFreeUnlinked(T *ptr, bool free_unlinked) + void memFreeLocked(T *ptr, bool freeLocked) { cudaFreeWrapper(ptr); // Free it because we are not sure what the size is } @@ -283,7 +283,7 @@ namespace cuda } template - void memFreeUnlinked(T *ptr, bool free_unlinked) + void memFreeLocked(T *ptr, bool freeLocked) { int n = getActiveDeviceId(); mem_iter iter = memory_maps[n].find((void *)ptr); @@ -291,7 +291,7 @@ namespace cuda if (iter != memory_maps[n].end()) { iter->second.mngr_lock = false; - if ((iter->second.user_lock) && !free_unlinked) return; + if ((iter->second.user_lock) && !freeLocked) return; iter->second.user_lock = false; @@ -306,7 +306,7 @@ namespace cuda template void memFree(T *ptr) { - memFreeUnlinked(ptr, false); + memFreeLocked(ptr, false); } template @@ -430,7 +430,7 @@ namespace cuda #define INSTANTIATE(T) \ template T* memAlloc(const size_t &elements); \ template void memFree(T* ptr); \ - template void memFreeUnlinked(T* ptr, bool free_unlinked); \ + template void memFreeLocked(T* ptr, bool freeLocked); \ template void memPop(const T* ptr); \ template void memPush(const T* ptr); \ template T* pinnedAlloc(const size_t &elements); \ diff --git a/src/backend/cuda/memory.hpp b/src/backend/cuda/memory.hpp index 2d419f2a2c..5644a52371 100644 --- a/src/backend/cuda/memory.hpp +++ b/src/backend/cuda/memory.hpp @@ -12,8 +12,12 @@ namespace cuda { template T* memAlloc(const size_t &elements); + + // Need these as 2 separate function and not a default argument + // This is because it is used as the deleter in shared pointer + // which cannot support default arguments template void memFree(T* ptr); - template void memFreeUnlinked(T* ptr, bool free_unlinked); + template void 
memFreeLocked(T* ptr, bool freeLocked); template void memPop(const T *ptr); template void memPush(const T *ptr); diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index c37ae2a4e1..141610d71f 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -191,10 +191,10 @@ namespace opencl void bufferFree(cl::Buffer *ptr) { - bufferFreeUnlinked(ptr, false); + bufferFreeLocked(ptr, false); } - void bufferFreeUnlinked(cl::Buffer *ptr, bool free_unlinked) + void bufferFreeLocked(cl::Buffer *ptr, bool freeLocked) { int n = getActiveDeviceId(); mem_iter iter = memory_maps[n].find(ptr); @@ -202,7 +202,7 @@ namespace opencl if (iter != memory_maps[n].end()) { iter->second.mngr_lock = false; - if ((iter->second).user_lock && !free_unlinked) return; + if ((iter->second).user_lock && !freeLocked) return; iter->second.user_lock = false; @@ -260,13 +260,13 @@ namespace opencl template void memFree(T *ptr) { - return bufferFreeUnlinked((cl::Buffer *)ptr, false); + return bufferFreeLocked((cl::Buffer *)ptr, false); } template - void memFreeUnlinked(T *ptr, bool free_unlinked) + void memFreeLocked(T *ptr, bool freeLocked) { - return bufferFreeUnlinked((cl::Buffer *)ptr, free_unlinked); + return bufferFreeLocked((cl::Buffer *)ptr, freeLocked); } template @@ -398,7 +398,7 @@ namespace opencl #define INSTANTIATE(T) \ template T* memAlloc(const size_t &elements); \ template void memFree(T* ptr); \ - template void memFreeUnlinked(T* ptr, bool free_unlinked); \ + template void memFreeLocked(T* ptr, bool freeLocked); \ template void memPop(const T* ptr); \ template void memPush(const T* ptr); \ template T* pinnedAlloc(const size_t &elements); \ diff --git a/src/backend/opencl/memory.hpp b/src/backend/opencl/memory.hpp index 625bd10343..96292cdfac 100644 --- a/src/backend/opencl/memory.hpp +++ b/src/backend/opencl/memory.hpp @@ -15,12 +15,20 @@ namespace opencl { cl::Buffer *bufferAlloc(const size_t &bytes); + + // Need these as 2 separate 
function and not a default argument + // This is because it is used as the deleter in shared pointer + // which cannot support default arguments void bufferFree(cl::Buffer *buf); - void bufferFreeUnlinked(cl::Buffer *buf, bool free_unlinked); + void bufferFreeLocked(cl::Buffer *buf, bool freeLocked); template T *memAlloc(const size_t &elements); + + // Need these as 2 separate function and not a default argument + // This is because it is used as the deleter in shared pointer + // which cannot support default arguments template void memFree(T* ptr); - template void memFreeUnlinked(T* ptr, bool free_unlinked); + template void memFreeLocked(T* ptr, bool freeLocked); template void memPop(const T *ptr); template void memPush(const T *ptr); From 8cb21a432c4957af96125fe844f48fe84dc5f345 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 1 Jan 2016 21:24:54 -0500 Subject: [PATCH 131/288] Fix AF_DISABLE_GRAPHICS condition (Fixes e19a6be) --- src/api/c/graphics_common.cpp | 2 +- src/backend/opencl/platform.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api/c/graphics_common.cpp b/src/api/c/graphics_common.cpp index 291bf84275..dc5a46b5e1 100644 --- a/src/api/c/graphics_common.cpp +++ b/src/api/c/graphics_common.cpp @@ -147,7 +147,7 @@ fg::Window* ForgeManager::getMainWindow(const bool dontCreate) // Define AF_DISABLE_GRAPHICS with any value to disable initialization std::string noGraphicsENV = getEnvVar("AF_DISABLE_GRAPHICS"); - if(!noGraphicsENV.empty()) { // If AF_DISABLE_GRAPHICS is not defined + if(noGraphicsENV.empty()) { // If AF_DISABLE_GRAPHICS is not defined if (flag && !dontCreate) { wnd = new fg::Window(WIDTH, HEIGHT, "ArrayFire", NULL, true); CheckGL("End ForgeManager::getMainWindow"); diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 8d77e24cbd..0cd46d25f6 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -174,7 +174,7 @@ DeviceManager::DeviceManager() 
#if defined(WITH_GRAPHICS) // Define AF_DISABLE_GRAPHICS with any value to disable initialization std::string noGraphicsENV = getEnvVar("AF_DISABLE_GRAPHICS"); - if(!noGraphicsENV.empty()) { // If AF_DISABLE_GRAPHICS is not defined + if(noGraphicsENV.empty()) { // If AF_DISABLE_GRAPHICS is not defined try { int devCount = mDevices.size(); fg::Window* wHandle = graphics::ForgeManager::getInstance().getMainWindow(); From c2d7e42cc21cba574b3afa65b6ffc9c3e048c342 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 1 Jan 2016 19:19:44 -0500 Subject: [PATCH 132/288] Fix clang warnings (std::abs, pragma ignores) Fix clang warnings for abs in tests Fix clang warnings for abs in examples Fix orb maybe-initialized pragma for clang and msvc Ignore unused function warning in opencl math and ireduce Ignore missing braces warning from clang in opencl magma_helper --- examples/graphics/fractal.cpp | 3 +- .../adaptive_thresholding.cpp | 1 + .../image_processing/brain_segmentation.cpp | 10 +++-- examples/image_processing/filters.cpp | 2 +- src/api/c/assign.cpp | 2 +- src/api/c/index.cpp | 2 +- src/backend/opencl/kernel/ireduce.hpp | 14 +++++++ src/backend/opencl/kernel/orb.hpp | 37 +++++++++++++++++-- src/backend/opencl/magma/magma_helper.cpp | 15 ++++++++ src/backend/opencl/math.hpp | 15 ++++++++ test/approx1.cpp | 1 + test/approx2.cpp | 1 + test/bilateral.cpp | 1 + test/binary.cpp | 1 + test/cholesky_dense.cpp | 1 + test/convolve.cpp | 1 + test/diagonal.cpp | 1 + test/dot.cpp | 1 + test/fast.cpp | 1 + test/fft.cpp | 1 + test/fft_real.cpp | 1 + test/fftconvolve.cpp | 1 + test/getting_started.cpp | 1 + test/gloh_nonfree.cpp | 1 + test/harris.cpp | 1 + test/histogram.cpp | 1 + test/homography.cpp | 1 + test/inverse_dense.cpp | 1 + test/lu_dense.cpp | 1 + test/math.cpp | 1 + test/meanshift.cpp | 1 + test/medfilt.cpp | 1 + test/morph.cpp | 1 + test/orb.cpp | 1 + test/qr_dense.cpp | 1 + test/rank_dense.cpp | 1 + test/resize.cpp | 1 + test/rotate.cpp | 1 + test/rotate_linear.cpp 
| 1 + test/sift_nonfree.cpp | 1 + test/solve_dense.cpp | 1 + test/susan.cpp | 1 + test/svd_dense.cpp | 1 + test/transform.cpp | 1 + test/translate.cpp | 1 + test/transpose.cpp | 1 + test/triangle.cpp | 1 + test/wrap.cpp | 1 + 48 files changed, 128 insertions(+), 11 deletions(-) diff --git a/examples/graphics/fractal.cpp b/examples/graphics/fractal.cpp index 9ac5a86ea9..9781b61c90 100644 --- a/examples/graphics/fractal.cpp +++ b/examples/graphics/fractal.cpp @@ -10,13 +10,14 @@ #include #include #include -#include +#include #include #define WIDTH 400 // Width of image #define HEIGHT 400 // Width of image using namespace af; +using std::abs; array complex_grid(int width, int height, float zoom, float center[2]) { diff --git a/examples/image_processing/adaptive_thresholding.cpp b/examples/image_processing/adaptive_thresholding.cpp index 5ce34e76be..1004285148 100644 --- a/examples/image_processing/adaptive_thresholding.cpp +++ b/examples/image_processing/adaptive_thresholding.cpp @@ -13,6 +13,7 @@ #include using namespace af; +using std::abs; typedef enum { MEAN = 0, diff --git a/examples/image_processing/brain_segmentation.cpp b/examples/image_processing/brain_segmentation.cpp index 7349bf258b..253d37e5f1 100644 --- a/examples/image_processing/brain_segmentation.cpp +++ b/examples/image_processing/brain_segmentation.cpp @@ -23,10 +23,12 @@ const float h_sy_kernel[] = { -1, 0, 1, -2, 0, 2, -1, 0, 1 }; -const float h_lp_kernel[] = { -0.5f, -1.0f, -0.5f, - -1.0f, 6.0f, -1.0f, - -0.5f, -1.0f, -0.5f -}; + +// Unused +//const float h_lp_kernel[] = { -0.5f, -1.0f, -0.5f, +// -1.0f, 6.0f, -1.0f, +// -0.5f, -1.0f, -0.5f +//}; array edges_slice(array x) { diff --git a/examples/image_processing/filters.cpp b/examples/image_processing/filters.cpp index 8b75acf063..ae1d7c155c 100644 --- a/examples/image_processing/filters.cpp +++ b/examples/image_processing/filters.cpp @@ -151,7 +151,7 @@ array medianfilter(const array &in, int window_width, int window_height) return ret_val; } 
-array gaussianblur(const array &in, int window_width, int window_height, int sigma) +array gaussianblur(const array &in, int window_width, int window_height, double sigma) { array g = gaussianKernel(window_width, window_height, sigma, sigma); return convolve(in, g); diff --git a/src/api/c/assign.cpp b/src/api/c/assign.cpp index b8fcb12234..50224d32a6 100644 --- a/src/api/c/assign.cpp +++ b/src/api/c/assign.cpp @@ -125,7 +125,7 @@ af_err af_assign_seq(af_array *out, ArrayInfo lInfo = getInfo(lhs); - if (ndims == 1 && ndims != (dim_t)lInfo.ndims()) { + if (ndims == 1 && ndims != lInfo.ndims()) { af_array tmp_in, tmp_out; AF_CHECK(af_flat(&tmp_in, lhs)); AF_CHECK(af_assign_seq(&tmp_out, tmp_in, ndims, index, rhs)); diff --git a/src/api/c/index.cpp b/src/api/c/index.cpp index b6eb8ab4cd..2f5b06aa07 100644 --- a/src/api/c/index.cpp +++ b/src/api/c/index.cpp @@ -42,7 +42,7 @@ af_err af_index(af_array *result, const af_array in, const unsigned ndims, const try { ArrayInfo iInfo = getInfo(in); - if (ndims == 1 && ndims != (dim_t)iInfo.ndims()) { + if (ndims == 1 && ndims != iInfo.ndims()) { af_array tmp_in; AF_CHECK(af_flat(&tmp_in, in)); AF_CHECK(af_index(result, tmp_in, ndims, index)); diff --git a/src/backend/opencl/kernel/ireduce.hpp b/src/backend/opencl/kernel/ireduce.hpp index 0adc0c8e47..17fc460970 100644 --- a/src/backend/opencl/kernel/ireduce.hpp +++ b/src/backend/opencl/kernel/ireduce.hpp @@ -281,6 +281,14 @@ namespace kernel } } +#if defined(__GNUC__) || defined(__GNUG__) + /* GCC/G++, Clang/LLVM, Intel ICC */ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-function" +#else + /* Other */ +#endif + template double cabs(const T in) { return (double)in; } static double cabs(const cfloat in) { return (double)abs(in); } static double cabs(const cdouble in) { return (double)abs(in); } @@ -327,6 +335,12 @@ namespace kernel } }; +#if defined(__GNUC__) || defined(__GNUG__) + /* GCC/G++, Clang/LLVM, Intel ICC */ + #pragma GCC diagnostic pop 
+#else + /* Other */ +#endif template T ireduce_all(uint *loc, Param in) diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp index 871370d63b..69c1176210 100644 --- a/src/backend/opencl/kernel/orb.hpp +++ b/src/backend/opencl/kernel/orb.hpp @@ -29,8 +29,24 @@ using cl::LocalSpaceArg; using cl::NDRange; using std::vector; -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#if defined(__clang__) + /* Clang/LLVM */ + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wsometimes-uninitialized" +#elif defined(__ICC) || defined(__INTEL_COMPILER) + /* Intel ICC/ICPC */ + // Fix the warning code here, if any +#elif defined(__GNUC__) || defined(__GNUG__) + /* GNU GCC/G++ */ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#elif defined(_MSC_VER) + /* Microsoft Visual Studio */ + #pragma warning( push ) + #pragma warning( disable : 4700 ) +#else + /* Other */ +#endif namespace opencl { @@ -505,4 +521,19 @@ void orb(unsigned* out_feat, } //namespace kernel } //namespace opencl -#pragma GCC diagnostic pop + +#if defined(__clang__) + /* Clang/LLVM */ + #pragma clang diagnostic pop +#elif defined(__ICC) || defined(__INTEL_COMPILER) + /* Intel ICC/ICPC */ + // Fix the warning code here, if any +#elif defined(__GNUC__) || defined(__GNUG__) + /* GNU GCC/G++ */ + #pragma GCC diagnostic pop +#elif defined(_MSC_VER) + /* Microsoft Visual Studio */ + #pragma warning( pop ) +#else + /* Other */ +#endif diff --git a/src/backend/opencl/magma/magma_helper.cpp b/src/backend/opencl/magma/magma_helper.cpp index 584a412191..481f08c346 100644 --- a/src/backend/opencl/magma/magma_helper.cpp +++ b/src/backend/opencl/magma/magma_helper.cpp @@ -159,6 +159,14 @@ magma_int_t magma_get_geqrf_nb( magma_int_t m ) else return 128; } +#if defined(__GNUC__) || defined(__GNUG__) + /* GCC/G++, Clang/LLVM, Intel ICC */ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored 
"-Wmissing-braces" +#else + /* Other */ +#endif + template T magma_make(double r, double i) { return (T) r; } template float magma_make(double r, double i); template double magma_make(double r, double i); @@ -172,3 +180,10 @@ template<> magmaDoubleComplex magma_make(double r, double i) magmaDoubleComplex tmp = {r, i}; return tmp; } + +#if defined(__GNUC__) || defined(__GNUG__) + /* GCC/G++, Clang/LLVM, Intel ICC */ + #pragma GCC diagnostic pop +#else + /* Other */ +#endif diff --git a/src/backend/opencl/math.hpp b/src/backend/opencl/math.hpp index 9292d398a0..f090062b03 100644 --- a/src/backend/opencl/math.hpp +++ b/src/backend/opencl/math.hpp @@ -17,6 +17,14 @@ #include "backend.hpp" #include "types.hpp" +#if defined(__GNUC__) || defined(__GNUG__) + /* GCC/G++, Clang/LLVM, Intel ICC */ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-function" +#else + /* Other */ +#endif + namespace opencl { @@ -123,3 +131,10 @@ namespace opencl cfloat operator *(cfloat a, cfloat b); cdouble operator *(cdouble a, cdouble b); } + +#if defined(__GNUC__) || defined(__GNUG__) + /* GCC/G++, Clang/LLVM, Intel ICC */ + #pragma GCC diagnostic pop +#else + /* Other */ +#endif diff --git a/test/approx1.cpp b/test/approx1.cpp index 7a6b66fce8..e7ea94e51e 100644 --- a/test/approx1.cpp +++ b/test/approx1.cpp @@ -23,6 +23,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/approx2.cpp b/test/approx2.cpp index f1a1accc51..75a650631b 100644 --- a/test/approx2.cpp +++ b/test/approx2.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/bilateral.cpp b/test/bilateral.cpp index f0825e4893..cde330dca4 100644 --- a/test/bilateral.cpp +++ b/test/bilateral.cpp @@ -18,6 +18,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; template diff --git 
a/test/binary.cpp b/test/binary.cpp index 477748792f..91ebcbc8b2 100644 --- a/test/binary.cpp +++ b/test/binary.cpp @@ -14,6 +14,7 @@ #include using namespace std; +using std::abs; using namespace af; const int num = 10000; diff --git a/test/cholesky_dense.cpp b/test/cholesky_dense.cpp index 70548d898c..7fd238d215 100644 --- a/test/cholesky_dense.cpp +++ b/test/cholesky_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/convolve.cpp b/test/convolve.cpp index f3ff9fd6ef..fff5ebffea 100644 --- a/test/convolve.cpp +++ b/test/convolve.cpp @@ -17,6 +17,7 @@ using std::vector; using std::string; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/diagonal.cpp b/test/diagonal.cpp index c88f0fbeb1..c4becab2dc 100644 --- a/test/diagonal.cpp +++ b/test/diagonal.cpp @@ -14,6 +14,7 @@ using namespace af; using std::vector; +using std::abs; template class Diagonal : public ::testing::Test diff --git a/test/dot.cpp b/test/dot.cpp index a25f59f27e..58cfbb2ed6 100644 --- a/test/dot.cpp +++ b/test/dot.cpp @@ -18,6 +18,7 @@ using std::vector; using std::string; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/fast.cpp b/test/fast.cpp index a114a8fdc6..e7df638b80 100644 --- a/test/fast.cpp +++ b/test/fast.cpp @@ -20,6 +20,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; typedef struct diff --git a/test/fft.cpp b/test/fft.cpp index 84f0e2382e..48ff865d2a 100644 --- a/test/fft.cpp +++ b/test/fft.cpp @@ -18,6 +18,7 @@ using std::string; using std::vector; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/fft_real.cpp b/test/fft_real.cpp index c8d9a55ff0..8cd6612712 100644 --- a/test/fft_real.cpp +++ b/test/fft_real.cpp @@ -18,6 +18,7 @@ using std::string; using std::vector; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/fftconvolve.cpp 
b/test/fftconvolve.cpp index cd82ab20d9..ec6a3f3279 100644 --- a/test/fftconvolve.cpp +++ b/test/fftconvolve.cpp @@ -17,6 +17,7 @@ using std::vector; using std::string; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/getting_started.cpp b/test/getting_started.cpp index 12d0b6b1de..9d77af2b30 100644 --- a/test/getting_started.cpp +++ b/test/getting_started.cpp @@ -15,6 +15,7 @@ using namespace af; using std::vector; +using std::abs; TEST(GettingStarted, SNIPPET_getting_started_gen) { diff --git a/test/gloh_nonfree.cpp b/test/gloh_nonfree.cpp index a65d52ad43..052351a6fb 100644 --- a/test/gloh_nonfree.cpp +++ b/test/gloh_nonfree.cpp @@ -20,6 +20,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; typedef struct diff --git a/test/harris.cpp b/test/harris.cpp index 276a3e357f..604e73d41c 100644 --- a/test/harris.cpp +++ b/test/harris.cpp @@ -20,6 +20,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; typedef struct diff --git a/test/histogram.cpp b/test/histogram.cpp index f1d7af51b9..c83ba0464f 100644 --- a/test/histogram.cpp +++ b/test/histogram.cpp @@ -18,6 +18,7 @@ using std::string; using std::vector; +using std::abs; template class Histogram : public ::testing::Test diff --git a/test/homography.cpp b/test/homography.cpp index 662b7a2a56..1bd24425be 100644 --- a/test/homography.cpp +++ b/test/homography.cpp @@ -20,6 +20,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; template diff --git a/test/inverse_dense.cpp b/test/inverse_dense.cpp index b0568ebbdb..1b990b6900 100644 --- a/test/inverse_dense.cpp +++ b/test/inverse_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/lu_dense.cpp b/test/lu_dense.cpp index cdb23ef962..0783fb3425 100644 --- a/test/lu_dense.cpp +++ b/test/lu_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using 
std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/math.cpp b/test/math.cpp index 035ca257d2..e286e2a202 100644 --- a/test/math.cpp +++ b/test/math.cpp @@ -14,6 +14,7 @@ using namespace std; using namespace af; +using std::abs; const int num = 10000; const float flt_err = 1e-3; diff --git a/test/meanshift.cpp b/test/meanshift.cpp index 0116a5e3da..34b622be1a 100644 --- a/test/meanshift.cpp +++ b/test/meanshift.cpp @@ -18,6 +18,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; template diff --git a/test/medfilt.cpp b/test/medfilt.cpp index 9b4590885b..2e3a1fcb6b 100644 --- a/test/medfilt.cpp +++ b/test/medfilt.cpp @@ -17,6 +17,7 @@ using std::string; using std::vector; +using std::abs; template class MedianFilter : public ::testing::Test diff --git a/test/morph.cpp b/test/morph.cpp index d9c5282146..c42ddf0cba 100644 --- a/test/morph.cpp +++ b/test/morph.cpp @@ -18,6 +18,7 @@ using std::string; using std::vector; +using std::abs; template class Morph : public ::testing::Test diff --git a/test/orb.cpp b/test/orb.cpp index 5259366901..b499fb3824 100644 --- a/test/orb.cpp +++ b/test/orb.cpp @@ -20,6 +20,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; typedef struct diff --git a/test/qr_dense.cpp b/test/qr_dense.cpp index 708eb5d0cd..e3809546b1 100644 --- a/test/qr_dense.cpp +++ b/test/qr_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/rank_dense.cpp b/test/rank_dense.cpp index d0a19af3b9..7f2e76db0d 100644 --- a/test/rank_dense.cpp +++ b/test/rank_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/resize.cpp b/test/resize.cpp index 6ec4e553c6..e0f1ea0810 100644 --- a/test/resize.cpp +++ b/test/resize.cpp @@ -20,6 +20,7 @@ using 
std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/rotate.cpp b/test/rotate.cpp index f97cd3ab96..0d4b460033 100644 --- a/test/rotate.cpp +++ b/test/rotate.cpp @@ -20,6 +20,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/rotate_linear.cpp b/test/rotate_linear.cpp index 29a9107e4c..ce7a921260 100644 --- a/test/rotate_linear.cpp +++ b/test/rotate_linear.cpp @@ -20,6 +20,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/sift_nonfree.cpp b/test/sift_nonfree.cpp index cf1683f775..45c9462b36 100644 --- a/test/sift_nonfree.cpp +++ b/test/sift_nonfree.cpp @@ -20,6 +20,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; typedef struct diff --git a/test/solve_dense.cpp b/test/solve_dense.cpp index bbb67409dc..09addc7c48 100644 --- a/test/solve_dense.cpp +++ b/test/solve_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/susan.cpp b/test/susan.cpp index df806c06be..591c2f01e5 100644 --- a/test/susan.cpp +++ b/test/susan.cpp @@ -20,6 +20,7 @@ using std::string; using std::vector; +using std::abs; using af::dim4; typedef struct diff --git a/test/svd_dense.cpp b/test/svd_dense.cpp index f7ef2950e0..9d4060bd7f 100644 --- a/test/svd_dense.cpp +++ b/test/svd_dense.cpp @@ -22,6 +22,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/transform.cpp b/test/transform.cpp index fa0006cbf2..1950284c2d 100644 --- a/test/transform.cpp +++ b/test/transform.cpp @@ -18,6 +18,7 @@ using std::vector; using std::string; +using std::abs; using std::cout; using std::endl; diff 
--git a/test/translate.cpp b/test/translate.cpp index 5b00c04ec8..355d30a553 100644 --- a/test/translate.cpp +++ b/test/translate.cpp @@ -20,6 +20,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/transpose.cpp b/test/transpose.cpp index 6be1ba49ab..8437a12615 100644 --- a/test/transpose.cpp +++ b/test/transpose.cpp @@ -17,6 +17,7 @@ using std::string; using std::vector; +using std::abs; using af::cfloat; using af::cdouble; diff --git a/test/triangle.cpp b/test/triangle.cpp index e0b609b9ab..6322070226 100644 --- a/test/triangle.cpp +++ b/test/triangle.cpp @@ -23,6 +23,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; using af::dim4; diff --git a/test/wrap.cpp b/test/wrap.cpp index 0cc6fab909..7552400db9 100644 --- a/test/wrap.cpp +++ b/test/wrap.cpp @@ -23,6 +23,7 @@ using std::vector; using std::string; using std::cout; using std::endl; +using std::abs; using af::cfloat; using af::cdouble; From 6988950605c2d075f49873c7db0827a9b8a9b323 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Sun, 3 Jan 2016 05:06:36 -0500 Subject: [PATCH 133/288] Build fix for CUDA backend when using boost 1.60 --- src/backend/cuda/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index ee7b86ff2c..c2c87b83af 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -57,6 +57,8 @@ FOREACH(VER 20 30 32 35 50 52 53) ENDFOREACH() IF(UNIX) + # Forcing STRICT ANSI should resolve a bunch of issues that NVIDIA seems to face with GCC compilers. 
+ ADD_DEFINITIONS(-D__STRICT_ANSI__) SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fvisibility=hidden) REMOVE_DEFINITIONS(-std=c++0x) IF(${WITH_COVERAGE}) From e5bb33442a95ca7e4d8e2d82e29246f3e74f4a7c Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 4 Jan 2016 10:17:17 -0500 Subject: [PATCH 134/288] Add missing isLAPACKAvailable implementation in CPU backend --- src/backend/cpu/lu.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/cpu/lu.cpp b/src/backend/cpu/lu.cpp index f8fc92de8d..265fdfaec5 100644 --- a/src/backend/cpu/lu.cpp +++ b/src/backend/cpu/lu.cpp @@ -109,6 +109,11 @@ Array lu_inplace(Array &in, const bool convert_pivot) AF_ERROR("Linear Algebra is disabled on CPU", AF_ERR_NOT_CONFIGURED); } +bool isLAPACKAvailable() +{ + return false; +} + } #endif From 775747e383a3bd91204b785cc4c75daa62031520 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 4 Jan 2016 10:19:35 -0500 Subject: [PATCH 135/288] Set revision to "default" when git is not available This can happen when compiling releases which are downloaded without git files --- CMakeModules/Version.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeModules/Version.cmake b/CMakeModules/Version.cmake index 4e0ddb5a61..8d5b575399 100644 --- a/CMakeModules/Version.cmake +++ b/CMakeModules/Version.cmake @@ -32,6 +32,11 @@ EXECUTE_PROCESS( OUTPUT_STRIP_TRAILING_WHITESPACE ) +IF(NOT GIT_COMMIT_HASH) + MESSAGE(STATUS "No git. 
Setting hash to default") + SET(GIT_COMMIT_HASH "default") +ENDIF() + CONFIGURE_FILE( ${CMAKE_MODULE_PATH}/version.h.in ${CMAKE_SOURCE_DIR}/include/af/version.h From 84dccc841b64ee1f23d69ca30849bbdba2f9fe68 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 4 Jan 2016 11:30:13 -0500 Subject: [PATCH 136/288] Documentation fixes --- docs/pages/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/pages/README.md b/docs/pages/README.md index 302690242e..8a395a70af 100644 --- a/docs/pages/README.md +++ b/docs/pages/README.md @@ -76,7 +76,7 @@ Each ArrayFire installation comes with: ArrayFire supports batched operations on N-dimensional arrays. Batch operations in ArrayFire are run in parallel ensuring an optimal usage of your CUDA or OpenCL device. -You can get the best performance out of ArrayFire using [vectorization techniques](). +You can get the best performance out of ArrayFire using [vectorization techniques](\ref vectorization). ArrayFire can also execute loop iterations in parallel with [the gfor function](\ref gfor). @@ -92,8 +92,8 @@ Read more about how [ArrayFire JIT](http://arrayfire.com/performance-of-arrayfir ## Simple Example -Here's a live example to let you see ArrayFire code. You create [arrays](\ref -construct_mat) which reside on CUDA or OpenCL devices. Then you can use +Here's a live example to let you see ArrayFire code. You create [arrays](\ref construct_mat) +which reside on CUDA or OpenCL devices. Then you can use [ArrayFire functions](modules.htm) on those [arrays](\ref construct_mat). 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp} From 3c919354a7a66f6db03552cf5b44cede9015de77 Mon Sep 17 00:00:00 2001 From: pradeep Date: Mon, 4 Jan 2016 14:31:42 -0500 Subject: [PATCH 137/288] Replaced ssh based url with http url for threads submodule --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 1d89315347..c91b7f1585 100644 --- a/.gitmodules +++ b/.gitmodules @@ -9,4 +9,4 @@ url = https://chromium.googlesource.com/external/googletest [submodule "src/backend/cpu/threads"] path = src/backend/cpu/threads - url = git@github.com:alltheflops/threads.git + url = https://github.com/alltheflops/threads.git From d9e5288006a1cafdb1e0a26ba3b063e52f65a554 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Wed, 6 Jan 2016 13:05:39 -0500 Subject: [PATCH 138/288] Fix NONFREE Build CMake Options * NONFREE becomes the parent flag. If off, then child flags are unset * When NONFREE is on, each child flag can be set to on or off * Changed child (SIFT) flag to be BUILD_NONFREE_SIFT * Changed child (SIFT) define to be AF_BUILD_NONFREE_SIFT * Made the changes in test as well --- CMakeLists.txt | 24 +++++++++++------------- src/api/c/sift.cpp | 4 ++-- src/backend/cpu/sift.cpp | 4 ++-- src/backend/cuda/sift.cu | 4 ++-- src/backend/opencl/sift.cpp | 4 ++-- test/CMakeLists.txt | 22 ++++++++++++++++++++-- test/gloh_nonfree.cpp | 4 ++-- test/sift_nonfree.cpp | 4 ++-- 8 files changed, 43 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c79fbcaab0..61a78a635f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,9 +31,6 @@ OPTION(BUILD_DOCS "Create ArrayFire Documentation" OFF) OPTION(WITH_COVERAGE "Added code coverage flags" OFF) OPTION(BUILD_NONFREE "Build ArrayFire nonfree algorithms" OFF) -OPTION(BUILD_SIFT "Build ArrayFire nonfree algorithms" OFF) - -MARK_AS_ADVANCED(BUILD_SIFT) OPTION(BUILD_UNIFIED "Build Backend-Independent ArrayFire API" ON) @@ -91,17 +88,18 @@ 
IF(BUILD_GRAPHICS) ENDIF(BUILD_GRAPHICS) -IF(BUILD_NONFREE) - MESSAGE(WARNING "Building With NONFREE ON requires the following patents") - SET(BUILD_SIFT ON) -ENDIF(BUILD_NONFREE) +IF(${BUILD_NONFREE}) + MESSAGE(WARNING "Building With NONFREE ON requires the following patents") + SET(BUILD_NONFREE_SIFT ON CACHE BOOL "Build ArrayFire with SIFT") + MARK_AS_ADVANCED(BUILD_NONFREE_SIFT) +ELSE(${BUILD_NONFREE}) + UNSET(BUILD_NONFREE_SIFT CACHE) # BUILD_NONFREE_SIFT cannot be built without BUILD_NONFREE +ENDIF(${BUILD_NONFREE}) -IF(BUILD_SIFT) - ADD_DEFINITIONS(-DAF_BUILD_SIFT) +IF(${BUILD_NONFREE_SIFT}) + ADD_DEFINITIONS(-DAF_BUILD_NONFREE_SIFT) - IF (NOT BUILD_NONFREE) - MESSAGE(WARNING "Building with SIFT requires the following patents") - ENDIF() + MESSAGE(WARNING "Building with SIFT requires the following patents") MESSAGE("Method and apparatus for identifying scale invariant features" "in an image and use of same for locating an object in an image,\" David" @@ -110,7 +108,7 @@ IF(BUILD_SIFT) "further details, contact David Lowe (lowe@cs.ubc.ca) or the" "University-Industry Liaison Office of the University of British" "Columbia.") -ENDIF(BUILD_SIFT) +ENDIF(${BUILD_NONFREE_SIFT}) INCLUDE_DIRECTORIES( "${CMAKE_CURRENT_SOURCE_DIR}/include" diff --git a/src/api/c/sift.cpp b/src/api/c/sift.cpp index c7a38582aa..a14badc88d 100644 --- a/src/api/c/sift.cpp +++ b/src/api/c/sift.cpp @@ -54,7 +54,7 @@ af_err af_sift(af_features* feat, af_array* desc, const af_array in, const unsig const bool double_input, const float img_scale, const float feature_ratio) { try { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT ArrayInfo info = getInfo(in); af::dim4 dims = info.dims(); @@ -95,7 +95,7 @@ af_err af_gloh(af_features* feat, af_array* desc, const af_array in, const unsig const bool double_input, const float img_scale, const float feature_ratio) { try { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT ArrayInfo info = getInfo(in); af::dim4 dims = info.dims(); diff --git 
a/src/backend/cpu/sift.cpp b/src/backend/cpu/sift.cpp index 4b20f8ab49..0345e37485 100644 --- a/src/backend/cpu/sift.cpp +++ b/src/backend/cpu/sift.cpp @@ -21,7 +21,7 @@ #include #include -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT #include #endif @@ -39,7 +39,7 @@ unsigned sift(Array& x, Array& y, Array& score, const float img_scale, const float feature_ratio, const bool compute_GLOH) { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT return sift_impl(x, y, score, ori, size, desc, in, n_layers, contrast_thr, edge_thr, init_sigma, double_input, img_scale, feature_ratio, compute_GLOH); diff --git a/src/backend/cuda/sift.cu b/src/backend/cuda/sift.cu index f3d36d7dfb..ad668af924 100644 --- a/src/backend/cuda/sift.cu +++ b/src/backend/cuda/sift.cu @@ -15,7 +15,7 @@ #include #include -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT #include #endif @@ -34,7 +34,7 @@ unsigned sift(Array& x, Array& y, Array& score, const float img_scale, const float feature_ratio, const bool compute_GLOH) { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT const dim4 dims = in.dims(); unsigned nfeat_out; diff --git a/src/backend/opencl/sift.cpp b/src/backend/opencl/sift.cpp index 5bd940d127..632647ca19 100644 --- a/src/backend/opencl/sift.cpp +++ b/src/backend/opencl/sift.cpp @@ -15,7 +15,7 @@ #include #include -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT #include #endif @@ -34,7 +34,7 @@ unsigned sift(Array& x_out, Array& y_out, Array& score_out, const float img_scale, const float feature_ratio, const bool compute_GLOH) { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT unsigned nfeat_out; unsigned desc_len; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3b7b42c87e..1bcdde95af 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,10 +18,28 @@ ELSE() FIND_PACKAGE(ArrayFire REQUIRED) INCLUDE_DIRECTORIES(${ArrayFire_INCLUDE_DIRS}) OPTION(BUILD_NONFREE "Build Tests for nonfree algorithms" OFF) - IF(${BUILD_NONFREE}) # Add definition. 
Not required when building with AF - ADD_DEFINITIONS(-DAF_BUILD_SIFT) + + IF(${BUILD_NONFREE}) + MESSAGE(WARNING "Building With NONFREE ON requires the following patents") + SET(BUILD_NONFREE_SIFT ON CACHE BOOL "Build ArrayFire with SIFT") + ELSE(${BUILD_NONFREE}) + UNSET(BUILD_NONFREE_SIFT CACHE) # BUILD_NONFREE_SIFT cannot be built without BUILD_NONFREE ENDIF(${BUILD_NONFREE}) + IF(${BUILD_NONFREE_SIFT}) + ADD_DEFINITIONS(-DAF_BUILD_NONFREE_SIFT) + + MESSAGE(WARNING "Building with SIFT requires the following patents") + + MESSAGE("Method and apparatus for identifying scale invariant features" + "in an image and use of same for locating an object in an image,\" David" + "G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application" + "filed March 8, 1999. Asignee: The University of British Columbia. For" + "further details, contact David Lowe (lowe@cs.ubc.ca) or the" + "University-Industry Liaison Office of the University of British" + "Columbia.") + ENDIF(${BUILD_NONFREE_SIFT}) + # ENABLE_TESTING is required when building only tests # When building from source, enable_testing is picked from from the main # CMakeLists.txt diff --git a/test/gloh_nonfree.cpp b/test/gloh_nonfree.cpp index 052351a6fb..f50e4031aa 100644 --- a/test/gloh_nonfree.cpp +++ b/test/gloh_nonfree.cpp @@ -158,7 +158,7 @@ TYPED_TEST_CASE(GLOH, TestTypes); template void glohTest(string pTestFile) { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT if (noDoubleTests()) return; if (noImageIOTests()) return; @@ -270,7 +270,7 @@ void glohTest(string pTestFile) // TEST(GLOH, CPP) { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT if (noDoubleTests()) return; if (noImageIOTests()) return; diff --git a/test/sift_nonfree.cpp b/test/sift_nonfree.cpp index 45c9462b36..2e069fd3d3 100644 --- a/test/sift_nonfree.cpp +++ b/test/sift_nonfree.cpp @@ -158,7 +158,7 @@ TYPED_TEST_CASE(SIFT, TestTypes); template void siftTest(string pTestFile, unsigned nLayers, float contrastThr, float edgeThr, float 
initSigma, bool doubleInput) { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT if (noDoubleTests()) return; if (noImageIOTests()) return; @@ -276,7 +276,7 @@ void siftTest(string pTestFile, unsigned nLayers, float contrastThr, float edgeT // TEST(SIFT, CPP) { -#ifdef AF_BUILD_SIFT +#ifdef AF_BUILD_NONFREE_SIFT if (noDoubleTests()) return; if (noImageIOTests()) return; From 5be5511fc22ceb6b38a2d19fa7ad158d2aeede4d Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Wed, 6 Jan 2016 14:52:16 -0500 Subject: [PATCH 139/288] Handle compute_53 (tegra x1) for cuda lapack --- src/backend/cuda/CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index fc9a75cb12..7bcc133407 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -70,9 +70,9 @@ ENDIF() ADD_DEFINITIONS(-DAF_CUDA) -IF(${CUDA_VERSION_MAJOR} LESS 7) +IF(${CUDA_VERSION_MAJOR} LESS 7 OR ${CUDA_COMPUTE_53}) # Use CPU Lapack as fallback? - OPTION(CUDA_LAPACK_CPU_FALLBACK "Use CPU LAPACK as fallback for CUDA LAPACK when CUDA is 6.5 or older" OFF) + OPTION(CUDA_LAPACK_CPU_FALLBACK "Use CPU LAPACK as fallback for CUDA LAPACK when cusolver is not available" OFF) MARK_AS_ADVANCED(CUDA_LAPACK_CPU_FALLBACK) IF(${CUDA_LAPACK_CPU_FALLBACK}) @@ -84,9 +84,9 @@ IF(${CUDA_VERSION_MAJOR} LESS 7) ENDIF(APPLE) IF(NOT LAPACK_FOUND) - MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. Linear Algebra will not be available.") + MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cusolver library. Linear Algebra will not be available.") ELSE(NOT LAPACK_FOUND) - MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. But CPU LAPACK libraries are available. Will fallback to using host side code.") + MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cusolver library. 
But CPU LAPACK libraries are available. Will fallback to using host side code.") ADD_DEFINITIONS(-DWITH_CPU_LINEAR_ALGEBRA) IF(USE_CUDA_MKL) MESSAGE("Using MKL") @@ -94,13 +94,13 @@ IF(${CUDA_VERSION_MAJOR} LESS 7) ENDIF() ENDIF() ELSE() - MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cuSolve library. Linear Algebra will not be available.") + MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cusolver library. Linear Algebra will not be available.") ENDIF() IF(CMAKE_VERSION VERSION_LESS 3.2) SET(CUDA_cusolver_LIBRARY) MARK_AS_ADVANCED(CUDA_cusolver_LIBRARY) ENDIF(CMAKE_VERSION VERSION_LESS 3.2) -ELSE(${CUDA_VERSION_MAJOR} LESS 7) +ELSE(${CUDA_VERSION_MAJOR} LESS 7 OR ${CUDA_COMPUTE_53}) MESSAGE(STATUS "CUDA cusolver library available in CUDA Version ${CUDA_VERSION_STRING}") ADD_DEFINITIONS(-DWITH_CUDA_LINEAR_ALGEBRA) IF(CMAKE_VERSION VERSION_LESS 3.2) @@ -113,7 +113,7 @@ ELSE(${CUDA_VERSION_MAJOR} LESS 7) NO_DEFAULT_PATH ) ENDIF(CMAKE_VERSION VERSION_LESS 3.2) -ENDIF(${CUDA_VERSION_MAJOR} LESS 7) +ENDIF(${CUDA_VERSION_MAJOR} LESS 7 OR ${CUDA_COMPUTE_53}) INCLUDE_DIRECTORIES( ${CMAKE_INCLUDE_PATH} From cc00f35930e44484553c00ec679f49e3d5595db4 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Wed, 6 Jan 2016 15:28:36 -0500 Subject: [PATCH 140/288] Add definition for each compute type --- src/backend/cuda/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index 7bcc133407..fee9c78dad 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -44,6 +44,7 @@ MACRO(SET_COMPUTE VERSION) SET(CUDA_GENERATE_CODE_${VERSION} "-gencode arch=compute_${VERSION},code=sm_${VERSION}") SET(CUDA_GENERATE_CODE ${CUDA_GENERATE_CODE} ${CUDA_GENERATE_CODE_${VERSION}}) LIST(APPEND COMPUTE_VERSIONS "${VERSION}") + ADD_DEFINITIONS(-DCUDA_COMPUTE_${VERSION}) MESSAGE(STATUS "Setting Compute ${VERSION} to ON") ENDMACRO(SET_COMPUTE) From 
a1823b3efd981f008bc1884dcd22935f4467cf9c Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 7 Jan 2016 14:44:09 -0500 Subject: [PATCH 141/288] Added helper functions for device type and unified mem in OpenCL --- src/backend/opencl/platform.cpp | 24 ++++++++++++++++++++++++ src/backend/opencl/platform.hpp | 6 ++++++ 2 files changed, 30 insertions(+) diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 0cd46d25f6..005f2c1189 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -256,6 +256,10 @@ std::string getInfo() << (device.getInfo()>0 ? "True" : "False") << ")"; #endif + // TODO Move this inside debug + info << "Unified Memory(" + << (isHostUnifiedMemory(device) ? "True" : "False") + << ")"; info << std::endl; nDevices++; @@ -311,6 +315,26 @@ const cl::Device& getDevice() return *(devMngr.mDevices[devMngr.mActiveQId]); } +cl_device_type getDeviceType() +{ + cl::Device device = getDevice(); + cl_device_type type = device.getInfo(); + return type; +} + +bool isHostUnifiedMemory(const cl::Device &device) +{ + return device.getInfo(); +} + +bool OpenCLCPUOffload() +{ + static const bool sync = getEnvVar("AF_OPENCL_CPU_OFFLOAD") == "1"; + bool offload = false; + if(sync) offload = isHostUnifiedMemory(getDevice()); + return offload; +} + bool isGLSharingSupported() { DeviceManager& devMngr = DeviceManager::getInstance(); diff --git a/src/backend/opencl/platform.hpp b/src/backend/opencl/platform.hpp index 84cb7b854c..85c533fa84 100644 --- a/src/backend/opencl/platform.hpp +++ b/src/backend/opencl/platform.hpp @@ -97,6 +97,12 @@ cl::CommandQueue& getQueue(); const cl::Device& getDevice(); +cl_device_type getDeviceType(); + +bool isHostUnifiedMemory(const cl::Device &device); + +bool OpenCLCPUOffload(); + bool isGLSharingSupported(); bool isDoubleSupported(int device); From 4275f5f2dda6a089fcbae77418204a5c1c76a2f3 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 7 Jan 2016 15:08:51 -0500 
Subject: [PATCH 142/288] Added getMappedPtr and unmapPtr functions in opencl memory --- src/backend/opencl/memory.cpp | 26 ++++++++++++++++++++++++++ src/backend/opencl/memory.hpp | 3 +++ 2 files changed, 29 insertions(+) diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index 141610d71f..924e370a64 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -281,6 +281,29 @@ namespace opencl return bufferPush((cl::Buffer *)ptr); } + template + T *getMappedPtr(const cl::Buffer *buf) + { + int n = getActiveDeviceId(); + mem_iter iter = memory_maps[n].find(const_cast(buf)); + + if (iter == memory_maps[n].end()) { + // Buffer not found in memory manager + // Very Very Bad + return NULL; + } + size_t alloc_bytes = iter->second.bytes; + + T *ptr = (T*)getQueue().enqueueMapBuffer( + *buf, true, CL_MAP_READ, 0, alloc_bytes); + return ptr; + } + + void unmapPtr(const cl::Buffer *buf, void *ptr) + { + getQueue().enqueueUnmapMemObject(*buf, ptr); + } + // pinned memory manager typedef struct { cl::Buffer *buf; @@ -403,6 +426,7 @@ namespace opencl template void memPush(const T* ptr); \ template T* pinnedAlloc(const size_t &elements); \ template void pinnedFree(T* ptr); \ + template T* getMappedPtr(const cl::Buffer *buf); \ INSTANTIATE(float) INSTANTIATE(cfloat) @@ -416,4 +440,6 @@ namespace opencl INSTANTIATE(uintl) INSTANTIATE(short) INSTANTIATE(ushort) + + template void* getMappedPtr(const cl::Buffer *buf); } diff --git a/src/backend/opencl/memory.hpp b/src/backend/opencl/memory.hpp index 96292cdfac..f337a7a1bd 100644 --- a/src/backend/opencl/memory.hpp +++ b/src/backend/opencl/memory.hpp @@ -32,6 +32,9 @@ namespace opencl template void memPop(const T *ptr); template void memPush(const T *ptr); + template T *getMappedPtr(const cl::Buffer *buf); + void unmapPtr(const cl::Buffer *buf, void *ptr); + template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); From 3c1ab9f0902a37bd7b3c31bc533790d950407b52 Mon 
Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 7 Jan 2016 16:10:43 -0500 Subject: [PATCH 143/288] Added matmul offloading to CPU --- src/backend/cpu/blas.hpp | 6 + src/backend/opencl/CMakeLists.txt | 12 ++ src/backend/opencl/blas.cpp | 6 + src/backend/opencl/cpu/cpu_blas.cpp | 268 ++++++++++++++++++++++++++ src/backend/opencl/cpu/cpu_blas.hpp | 23 +++ src/backend/opencl/cpu/cpu_helper.hpp | 46 +++++ test/blas.cpp | 1 + 7 files changed, 362 insertions(+) create mode 100644 src/backend/opencl/cpu/cpu_blas.cpp create mode 100644 src/backend/opencl/cpu/cpu_blas.hpp create mode 100644 src/backend/opencl/cpu/cpu_helper.hpp diff --git a/src/backend/cpu/blas.hpp b/src/backend/cpu/blas.hpp index 117d3a2145..934a2c6ec7 100644 --- a/src/backend/cpu/blas.hpp +++ b/src/backend/cpu/blas.hpp @@ -45,4 +45,10 @@ template Array dot(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs); +typedef std::complex cfloat; +typedef std::complex cdouble; + +template struct is_complex { static const bool value = false; }; +template<> struct is_complex { static const bool value = true; }; +template<> struct is_complex { static const bool value = true; }; } diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt index 86ba1b2aad..c9c47d0198 100644 --- a/src/backend/opencl/CMakeLists.txt +++ b/src/backend/opencl/CMakeLists.txt @@ -123,6 +123,12 @@ FILE(GLOB conv_ker_headers FILE(GLOB conv_ker_sources "kernel/convolve/*.cpp") +FILE(GLOB cpu_headers + "cpu/*.hpp") + +FILE(GLOB cpu_sources + "cpu/*.cpp") + source_group(backend\\opencl\\Headers FILES ${opencl_headers}) source_group(backend\\opencl\\Sources FILES ${opencl_sources}) source_group(backend\\opencl\\JIT FILES ${jit_sources}) @@ -131,6 +137,8 @@ source_group(backend\\opencl\\kernel\\cl FILES ${opencl_kernels}) source_group(backend\\opencl\\kernel\\Sources FILES ${kernel_sources}) source_group(backend\\opencl\\kernel\\convolve\\Headers FILES ${conv_ker_headers}) 
source_group(backend\\opencl\\kernel\\convolve\\Sources FILES ${conv_ker_sources}) +source_group(backend\\opencl\\cpu\\Headers FILES ${cpu_headers}) +source_group(backend\\opencl\\cpu\\Sources FILES ${cpu_sources}) IF(LAPACK_FOUND) FILE(GLOB magma_sources @@ -206,6 +214,8 @@ IF(DEFINED BLAS_SYM_FILE) ${kernel_sources} ${conv_ker_headers} ${conv_ker_sources} + ${cpu_headers} + ${cpu_sources} ${backend_headers} ${backend_sources} ${magma_sources} @@ -244,6 +254,8 @@ ELSE(DEFINED BLAS_SYM_FILE) ${kernel_sources} ${conv_ker_headers} ${conv_ker_sources} + ${cpu_sources} + ${cpu_sources} ${backend_headers} ${backend_sources} ${c_headers} diff --git a/src/backend/opencl/blas.cpp b/src/backend/opencl/blas.cpp index 6173a684ea..f9f8af1253 100644 --- a/src/backend/opencl/blas.cpp +++ b/src/backend/opencl/blas.cpp @@ -20,6 +20,8 @@ #include #include +#include + namespace opencl { @@ -113,6 +115,10 @@ template Array matmul(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { + if(OpenCLCPUOffload()) { + return cpu::matmul(lhs, rhs, optLhs, optRhs); + } + initBlas(); clblasTranspose lOpts = toClblasTranspose(optLhs); clblasTranspose rOpts = toClblasTranspose(optRhs); diff --git a/src/backend/opencl/cpu/cpu_blas.cpp b/src/backend/opencl/cpu/cpu_blas.cpp new file mode 100644 index 0000000000..524777a6c6 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_blas.cpp @@ -0,0 +1,268 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +namespace opencl +{ +namespace cpu +{ + +using std::add_const; +using std::add_pointer; +using std::enable_if; +using std::is_floating_point; +using std::remove_const; +using std::conditional; + +// Some implementations of BLAS require void* for complex pointers while others use float*/double* +// +// Sample cgemm API +// OpenBLAS +// void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, +// OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, +// OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, +// OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, +// float *C, OPENBLAS_CONST blasint ldc); +// +// MKL +// void cblas_cgemm(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, +// const MKL_INT M, const MKL_INT N, const MKL_INT K, +// const void *alpha, const void *A, const MKL_INT lda, +// const void *B, const MKL_INT ldb, const void *beta, +// void *C, const MKL_INT ldc); +// atlas cblas +// void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, +// const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, +// const void *alpha, const void *A, const int lda, +// const void *B, const int ldb, const void *beta, +// void *C, const int ldc); +// +// LAPACKE +// void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, +// const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, +// const void *alpha, const void *A, const int lda, +// const void *B, const int ldb, const void *beta, +// void *C, const int ldc); +#if defined(IS_OPENBLAS) + static 
const bool cplx_void_ptr = false; +#else + static const bool cplx_void_ptr = true; +#endif + +template +struct blas_base { + using type = typename dtype_traits::base_type; +}; + +template +struct blas_base ::value && cplx_void_ptr>::type> { + using type = void; +}; + + +template +using cptr_type = typename conditional< is_complex::value, + const typename blas_base::type *, + const T*>::type; +template +using ptr_type = typename conditional< is_complex::value, + typename blas_base::type *, + T*>::type; +template +using scale_type = typename conditional< is_complex::value, + const typename blas_base::type *, + const T>::type; + +template +using gemm_func_def = void (*)( const CBLAS_ORDER, const CBLAS_TRANSPOSE, const CBLAS_TRANSPOSE, + const blasint, const blasint, const blasint, + scale_type, cptr_type, const blasint, + cptr_type, const blasint, + scale_type, ptr_type, const blasint); + +template +using gemv_func_def = void (*)( const CBLAS_ORDER, const CBLAS_TRANSPOSE, + const blasint, const blasint, + scale_type, cptr_type, const blasint, + cptr_type, const blasint, + scale_type, ptr_type, const blasint); + +#define BLAS_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + +#define BLAS_FUNC( FUNC, TYPE, PREFIX ) \ + template<> FUNC##_func_def FUNC##_func() \ +{ return &cblas_##PREFIX##FUNC; } + +BLAS_FUNC_DEF( gemm ) +BLAS_FUNC(gemm , float , s) +BLAS_FUNC(gemm , double , d) +BLAS_FUNC(gemm , cfloat , c) +BLAS_FUNC(gemm , cdouble , z) + +BLAS_FUNC_DEF(gemv) +BLAS_FUNC(gemv , float , s) +BLAS_FUNC(gemv , double , d) +BLAS_FUNC(gemv , cfloat , c) +BLAS_FUNC(gemv , cdouble , z) + +template +typename enable_if::value, scale_type>::type +getScale() { return T(value); } + +template +typename enable_if::value, scale_type>::type +getScale() +{ + static T val = scalar(value); + return (const typename blas_base::type *)&val; +} + +CBLAS_TRANSPOSE +toCblasTranspose(af_mat_prop opt) +{ + CBLAS_TRANSPOSE out = CblasNoTrans; + switch(opt) { + case AF_MAT_NONE : out = 
CblasNoTrans; break; + case AF_MAT_TRANS : out = CblasTrans; break; + case AF_MAT_CTRANS : out = CblasConjTrans; break; + default : AF_ERROR("INVALID af_mat_prop", AF_ERR_ARG); + } + return out; +} + +template +Array matmul(const Array &lhs, const Array &rhs, + af_mat_prop optLhs, af_mat_prop optRhs) +{ + CBLAS_TRANSPOSE lOpts = toCblasTranspose(optLhs); + CBLAS_TRANSPOSE rOpts = toCblasTranspose(optRhs); + + int aRowDim = (lOpts == CblasNoTrans) ? 0 : 1; + int aColDim = (lOpts == CblasNoTrans) ? 1 : 0; + int bColDim = (rOpts == CblasNoTrans) ? 1 : 0; + + dim4 lDims = lhs.dims(); + dim4 rDims = rhs.dims(); + int M = lDims[aRowDim]; + int N = rDims[bColDim]; + int K = lDims[aColDim]; + + //FIXME: Leaks on errors. + Array out = createValueArray(af::dim4(M, N, 1, 1), scalar(0)); + auto alpha = getScale(); + auto beta = getScale(); + + dim4 lStrides = lhs.strides(); + dim4 rStrides = rhs.strides(); + using BT = typename blas_base::type; + using CBT = const typename blas_base::type; + + // get host pointers from mapped memory + BT *lPtr = getMappedPtr(lhs.get()); + BT *rPtr = getMappedPtr(rhs.get()); + BT *oPtr = getMappedPtr(out.get()); + + if(rDims[bColDim] == 1) { + N = lDims[aColDim]; + gemv_func()( + CblasColMajor, lOpts, + lDims[0], lDims[1], + alpha, + lPtr, lStrides[1], + rPtr, rStrides[0], + beta, + oPtr, 1); + } else { + gemm_func()( + CblasColMajor, lOpts, rOpts, + M, N, K, + alpha, + lPtr, lStrides[1], + rPtr, rStrides[1], + beta, + oPtr, out.dims()[0]); + } + + unmapPtr(lhs.get(), lPtr); + unmapPtr(rhs.get(), rPtr); + unmapPtr(out.get(), oPtr); + + return out; +} + +//template T +//conj(T x) { return x; } +// +//template<> cfloat conj (cfloat c) { return std::conj(c); } +//template<> cdouble conj(cdouble c) { return std::conj(c); } +// +//template +//Array dot_(const Array &lhs, const Array &rhs, +// af_mat_prop optLhs, af_mat_prop optRhs) +//{ +// int N = lhs.dims()[0]; +// +// T out = 0; +// const T *pL = lhs.get(); +// const T *pR = rhs.get(); +// +// 
for(int i = 0; i < N; i++) +// out += (conjugate ? cpu::conj(pL[i]) : pL[i]) * pR[i]; +// +// if(both_conjugate) out = cpu::conj(out); +// +// return createValueArray(af::dim4(1), out); +//} +// +//template +//Array dot(const Array &lhs, const Array &rhs, +// af_mat_prop optLhs, af_mat_prop optRhs) +//{ +// if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) { +// return dot_(lhs, rhs, optLhs, optRhs); +// } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) { +// return dot_(lhs, rhs, optLhs, optRhs); +// } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) { +// return dot_(rhs, lhs, optRhs, optLhs); +// } else { +// return dot_(lhs, rhs, optLhs, optRhs); +// } +//} + +#undef BT +#undef REINTEPRET_CAST + +#define INSTANTIATE_BLAS(TYPE) \ + template Array matmul(const Array &lhs, const Array &rhs, \ + af_mat_prop optLhs, af_mat_prop optRhs); + +INSTANTIATE_BLAS(float) +INSTANTIATE_BLAS(cfloat) +INSTANTIATE_BLAS(double) +INSTANTIATE_BLAS(cdouble) + +//#define INSTANTIATE_DOT(TYPE) \ +// template Array dot(const Array &lhs, const Array &rhs, \ +// af_mat_prop optLhs, af_mat_prop optRhs); +// +//INSTANTIATE_DOT(float) +//INSTANTIATE_DOT(double) +//INSTANTIATE_DOT(cfloat) +//INSTANTIATE_DOT(cdouble) + +} +} diff --git a/src/backend/opencl/cpu/cpu_blas.hpp b/src/backend/opencl/cpu/cpu_blas.hpp new file mode 100644 index 0000000000..303b60ced8 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_blas.hpp @@ -0,0 +1,23 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + Array matmul(const Array &lhs, const Array &rhs, + af_mat_prop optLhs, af_mat_prop optRhs); +// template +// Array dot(const Array &lhs, const Array &rhs, +// af_mat_prop optLhs, af_mat_prop optRhs); +} +} diff --git a/src/backend/opencl/cpu/cpu_helper.hpp b/src/backend/opencl/cpu/cpu_helper.hpp new file mode 100644 index 0000000000..afc60d3b9f --- /dev/null +++ b/src/backend/opencl/cpu/cpu_helper.hpp @@ -0,0 +1,46 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include + +#ifdef __APPLE__ +#include +#else +#ifdef USE_MKL +#include +#else +extern "C" { +#include +} +#endif +#endif + +// TODO: Ask upstream for a more official way to detect it +#ifdef OPENBLAS_CONST +#define IS_OPENBLAS +#endif + +// Make sure we get the correct type signature for OpenBLAS +// OpenBLAS defines blasint as it's index type. 
Emulate this +// if we're not dealing with openblas and use it where applicable +#ifndef IS_OPENBLAS +typedef int blasint; +#endif + +namespace opencl +{ +namespace cpu +{ +} +} + diff --git a/test/blas.cpp b/test/blas.cpp index 507cc6dc7b..b5d92f1073 100644 --- a/test/blas.cpp +++ b/test/blas.cpp @@ -36,6 +36,7 @@ template void MatMulCheck(string TestFile) { if (noDoubleTests()) return; + af::info(); using std::vector; vector numDims; From f9819f78c191aea10331e42ec361836f2a1e3c57 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 7 Jan 2016 16:53:55 -0500 Subject: [PATCH 144/288] Fix blas header types in cpu --- src/backend/cpu/blas.hpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/backend/cpu/blas.hpp b/src/backend/cpu/blas.hpp index 934a2c6ec7..05484338cd 100644 --- a/src/backend/cpu/blas.hpp +++ b/src/backend/cpu/blas.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef __APPLE__ #include @@ -45,10 +46,4 @@ template Array dot(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs); -typedef std::complex cfloat; -typedef std::complex cdouble; - -template struct is_complex { static const bool value = false; }; -template<> struct is_complex { static const bool value = true; }; -template<> struct is_complex { static const bool value = true; }; } From d5077ecfdf8d04077ed356d793f84faacab1d929 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 7 Jan 2016 18:14:14 -0500 Subject: [PATCH 145/288] Fix bug in OpenCL JIT when calling functions that return same value * Such as calling conj on float --- src/backend/opencl/binary.hpp | 2 +- src/backend/opencl/kernel/jit.cl | 1 + src/backend/opencl/unary.hpp | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend/opencl/binary.hpp b/src/backend/opencl/binary.hpp index 4f58cb49e6..11493a5966 100644 --- a/src/backend/opencl/binary.hpp +++ b/src/backend/opencl/binary.hpp @@ -22,7 +22,7 @@ namespace opencl { const char *name() { 
- return "noop"; + return "__invalid"; } }; diff --git a/src/backend/opencl/kernel/jit.cl b/src/backend/opencl/kernel/jit.cl index b34bbcddd8..3092449418 100644 --- a/src/backend/opencl/kernel/jit.cl +++ b/src/backend/opencl/kernel/jit.cl @@ -8,6 +8,7 @@ ********************************************************/ #define sign(in) signbit((in)) +#define __noop(a) (a) #define __add(lhs, rhs) (lhs) + (rhs) #define __sub(lhs, rhs) (lhs) - (rhs) #define __mul(lhs, rhs) (lhs) * (rhs) diff --git a/src/backend/opencl/unary.hpp b/src/backend/opencl/unary.hpp index 5a2cc9e33f..1e363d7dcb 100644 --- a/src/backend/opencl/unary.hpp +++ b/src/backend/opencl/unary.hpp @@ -16,7 +16,7 @@ namespace opencl { template -static const char *unaryName() { return "noop"; } +static const char *unaryName() { return "__noop"; } #define UNARY_DECL(OP, FNAME) \ template<> STATIC_ \ From ac25f5bb19c0f0db90d47576f29bfa81f6e060d6 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 7 Jan 2016 18:15:41 -0500 Subject: [PATCH 146/288] Fix bug in CUDA JIT when calling functions that return same value * Such as calling conj on float --- src/backend/cuda/JIT/numeric.cu | 13 +++++++++++++ src/backend/cuda/complex.hpp | 22 +++++++++++----------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/backend/cuda/JIT/numeric.cu b/src/backend/cuda/JIT/numeric.cu index 8253db6d22..2bcb15a112 100644 --- a/src/backend/cuda/JIT/numeric.cu +++ b/src/backend/cuda/JIT/numeric.cu @@ -119,6 +119,19 @@ MATH_CAST(lgamma, intl , float) MATH_CAST(lgamma, ushort, float) MATH_CAST(lgamma, short , float) +MATH_NOOP(noop, float) +MATH_NOOP(noop, double) +MATH_NOOP(noop, cfloat) +MATH_NOOP(noop, cdouble) +MATH_NOOP(noop, int) +MATH_NOOP(noop, uint) +MATH_NOOP(noop, char) +MATH_NOOP(noop, uchar) +MATH_NOOP(noop, uintl) +MATH_NOOP(noop, intl) +MATH_NOOP(noop, ushort) +MATH_NOOP(noop, short) + __device__ float ___abs(cfloat a) { return cuCabsf(a); } __device__ double ___abs(cdouble a) { return cuCabs(a); } diff 
--git a/src/backend/cuda/complex.hpp b/src/backend/cuda/complex.hpp index 82304b9a22..b7de74a7de 100644 --- a/src/backend/cuda/complex.hpp +++ b/src/backend/cuda/complex.hpp @@ -17,25 +17,25 @@ namespace cuda { - template static const std::string cplx_name() { return "@___noop"; } - template<> STATIC_ const std::string cplx_name() { return cuMangledName("___cplx"); } - template<> STATIC_ const std::string cplx_name() { return cuMangledName("___cplx"); } + template static const std::string cplx_name() { return cuMangledName("___noop"); } + template<> STATIC_ const std::string cplx_name() { return cuMangledName("___cplx"); } + template<> STATIC_ const std::string cplx_name() { return cuMangledName("___cplx"); } - template static const std::string real_name() { return "@___noop"; } + template static const std::string real_name() { return cuMangledName("___noop"); } template<> STATIC_ const std::string real_name() { return cuMangledName("___real"); } template<> STATIC_ const std::string real_name() { return cuMangledName("___real"); } - template static const std::string imag_name() { return "@___noop"; } + template static const std::string imag_name() { return cuMangledName("___noop"); } template<> STATIC_ const std::string imag_name() { return cuMangledName("___imag"); } template<> STATIC_ const std::string imag_name() { return cuMangledName("___imag"); } - template static const std::string abs_name() { return "@___noop"; } - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } + template static const std::string abs_name() { return cuMangledName("___noop"); } + template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } + template<> STATIC_ const std::string 
abs_name() { return cuMangledName("___abs"); } + template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } + template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } - template static const std::string conj_name() { return "@___noop"; } + template static const std::string conj_name() { return cuMangledName("___noop"); } template<> STATIC_ const std::string conj_name() { return cuMangledName("___conj"); } template<> STATIC_ const std::string conj_name() { return cuMangledName("___conj"); } From 507ec929888bf137db79648778701db7b1ca5532 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 7 Jan 2016 18:17:06 -0500 Subject: [PATCH 147/288] dot in CUDA/OpenCL now uses mul followed by reduction --- src/backend/cuda/blas.cpp | 58 +++++++++++++++---------------- src/backend/opencl/blas.cpp | 68 ++++++++++++++++++------------------- 2 files changed, 63 insertions(+), 63 deletions(-) diff --git a/src/backend/cuda/blas.cpp b/src/backend/cuda/blas.cpp index 85f48da750..1e5dd5de39 100644 --- a/src/backend/cuda/blas.cpp +++ b/src/backend/cuda/blas.cpp @@ -18,6 +18,9 @@ #include #include #include +#include +#include +#include namespace cuda { @@ -197,40 +200,37 @@ Array matmul(const Array &lhs, const Array &rhs, } -template -Array dot_(const Array &lhs, const Array &rhs, - af_mat_prop optLhs, af_mat_prop optRhs) -{ - int N = lhs.dims()[0]; - - T out; - - CUBLAS_CHECK((dot_func()( - getHandle(), - N, - lhs.get(), lhs.strides()[0], - rhs.get(), rhs.strides()[0], - &out))); - - if(both_conjugate) - return createValueArray(af::dim4(1), conj(out)); - else - return createValueArray(af::dim4(1), out); -} +// Keeping this around for future reference +//template +//Array dot_(const Array &lhs, const Array &rhs, +// af_mat_prop optLhs, af_mat_prop optRhs) +//{ +// int N = lhs.dims()[0]; +// +// T out; +// +// CUBLAS_CHECK((dot_func()( +// getHandle(), +// N, +// lhs.get(), lhs.strides()[0], +// rhs.get(), rhs.strides()[0], +// 
&out))); +// +// if(both_conjugate) +// return createValueArray(af::dim4(1), conj(out)); +// else +// return createValueArray(af::dim4(1), out); +//} template Array dot(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { - if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) { - return dot_(lhs, rhs, optLhs, optRhs); - } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) { - return dot_(lhs, rhs, optLhs, optRhs); - } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) { - return dot_(rhs, lhs, optRhs, optLhs); - } else { - return dot_(lhs, rhs, optLhs, optRhs); - } + const Array lhs_ = (optLhs == AF_MAT_NONE ? lhs : conj(lhs)); + const Array rhs_ = (optRhs == AF_MAT_NONE ? rhs : conj(rhs)); + + const Array temp = arithOp(lhs_, rhs_, lhs_.dims()); + return reduce(temp, 0, false, 0); } template diff --git a/src/backend/opencl/blas.cpp b/src/backend/opencl/blas.cpp index f9f8af1253..15e2373783 100644 --- a/src/backend/opencl/blas.cpp +++ b/src/backend/opencl/blas.cpp @@ -19,6 +19,9 @@ #include #include #include +#include +#include +#include #include @@ -174,45 +177,42 @@ Array matmul(const Array &lhs, const Array &rhs, return out; } -template -Array dot_(const Array &lhs, const Array &rhs, - af_mat_prop optLhs, af_mat_prop optRhs) -{ - initBlas(); - - int N = lhs.dims()[0]; - dot_func dot; - cl::Event event; - Array out = createEmptyArray(af::dim4(1)); - cl::Buffer scratch(getContext(), CL_MEM_READ_WRITE, sizeof(T) * N); - CLBLAS_CHECK( - dot(N, - (*out.get())(), out.getOffset(), - (*lhs.get())(), lhs.getOffset(), lhs.strides()[0], - (*rhs.get())(), rhs.getOffset(), rhs.strides()[0], - scratch(), - 1, &getQueue()(), 0, nullptr, &event()) - ); - - if(both_conjugate) - transpose_inplace(out, true); - - return out; -} +// Keeping this around for future reference +//template +//Array dot_(const Array &lhs, const Array &rhs, +// af_mat_prop optLhs, af_mat_prop optRhs) +//{ +// initBlas(); +// +// int N = lhs.dims()[0]; +// dot_func dot; 
+// cl::Event event; +// Array out = createEmptyArray(af::dim4(1)); +// cl::Buffer scratch(getContext(), CL_MEM_READ_WRITE, sizeof(T) * N); +// CLBLAS_CHECK( +// dot(N, +// (*out.get())(), out.getOffset(), +// (*lhs.get())(), lhs.getOffset(), lhs.strides()[0], +// (*rhs.get())(), rhs.getOffset(), rhs.strides()[0], +// scratch(), +// 1, &getQueue()(), 0, nullptr, &event()) +// ); +// +// if(both_conjugate) +// transpose_inplace(out, true); +// +// return out; +//} template Array dot(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { - if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) { - return dot_(lhs, rhs, optLhs, optRhs); - } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) { - return dot_(lhs, rhs, optLhs, optRhs); - } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) { - return dot_(rhs, lhs, optRhs, optLhs); - } else { - return dot_(lhs, rhs, optLhs, optRhs); - } + const Array lhs_ = (optLhs == AF_MAT_NONE ? lhs : conj(lhs)); + const Array rhs_ = (optRhs == AF_MAT_NONE ? 
rhs : conj(rhs)); + + const Array temp = arithOp(lhs_, rhs_, lhs_.dims()); + return reduce(temp, 0, false, 0); } #define INSTANTIATE_BLAS(TYPE) \ From 5940d4bc93a5f644ffb51d699d6effa2564418c1 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Fri, 8 Jan 2016 08:43:51 -0500 Subject: [PATCH 148/288] Always use freeimage flags instead of hardcoded offsets --- src/api/c/imageio.cpp | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/api/c/imageio.cpp b/src/api/c/imageio.cpp index 746ee69142..c6a20a85a2 100644 --- a/src/api/c/imageio.cpp +++ b/src/api/c/imageio.cpp @@ -63,9 +63,9 @@ static af_err readImage(af_array *rImage, const uchar* pSrcLine, const int nSrcP } else { // Non 8-bit types do not use ordering // See Pixel Access Functions Chapter in FreeImage Doc - pDst0[indx] = (float) *(src + (x * step + 0)); - pDst1[indx] = (float) *(src + (x * step + 1)); - pDst2[indx] = (float) *(src + (x * step + 2)); + pDst0[indx] = (float) *(src + (x * step + FI_RGBA_RED)); + pDst1[indx] = (float) *(src + (x * step + FI_RGBA_GREEN)); + pDst2[indx] = (float) *(src + (x * step + FI_RGBA_BLUE)); } if (fo_color == 4) pDst3[indx] = (float) *(src + (x * step + FI_RGBA_ALPHA)); } @@ -104,9 +104,9 @@ static af_err readImage(af_array *rImage, const uchar* pSrcLine, const int nSrcP } else { // Non 8-bit types do not use ordering // See Pixel Access Functions Chapter in FreeImage Doc - r = (T) *(src + (x * step + 0)); - g = (T) *(src + (x * step + 1)); - b = (T) *(src + (x * step + 2)); + r = (T) *(src + (x * step + FI_RGBA_RED)); + g = (T) *(src + (x * step + FI_RGBA_GREEN)); + b = (T) *(src + (x * step + FI_RGBA_BLUE)); } pDst[indx] = r * 0.2989f + g * 0.5870f + b * 0.1140f; } @@ -333,10 +333,10 @@ af_err af_save_image(const char* filename, const af_array in_) // Copy the array into FreeImage buffer for (uint y = 0; y < fi_h; ++y) { for (uint x = 0; x < fi_w; ++x) { - *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r 
- *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g - *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b - *(pDstLine + x * step + 3) = (uchar) pSrc3[indx]; // a + *(pDstLine + x * step + FI_RGBA_RED ) = (uchar) pSrc0[indx]; // r + *(pDstLine + x * step + FI_RGBA_GREEN) = (uchar) pSrc1[indx]; // g + *(pDstLine + x * step + FI_RGBA_BLUE ) = (uchar) pSrc2[indx]; // b + *(pDstLine + x * step + FI_RGBA_ALPHA) = (uchar) pSrc3[indx]; // a ++indx; } pDstLine -= nDstPitch; @@ -362,9 +362,9 @@ af_err af_save_image(const char* filename, const af_array in_) // Copy the array into FreeImage buffer for (uint y = 0; y < fi_h; ++y) { for (uint x = 0; x < fi_w; ++x) { - *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r - *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g - *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b + *(pDstLine + x * step + FI_RGBA_RED ) = (uchar) pSrc0[indx]; // r + *(pDstLine + x * step + FI_RGBA_GREEN) = (uchar) pSrc1[indx]; // g + *(pDstLine + x * step + FI_RGBA_BLUE ) = (uchar) pSrc2[indx]; // b ++indx; } pDstLine -= nDstPitch; @@ -602,10 +602,10 @@ af_err af_save_image_memory(void **ptr, const af_array in_, const af_image_forma // Copy the array into FreeImage buffer for (uint y = 0; y < fi_h; ++y) { for (uint x = 0; x < fi_w; ++x) { - *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b - *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g - *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r - *(pDstLine + x * step + 3) = (uchar) pSrc3[indx]; // a + *(pDstLine + x * step + FI_RGBA_RED ) = (uchar) pSrc0[indx]; // r + *(pDstLine + x * step + FI_RGBA_GREEN) = (uchar) pSrc1[indx]; // g + *(pDstLine + x * step + FI_RGBA_BLUE ) = (uchar) pSrc2[indx]; // b + *(pDstLine + x * step + FI_RGBA_ALPHA) = (uchar) pSrc3[indx]; // a ++indx; } pDstLine -= nDstPitch; @@ -631,9 +631,9 @@ af_err af_save_image_memory(void **ptr, const af_array in_, const af_image_forma // Copy the array into FreeImage buffer for (uint y = 0; y < fi_h; ++y) 
{ for (uint x = 0; x < fi_w; ++x) { - *(pDstLine + x * step + 2) = (uchar) pSrc0[indx]; // b - *(pDstLine + x * step + 1) = (uchar) pSrc1[indx]; // g - *(pDstLine + x * step + 0) = (uchar) pSrc2[indx]; // r + *(pDstLine + x * step + FI_RGBA_RED ) = (uchar) pSrc0[indx]; // r + *(pDstLine + x * step + FI_RGBA_GREEN) = (uchar) pSrc1[indx]; // g + *(pDstLine + x * step + FI_RGBA_BLUE ) = (uchar) pSrc2[indx]; // b ++indx; } pDstLine -= nDstPitch; From 7eafd44ef20ac89b3f0744aad6e58d155bd48b13 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 10:21:28 -0500 Subject: [PATCH 149/288] Remove cpu dot fallback. Slower than opencl dot always --- src/backend/opencl/cpu/cpu_blas.cpp | 53 ----------------------------- src/backend/opencl/cpu/cpu_blas.hpp | 3 -- 2 files changed, 56 deletions(-) diff --git a/src/backend/opencl/cpu/cpu_blas.cpp b/src/backend/opencl/cpu/cpu_blas.cpp index 524777a6c6..ff7170d331 100644 --- a/src/backend/opencl/cpu/cpu_blas.cpp +++ b/src/backend/opencl/cpu/cpu_blas.cpp @@ -13,7 +13,6 @@ #include #include #include -#include namespace opencl { @@ -169,7 +168,6 @@ Array matmul(const Array &lhs, const Array &rhs, dim4 lStrides = lhs.strides(); dim4 rStrides = rhs.strides(); using BT = typename blas_base::type; - using CBT = const typename blas_base::type; // get host pointers from mapped memory BT *lPtr = getMappedPtr(lhs.get()); @@ -204,48 +202,6 @@ Array matmul(const Array &lhs, const Array &rhs, return out; } -//template T -//conj(T x) { return x; } -// -//template<> cfloat conj (cfloat c) { return std::conj(c); } -//template<> cdouble conj(cdouble c) { return std::conj(c); } -// -//template -//Array dot_(const Array &lhs, const Array &rhs, -// af_mat_prop optLhs, af_mat_prop optRhs) -//{ -// int N = lhs.dims()[0]; -// -// T out = 0; -// const T *pL = lhs.get(); -// const T *pR = rhs.get(); -// -// for(int i = 0; i < N; i++) -// out += (conjugate ? 
cpu::conj(pL[i]) : pL[i]) * pR[i]; -// -// if(both_conjugate) out = cpu::conj(out); -// -// return createValueArray(af::dim4(1), out); -//} -// -//template -//Array dot(const Array &lhs, const Array &rhs, -// af_mat_prop optLhs, af_mat_prop optRhs) -//{ -// if(optLhs == AF_MAT_CONJ && optRhs == AF_MAT_CONJ) { -// return dot_(lhs, rhs, optLhs, optRhs); -// } else if (optLhs == AF_MAT_CONJ && optRhs == AF_MAT_NONE) { -// return dot_(lhs, rhs, optLhs, optRhs); -// } else if (optLhs == AF_MAT_NONE && optRhs == AF_MAT_CONJ) { -// return dot_(rhs, lhs, optRhs, optLhs); -// } else { -// return dot_(lhs, rhs, optLhs, optRhs); -// } -//} - -#undef BT -#undef REINTEPRET_CAST - #define INSTANTIATE_BLAS(TYPE) \ template Array matmul(const Array &lhs, const Array &rhs, \ af_mat_prop optLhs, af_mat_prop optRhs); @@ -255,14 +211,5 @@ INSTANTIATE_BLAS(cfloat) INSTANTIATE_BLAS(double) INSTANTIATE_BLAS(cdouble) -//#define INSTANTIATE_DOT(TYPE) \ -// template Array dot(const Array &lhs, const Array &rhs, \ -// af_mat_prop optLhs, af_mat_prop optRhs); -// -//INSTANTIATE_DOT(float) -//INSTANTIATE_DOT(double) -//INSTANTIATE_DOT(cfloat) -//INSTANTIATE_DOT(cdouble) - } } diff --git a/src/backend/opencl/cpu/cpu_blas.hpp b/src/backend/opencl/cpu/cpu_blas.hpp index 303b60ced8..836d6e02de 100644 --- a/src/backend/opencl/cpu/cpu_blas.hpp +++ b/src/backend/opencl/cpu/cpu_blas.hpp @@ -16,8 +16,5 @@ namespace cpu template Array matmul(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs); -// template -// Array dot(const Array &lhs, const Array &rhs, -// af_mat_prop optLhs, af_mat_prop optRhs); } } From 45abbc35741f5e04a6c9655b30bd5dc3f7b47b46 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 11:02:11 -0500 Subject: [PATCH 150/288] Add OpenCL-CPU fallback for LU --- src/backend/opencl/cpu/cpu_lapack_helper.hpp | 35 ++++ src/backend/opencl/cpu/cpu_lu.cpp | 178 +++++++++++++++++++ src/backend/opencl/cpu/cpu_lu.hpp | 22 +++ src/backend/opencl/lu.cpp | 11 +- 
4 files changed, 245 insertions(+), 1 deletion(-) create mode 100644 src/backend/opencl/cpu/cpu_lapack_helper.hpp create mode 100644 src/backend/opencl/cpu/cpu_lu.cpp create mode 100644 src/backend/opencl/cpu/cpu_lu.hpp diff --git a/src/backend/opencl/cpu/cpu_lapack_helper.hpp b/src/backend/opencl/cpu/cpu_lapack_helper.hpp new file mode 100644 index 0000000000..174022e772 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_lapack_helper.hpp @@ -0,0 +1,35 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#ifndef AFCPU_LAPACK +#define AFCPU_LAPACK + +#include + +#define lapack_complex_float opencl::cfloat +#define lapack_complex_double opencl::cdouble +#define LAPACK_PREFIX LAPACKE_ +#define ORDER_TYPE int +#define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR +#define LAPACK_NAME(fn) LAPACKE_##fn + +#ifdef __APPLE__ +#include +#include +#undef AF_LAPACK_COL_MAJOR +#define AF_LAPACK_COL_MAJOR 0 +#else +#ifdef USE_MKL +#include +#else // NETLIB LAPACKE +#include +#endif +#endif + +#endif diff --git a/src/backend/opencl/cpu/cpu_lu.cpp b/src/backend/opencl/cpu/cpu_lu.cpp new file mode 100644 index 0000000000..f415cb3983 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_lu.cpp @@ -0,0 +1,178 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace opencl +{ +namespace cpu +{ + +template +using getrf_func_def = int (*)(ORDER_TYPE, int, int, + T*, int, + int*); + +#define LU_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + + +#define LU_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ +{ return & LAPACK_NAME(PREFIX##FUNC); } + +LU_FUNC_DEF( getrf ) +LU_FUNC(getrf , float , s) +LU_FUNC(getrf , double , d) +LU_FUNC(getrf , cfloat , c) +LU_FUNC(getrf , cdouble, z) + +template +void lu_split(Array &lower, Array &upper, const Array &in) +{ + T *l = getMappedPtr(lower.get()); + T *u = getMappedPtr(upper.get()); + T *i = getMappedPtr(in.get()); + + dim4 ldm = lower.dims(); + dim4 udm = upper.dims(); + dim4 idm = in.dims(); + + dim4 lst = lower.strides(); + dim4 ust = upper.strides(); + dim4 ist = in.strides(); + + for(dim_t ow = 0; ow < idm[3]; ow++) { + const dim_t lW = ow * lst[3]; + const dim_t uW = ow * ust[3]; + const dim_t iW = ow * ist[3]; + + for(dim_t oz = 0; oz < idm[2]; oz++) { + const dim_t lZW = lW + oz * lst[2]; + const dim_t uZW = uW + oz * ust[2]; + const dim_t iZW = iW + oz * ist[2]; + + for(dim_t oy = 0; oy < idm[1]; oy++) { + const dim_t lYZW = lZW + oy * lst[1]; + const dim_t uYZW = uZW + oy * ust[1]; + const dim_t iYZW = iZW + oy * ist[1]; + + for(dim_t ox = 0; ox < idm[0]; ox++) { + const dim_t lMem = lYZW + ox; + const dim_t uMem = uYZW + ox; + const dim_t iMem = iYZW + ox; + if(ox > oy) { + if(oy < ldm[1]) + l[lMem] = i[iMem]; + if(ox < udm[0]) + u[uMem] = scalar(0); + } else if (oy > ox) { + if(oy < ldm[1]) + l[lMem] = scalar(0); + if(ox < udm[0]) + u[uMem] = i[iMem]; + } else if(ox == oy) { + if(oy < ldm[1]) + l[lMem] = scalar(1.0); + if(ox < udm[0]) + u[uMem] = i[iMem]; + } + } + } + } + } + + 
unmapPtr(lower.get(), l); + unmapPtr(upper.get(), u); + unmapPtr(in.get(), i); +} + +void convertPivot(Array &pivot, int out_sz) +{ + Array p = range(dim4(out_sz), 0); // Runs opencl + + int *d_pi = getMappedPtr(pivot.get()); + int *d_po = getMappedPtr(p.get()); + + dim_t d0 = pivot.dims()[0]; + + for(int j = 0; j < (int)d0; j++) { + // 1 indexed in pivot + std::swap(d_po[j], d_po[d_pi[j] - 1]); + } + + unmapPtr(pivot.get(), d_pi); + unmapPtr(p.get(), d_po); + + pivot = p; +} + +template +void lu(Array &lower, Array &upper, Array &pivot, const Array &in) +{ + dim4 iDims = in.dims(); + int M = iDims[0]; + int N = iDims[1]; + + Array in_copy = copyArray(in); + pivot = lu_inplace(in_copy); + + // SPLIT into lower and upper + dim4 ldims(M, min(M, N)); + dim4 udims(min(M, N), N); + lower = createEmptyArray(ldims); + upper = createEmptyArray(udims); + + lu_split(lower, upper, in_copy); +} + +template +Array lu_inplace(Array &in, const bool convert_pivot) +{ + dim4 iDims = in.dims(); + int M = iDims[0]; + int N = iDims[1]; + + Array pivot = createEmptyArray(af::dim4(min(M, N), 1, 1, 1)); + + T *inPtr = getMappedPtr(in.get()); + int *pivotPtr = getMappedPtr(pivot.get()); + + getrf_func()(AF_LAPACK_COL_MAJOR, M, N, + inPtr, in.strides()[1], + pivotPtr); + + unmapPtr(in.get(), inPtr); + unmapPtr(pivot.get(), pivotPtr); + + if(convert_pivot) convertPivot(pivot, M); + + return pivot; +} + +#define INSTANTIATE_LU(T) \ + template Array lu_inplace(Array &in, const bool convert_pivot); \ + template void lu(Array &lower, Array &upper, Array &pivot, const Array &in); + +INSTANTIATE_LU(float) +INSTANTIATE_LU(cfloat) +INSTANTIATE_LU(double) +INSTANTIATE_LU(cdouble) + +} +} diff --git a/src/backend/opencl/cpu/cpu_lu.hpp b/src/backend/opencl/cpu/cpu_lu.hpp new file mode 100644 index 0000000000..6c038f20c7 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_lu.hpp @@ -0,0 +1,22 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights 
reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + void lu(Array &lower, Array &upper, Array &pivot, const Array &in); + + template + Array lu_inplace(Array &in, const bool convert_pivot = true); +} +} diff --git a/src/backend/opencl/lu.cpp b/src/backend/opencl/lu.cpp index 2d94d4d326..0bc6bd5283 100644 --- a/src/backend/opencl/lu.cpp +++ b/src/backend/opencl/lu.cpp @@ -14,7 +14,9 @@ #include #include #include +#include #include +#include namespace opencl { @@ -41,8 +43,11 @@ Array convertPivot(int *ipiv, int in_sz, int out_sz) template void lu(Array &lower, Array &upper, Array &pivot, const Array &in) { - try { + if(OpenCLCPUOffload()) { + return cpu::lu(lower, upper, pivot, in); + } + dim4 iDims = in.dims(); int M = iDims[0]; int N = iDims[1]; @@ -67,6 +72,10 @@ template Array lu_inplace(Array &in, const bool convert_pivot) { try { + if(OpenCLCPUOffload()) { + return cpu::lu_inplace(in, convert_pivot); + } + initBlas(); dim4 iDims = in.dims(); int M = iDims[0]; From 88e910d9a9e91dc38b888fabd70a3a307e79307c Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 11:21:01 -0500 Subject: [PATCH 151/288] Add OpenCL-CPU fallback for Cholesky --- src/backend/opencl/cholesky.cpp | 13 +++- src/backend/opencl/cpu/cpu_cholesky.cpp | 88 +++++++++++++++++++++++++ src/backend/opencl/cpu/cpu_cholesky.hpp | 22 +++++++ src/backend/opencl/cpu/cpu_triangle.hpp | 52 +++++++++++++++ 4 files changed, 173 insertions(+), 2 deletions(-) create mode 100644 src/backend/opencl/cpu/cpu_cholesky.cpp create mode 100644 src/backend/opencl/cpu/cpu_cholesky.hpp create mode 100644 src/backend/opencl/cpu/cpu_triangle.hpp diff --git a/src/backend/opencl/cholesky.cpp b/src/backend/opencl/cholesky.cpp index 78fe999645..a2034a331a 100644 --- 
a/src/backend/opencl/cholesky.cpp +++ b/src/backend/opencl/cholesky.cpp @@ -8,14 +8,16 @@ ********************************************************/ #include -#include #include -#include #include +#include +#include #if defined(WITH_OPENCL_LINEAR_ALGEBRA) #include #include +#include +#include namespace opencl { @@ -24,6 +26,10 @@ template int cholesky_inplace(Array &in, const bool is_upper) { try { + if(OpenCLCPUOffload()) { + return cpu::cholesky_inplace(in, is_upper); + } + initBlas(); dim4 iDims = in.dims(); @@ -46,6 +52,9 @@ template Array cholesky(int *info, const Array &in, const bool is_upper) { try { + if(OpenCLCPUOffload()) { + return cpu::cholesky(info, in, is_upper); + } Array out = copyArray(in); *info = cholesky_inplace(out, is_upper); diff --git a/src/backend/opencl/cpu/cpu_cholesky.cpp b/src/backend/opencl/cpu/cpu_cholesky.cpp new file mode 100644 index 0000000000..234df2b242 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_cholesky.cpp @@ -0,0 +1,88 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace opencl +{ +namespace cpu +{ + +template +using potrf_func_def = int (*)(ORDER_TYPE, char, + int, + T*, int); + +#define CH_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + + +#define CH_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ +{ return & LAPACK_NAME(PREFIX##FUNC); } + +CH_FUNC_DEF( potrf ) +CH_FUNC(potrf , float , s) +CH_FUNC(potrf , double , d) +CH_FUNC(potrf , cfloat , c) +CH_FUNC(potrf , cdouble, z) + +template +Array cholesky(int *info, const Array &in, const bool is_upper) +{ + Array out = copyArray(in); + *info = cholesky_inplace(out, is_upper); + + T* oPtr = getMappedPtr(out.get()); + if (is_upper) triangle(oPtr, oPtr, out.dims(), out.strides(), out.strides()); + else triangle(oPtr, oPtr, out.dims(), out.strides(), out.strides()); + unmapPtr(out.get(), oPtr); + + return out; +} + +template +int cholesky_inplace(Array &in, const bool is_upper) +{ + dim4 iDims = in.dims(); + int N = iDims[0]; + + char uplo = 'L'; + if(is_upper) + uplo = 'U'; + + T* inPtr = getMappedPtr(in.get()); + int info = potrf_func()(AF_LAPACK_COL_MAJOR, uplo, + N, inPtr, in.strides()[1]); + unmapPtr(in.get(), inPtr); + + return info; +} + +#define INSTANTIATE_CH(T) \ + template int cholesky_inplace(Array &in, const bool is_upper); \ + template Array cholesky (int *info, const Array &in, const bool is_upper); \ + + +INSTANTIATE_CH(float) +INSTANTIATE_CH(cfloat) +INSTANTIATE_CH(double) +INSTANTIATE_CH(cdouble) + +} +} diff --git a/src/backend/opencl/cpu/cpu_cholesky.hpp b/src/backend/opencl/cpu/cpu_cholesky.hpp new file mode 100644 index 0000000000..041e93980e --- /dev/null +++ b/src/backend/opencl/cpu/cpu_cholesky.hpp @@ -0,0 +1,22 @@ 
+/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + Array cholesky(int *info, const Array &in, const bool is_upper); + + template + int cholesky_inplace(Array &in, const bool is_upper); +} +} diff --git a/src/backend/opencl/cpu/cpu_triangle.hpp b/src/backend/opencl/cpu/cpu_triangle.hpp new file mode 100644 index 0000000000..5e40f929b9 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_triangle.hpp @@ -0,0 +1,52 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#ifndef CPU_LAPACK_TRIANGLE +#define CPU_LAPACK_TRIANGLE +namespace opencl +{ +namespace cpu +{ + +template +void triangle(T *o, const T *i, const dim4 odm, const dim4 ost, const dim4 ist) +{ + for(dim_t ow = 0; ow < odm[3]; ow++) { + const dim_t oW = ow * ost[3]; + const dim_t iW = ow * ist[3]; + + for(dim_t oz = 0; oz < odm[2]; oz++) { + const dim_t oZW = oW + oz * ost[2]; + const dim_t iZW = iW + oz * ist[2]; + + for(dim_t oy = 0; oy < odm[1]; oy++) { + const dim_t oYZW = oZW + oy * ost[1]; + const dim_t iYZW = iZW + oy * ist[1]; + + for(dim_t ox = 0; ox < odm[0]; ox++) { + const dim_t oMem = oYZW + ox; + const dim_t iMem = iYZW + ox; + + bool cond = is_upper ? (oy >= ox) : (oy <= ox); + bool do_unit_diag = (is_unit_diag && ox == oy); + if(cond) { + o[oMem] = do_unit_diag ? 
scalar(1) : i[iMem]; + } else { + o[oMem] = scalar(0); + } + } + } + } + } +} + +} +} + +#endif From 872acfb2c15b7e44014c42f38df3d5843abee860 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 12:27:47 -0500 Subject: [PATCH 152/288] Add OpenCL-CPU fallback for QR --- src/backend/opencl/cpu/cpu_qr.cpp | 130 ++++++++++++++++++++++++++++++ src/backend/opencl/cpu/cpu_qr.hpp | 22 +++++ src/backend/opencl/qr.cpp | 19 ++++- 3 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 src/backend/opencl/cpu/cpu_qr.cpp create mode 100644 src/backend/opencl/cpu/cpu_qr.hpp diff --git a/src/backend/opencl/cpu/cpu_qr.cpp b/src/backend/opencl/cpu/cpu_qr.cpp new file mode 100644 index 0000000000..080ebb6b69 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_qr.cpp @@ -0,0 +1,130 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace opencl +{ +namespace cpu +{ + +template +using geqrf_func_def = int (*)(ORDER_TYPE, int, int, + T*, int, + T*); + +template +using gqr_func_def = int (*)(ORDER_TYPE, int, int, int, + T*, int, + const T*); + +#define QR_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + + +#define QR_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ +{ return & LAPACK_NAME(PREFIX##FUNC); } + +QR_FUNC_DEF( geqrf ) +QR_FUNC(geqrf , float , s) +QR_FUNC(geqrf , double , d) +QR_FUNC(geqrf , cfloat , c) +QR_FUNC(geqrf , cdouble, z) + +#define GQR_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + +#define GQR_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ +{ return & LAPACK_NAME(PREFIX); } + 
+GQR_FUNC_DEF( gqr ) +GQR_FUNC(gqr , float , sorgqr) +GQR_FUNC(gqr , double , dorgqr) +GQR_FUNC(gqr , cfloat , cungqr) +GQR_FUNC(gqr , cdouble, zungqr) + +template +void qr(Array &q, Array &r, Array &t, const Array &in) +{ + dim4 iDims = in.dims(); + int M = iDims[0]; + int N = iDims[1]; + + dim4 padDims(M, max(M, N)); + q = padArray(in, padDims, scalar(0)); + q.resetDims(iDims); + t = qr_inplace(q); + + // SPLIT into q and r + dim4 rdims(M, N); + r = createEmptyArray(rdims); + + T *qPtr = getMappedPtr(q.get()); + T *rPtr = getMappedPtr(r.get()); + T *tPtr = getMappedPtr(t.get()); + + triangle(rPtr, qPtr, rdims, r.strides(), q.strides()); + + gqr_func()(AF_LAPACK_COL_MAJOR, + M, M, min(M, N), + qPtr, q.strides()[1], + tPtr); + + unmapPtr(q.get(), qPtr); + unmapPtr(r.get(), rPtr); + unmapPtr(t.get(), tPtr); + + q.resetDims(dim4(M, M)); +} + +template +Array qr_inplace(Array &in) +{ + dim4 iDims = in.dims(); + int M = iDims[0]; + int N = iDims[1]; + + Array t = createEmptyArray(af::dim4(min(M, N), 1, 1, 1)); + + T *iPtr = getMappedPtr(in.get()); + T *tPtr = getMappedPtr(t.get()); + + geqrf_func()(AF_LAPACK_COL_MAJOR, M, N, + iPtr, in.strides()[1], + tPtr); + + unmapPtr(in.get(), iPtr); + unmapPtr(t.get(), tPtr); + + return t; +} + +#define INSTANTIATE_QR(T) \ + template Array qr_inplace(Array &in); \ + template void qr(Array &q, Array &r, Array &t, const Array &in); + +INSTANTIATE_QR(float) +INSTANTIATE_QR(cfloat) +INSTANTIATE_QR(double) +INSTANTIATE_QR(cdouble) + +} +} diff --git a/src/backend/opencl/cpu/cpu_qr.hpp b/src/backend/opencl/cpu/cpu_qr.hpp new file mode 100644 index 0000000000..c499b9d03b --- /dev/null +++ b/src/backend/opencl/cpu/cpu_qr.hpp @@ -0,0 +1,22 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + void qr(Array &q, Array &r, Array &t, const Array &in); + + template + Array qr_inplace(Array &in); +} +} diff --git a/src/backend/opencl/qr.cpp b/src/backend/opencl/qr.cpp index 9e30b43435..56101a8b97 100644 --- a/src/backend/opencl/qr.cpp +++ b/src/backend/opencl/qr.cpp @@ -9,16 +9,19 @@ #include #include +#include #include #include -#include -#include + +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) + #include #include #include #include - -#if defined(WITH_OPENCL_LINEAR_ALGEBRA) +#include +#include +#include namespace opencl { @@ -27,6 +30,10 @@ template void qr(Array &q, Array &r, Array &t, const Array &orig) { try { + if(OpenCLCPUOffload()) { + return cpu::qr(q, r, t, orig); + } + initBlas(); dim4 iDims = orig.dims(); int M = iDims[0]; @@ -81,6 +88,10 @@ template Array qr_inplace(Array &in) { try { + if(OpenCLCPUOffload()) { + return cpu::qr_inplace(in); + } + initBlas(); dim4 iDims = in.dims(); int M = iDims[0]; From 59a9df0957537e4933c145482034704d68915c1b Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 12:48:30 -0500 Subject: [PATCH 153/288] Add OpenCL-CPU fallback for SVD --- src/backend/opencl/cpu/cpu_svd.cpp | 103 +++++++++++++++++++++++++++++ src/backend/opencl/cpu/cpu_svd.hpp | 22 ++++++ src/backend/opencl/svd.cpp | 10 +++ 3 files changed, 135 insertions(+) create mode 100644 src/backend/opencl/cpu/cpu_svd.cpp create mode 100644 src/backend/opencl/cpu/cpu_svd.hpp diff --git a/src/backend/opencl/cpu/cpu_svd.cpp b/src/backend/opencl/cpu/cpu_svd.cpp new file mode 100644 index 0000000000..85b9ee8280 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_svd.cpp @@ -0,0 +1,103 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. 
+ * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include + +#include + +namespace opencl +{ +namespace cpu +{ + +#define SVD_FUNC_DEF( FUNC ) \ + template svd_func_def svd_func(); + +#define SVD_FUNC( FUNC, T, Tr, PREFIX ) \ + template<> svd_func_def svd_func() \ + { return & LAPACK_NAME(PREFIX##FUNC); } + +#if defined(USE_MKL) || defined(__APPLE__) + + template + using svd_func_def = int (*)(ORDER_TYPE, + char jobz, + int m, int n, + T* in, int ldin, + Tr* s, + T* u, int ldu, + T* vt, int ldvt); + + SVD_FUNC_DEF( gesdd ) + SVD_FUNC(gesdd, float , float , s) + SVD_FUNC(gesdd, double , double, d) + SVD_FUNC(gesdd, cfloat , float , c) + SVD_FUNC(gesdd, cdouble, double, z) + +#else // Atlas causes memory freeing issues with using gesdd + + template + using svd_func_def = int (*)(ORDER_TYPE, + char jobu, char jobvt, + int m, int n, + T* in, int ldin, + Tr* s, + T* u, int ldu, + T* vt, int ldvt, + Tr *superb); + + SVD_FUNC_DEF( gesvd ) + SVD_FUNC(gesvd, float , float , s) + SVD_FUNC(gesvd, double , double, d) + SVD_FUNC(gesvd, cfloat , float , c) + SVD_FUNC(gesvd, cdouble, double, z) + +#endif + + template + void svdInPlace(Array &s, Array &u, Array &vt, Array &in) + { + dim4 iDims = in.dims(); + int M = iDims[0]; + int N = iDims[1]; + + Tr *sPtr = getMappedPtr(s.get()); + T *uPtr = getMappedPtr(u.get()); + T *vPtr = getMappedPtr(vt.get()); + T *iPtr = getMappedPtr(in.get()); + +#if defined(USE_MKL) || defined(__APPLE__) + svd_func()(AF_LAPACK_COL_MAJOR, 'A', M, N, iPtr, in.strides()[1], + sPtr, uPtr, u.strides()[1], vPtr, vt.strides()[1]); +#else + std::vector superb(std::min(M, N)); + svd_func()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, iPtr, in.strides()[1], + sPtr, uPtr, u.strides()[1], vPtr, vt.strides()[1], &superb[0]); +#endif + } + + template + void svd(Array &s, Array &u, 
Array &vt, const Array &in) + { + Array in_copy = copyArray(in); + svdInPlace(s, u, vt, in_copy); + } + +#define INSTANTIATE_SVD(T, Tr) \ + template void svd(Array & s, Array & u, Array & vt, const Array &in); \ + template void svdInPlace(Array & s, Array & u, Array & vt, Array &in); + + INSTANTIATE_SVD(float , float ) + INSTANTIATE_SVD(double , double) + INSTANTIATE_SVD(cfloat , float ) + INSTANTIATE_SVD(cdouble, double) +} +} diff --git a/src/backend/opencl/cpu/cpu_svd.hpp b/src/backend/opencl/cpu/cpu_svd.hpp new file mode 100644 index 0000000000..4f271af8b9 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_svd.hpp @@ -0,0 +1,22 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + void svd(Array &s, Array &u, Array &vt, const Array &in); + + template + void svdInPlace(Array &s, Array &u, Array &vt, Array &in); +} +} diff --git a/src/backend/opencl/svd.cpp b/src/backend/opencl/svd.cpp index 77f7c8aa37..61da27bdcd 100644 --- a/src/backend/opencl/svd.cpp +++ b/src/backend/opencl/svd.cpp @@ -20,6 +20,8 @@ #include #include #include +#include +#include namespace opencl { @@ -196,6 +198,10 @@ void svd(Array &arrU, template void svdInPlace(Array &s, Array &u, Array &vt, Array &in) { + if(OpenCLCPUOffload()) { + return cpu::svdInPlace(s, u, vt, in); + } + initBlas(); svd(u, s, vt, in, true); } @@ -203,6 +209,10 @@ void svdInPlace(Array &s, Array &u, Array &vt, Array &in) template void svd(Array &s, Array &u, Array &vt, const Array &in) { + if(OpenCLCPUOffload()) { + return cpu::svd(s, u, vt, in); + } + dim4 iDims = in.dims(); int M = iDims[0]; int N = iDims[1]; From ffb191cbce56297e391f2f12ec45351dd8ebf1d8 Mon Sep 17 
00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 13:08:01 -0500 Subject: [PATCH 154/288] Add OpenCL-CPU fallback for solve --- src/backend/opencl/cpu/cpu_solve.cpp | 187 +++++++++++++++++++++++++++ src/backend/opencl/cpu/cpu_solve.hpp | 23 ++++ src/backend/opencl/solve.cpp | 11 ++ 3 files changed, 221 insertions(+) create mode 100644 src/backend/opencl/cpu/cpu_solve.cpp create mode 100644 src/backend/opencl/cpu/cpu_solve.hpp diff --git a/src/backend/opencl/cpu/cpu_solve.cpp b/src/backend/opencl/cpu/cpu_solve.cpp new file mode 100644 index 0000000000..824bce2173 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_solve.cpp @@ -0,0 +1,187 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include + +#include + +namespace opencl +{ +namespace cpu +{ + +template +using gesv_func_def = int (*)(ORDER_TYPE, int, int, + T *, int, + int *, + T *, int); + +template +using gels_func_def = int (*)(ORDER_TYPE, char, + int, int, int, + T *, int, + T *, int); + +template +using getrs_func_def = int (*)(ORDER_TYPE, char, + int, int, + const T *, int, + const int *, + T *, int); + +template +using trtrs_func_def = int (*)(ORDER_TYPE, + char, char, char, + int, int, + const T *, int, + T *, int); + + +#define SOLVE_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + + +#define SOLVE_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ +{ return & LAPACK_NAME(PREFIX##FUNC); } + +SOLVE_FUNC_DEF( gesv ) +SOLVE_FUNC(gesv , float , s) +SOLVE_FUNC(gesv , double , d) +SOLVE_FUNC(gesv , cfloat , c) +SOLVE_FUNC(gesv , cdouble, z) + +SOLVE_FUNC_DEF( gels ) +SOLVE_FUNC(gels , float , s) +SOLVE_FUNC(gels , double , d) 
+SOLVE_FUNC(gels , cfloat , c) +SOLVE_FUNC(gels , cdouble, z) + +SOLVE_FUNC_DEF( getrs ) +SOLVE_FUNC(getrs , float , s) +SOLVE_FUNC(getrs , double , d) +SOLVE_FUNC(getrs , cfloat , c) +SOLVE_FUNC(getrs , cdouble, z) + +SOLVE_FUNC_DEF( trtrs ) +SOLVE_FUNC(trtrs , float , s) +SOLVE_FUNC(trtrs , double , d) +SOLVE_FUNC(trtrs , cfloat , c) +SOLVE_FUNC(trtrs , cdouble, z) + +template +Array solveLU(const Array &A, const Array &pivot, + const Array &b, const af_mat_prop options) +{ + int N = A.dims()[0]; + int NRHS = b.dims()[1]; + + Array B = copyArray(b); + + T *aPtr = getMappedPtr(A.get()); + T *bPtr = getMappedPtr(B.get()); + int *pPtr = getMappedPtr(pivot.get()); + + getrs_func()(AF_LAPACK_COL_MAJOR, 'N', + N, NRHS, + aPtr, A.strides()[1], + pPtr, + bPtr, B.strides()[1]); + + unmapPtr(A.get(), aPtr); + unmapPtr(B.get(), bPtr); + unmapPtr(pivot.get(), pPtr); + + return B; +} + +template +Array triangleSolve(const Array &A, const Array &b, const af_mat_prop options) +{ + Array B = copyArray(b); + int N = B.dims()[0]; + int NRHS = B.dims()[1]; + + T *aPtr = getMappedPtr(A.get()); + T *bPtr = getMappedPtr(B.get()); + + trtrs_func()(AF_LAPACK_COL_MAJOR, + options & AF_MAT_UPPER ? 'U' : 'L', + 'N', // transpose flag + options & AF_MAT_DIAG_UNIT ? 
'U' : 'N', + N, NRHS, + aPtr, A.strides()[1], + bPtr, B.strides()[1]); + + unmapPtr(A.get(), aPtr); + unmapPtr(B.get(), bPtr); + + return B; +} + + +template +Array solve(const Array &a, const Array &b, const af_mat_prop options) +{ + + if (options & AF_MAT_UPPER || + options & AF_MAT_LOWER) { + return triangleSolve(a, b, options); + } + + int M = a.dims()[0]; + int N = a.dims()[1]; + int K = b.dims()[1]; + + Array A = copyArray(a); + Array B = padArray(b, dim4(max(M, N), K), scalar(0)); + + T *aPtr = getMappedPtr(A.get()); + T *bPtr = getMappedPtr(B.get()); + + if(M == N) { + std::vector pivot(N); + gesv_func()(AF_LAPACK_COL_MAJOR, N, K, + aPtr, A.strides()[1], + &pivot.front(), + bPtr, B.strides()[1]); + } else { + int sM = a.strides()[1]; + int sN = a.strides()[2] / sM; + + gels_func()(AF_LAPACK_COL_MAJOR, 'N', + M, N, K, + aPtr, A.strides()[1], + bPtr, max(sM, sN)); + B.resetDims(dim4(N, K)); + } + + unmapPtr(A.get(), aPtr); + unmapPtr(B.get(), bPtr); + + return B; +} + +#define INSTANTIATE_SOLVE(T) \ + template Array solve(const Array &a, const Array &b, \ + const af_mat_prop options); \ + template Array solveLU(const Array &A, const Array &pivot, \ + const Array &b, const af_mat_prop options); \ + +INSTANTIATE_SOLVE(float) +INSTANTIATE_SOLVE(cfloat) +INSTANTIATE_SOLVE(double) +INSTANTIATE_SOLVE(cdouble) + +} +} diff --git a/src/backend/opencl/cpu/cpu_solve.hpp b/src/backend/opencl/cpu/cpu_solve.hpp new file mode 100644 index 0000000000..6c3de642ad --- /dev/null +++ b/src/backend/opencl/cpu/cpu_solve.hpp @@ -0,0 +1,23 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + Array solve(const Array &a, const Array &b, const af_mat_prop options = AF_MAT_NONE); + + template + Array solveLU(const Array &a, const Array &pivot, + const Array &b, const af_mat_prop options = AF_MAT_NONE); +} +} diff --git a/src/backend/opencl/solve.cpp b/src/backend/opencl/solve.cpp index 6d2bea4b4e..4fede07e56 100644 --- a/src/backend/opencl/solve.cpp +++ b/src/backend/opencl/solve.cpp @@ -25,6 +25,9 @@ #include #include +#include +#include + namespace opencl { @@ -32,6 +35,10 @@ template Array solveLU(const Array &A, const Array &pivot, const Array &b, const af_mat_prop options) { + if(OpenCLCPUOffload()) { + return cpu::solveLU(A, pivot, b, options); + } + int N = A.dims()[0]; int NRHS = b.dims()[1]; @@ -296,6 +303,10 @@ template Array solve(const Array &a, const Array &b, const af_mat_prop options) { try { + if(OpenCLCPUOffload()) { + return cpu::solve(a, b, options); + } + initBlas(); if (options & AF_MAT_UPPER || From 4e2d46cec61b0226f3e1cbe558b58aa21c4f4cc5 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 15:26:08 -0500 Subject: [PATCH 155/288] Add OpenCL-CPU fallback for inverse --- src/backend/opencl/cpu/cpu_inverse.cpp | 77 ++++++++++++++++++++++++++ src/backend/opencl/cpu/cpu_inverse.hpp | 19 +++++++ src/backend/opencl/inverse.cpp | 6 ++ 3 files changed, 102 insertions(+) create mode 100644 src/backend/opencl/cpu/cpu_inverse.cpp create mode 100644 src/backend/opencl/cpu/cpu_inverse.hpp diff --git a/src/backend/opencl/cpu/cpu_inverse.cpp b/src/backend/opencl/cpu/cpu_inverse.cpp new file mode 100644 index 0000000000..f1418b23bd --- /dev/null +++ b/src/backend/opencl/cpu/cpu_inverse.cpp @@ -0,0 +1,77 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All 
rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include + +namespace opencl +{ +namespace cpu +{ + +template +using getri_func_def = int (*)(ORDER_TYPE, int, + T *, int, + const int *); + +#define INV_FUNC_DEF( FUNC ) \ +template FUNC##_func_def FUNC##_func(); + +#define INV_FUNC( FUNC, TYPE, PREFIX ) \ +template<> FUNC##_func_def FUNC##_func() \ +{ return & LAPACK_NAME(PREFIX##FUNC); } + +INV_FUNC_DEF( getri ) +INV_FUNC(getri , float , s) +INV_FUNC(getri , double , d) +INV_FUNC(getri , cfloat , c) +INV_FUNC(getri , cdouble, z) + +template +Array inverse(const Array &in) +{ + int M = in.dims()[0]; + //int N = in.dims()[1]; + + // This condition is already handled in opencl/inverse.cpp + //if (M != N) { + //Array I = identity(in.dims()); + //return solve(in, I); + //} + + Array A = copyArray(in); + + Array pivot = cpu::lu_inplace(A, false); + + T *aPtr = getMappedPtr(A.get()); + int *pPtr = getMappedPtr(pivot.get()); + + getri_func()(AF_LAPACK_COL_MAJOR, M, + aPtr, A.strides()[1], + pPtr); + + unmapPtr(A.get(), aPtr); + unmapPtr(pivot.get(), pPtr); + + return A; +} + +#define INSTANTIATE(T) \ + template Array inverse (const Array &in); + +INSTANTIATE(float) +INSTANTIATE(cfloat) +INSTANTIATE(double) +INSTANTIATE(cdouble) + +} +} diff --git a/src/backend/opencl/cpu/cpu_inverse.hpp b/src/backend/opencl/cpu/cpu_inverse.hpp new file mode 100644 index 0000000000..38581a1906 --- /dev/null +++ b/src/backend/opencl/cpu/cpu_inverse.hpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace opencl +{ +namespace cpu +{ + template + Array inverse(const Array &in); +} +} diff --git a/src/backend/opencl/inverse.cpp b/src/backend/opencl/inverse.cpp index eb8348edd4..df955547ba 100644 --- a/src/backend/opencl/inverse.cpp +++ b/src/backend/opencl/inverse.cpp @@ -12,6 +12,8 @@ #include #if defined(WITH_OPENCL_LINEAR_ALGEBRA) +#include +#include namespace opencl { @@ -19,6 +21,10 @@ namespace opencl template Array inverse(const Array &in) { + if(OpenCLCPUOffload()) { + if (in.dims()[0] == in.dims()[1]) + return cpu::inverse(in); + } Array I = identity(in.dims()); return solve(in, I); } From 210a64cbb6c824d483f1ed201e78d151423c7a46 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 16:56:12 -0500 Subject: [PATCH 156/288] Clean up header files in opencl/cpu/ --- src/backend/opencl/cpu/cpu_blas.cpp | 7 ++-- src/backend/opencl/cpu/cpu_blas.hpp | 2 +- src/backend/opencl/cpu/cpu_cholesky.cpp | 10 ++---- src/backend/opencl/cpu/cpu_helper.hpp | 37 ++++++++++++++------ src/backend/opencl/cpu/cpu_inverse.cpp | 5 ++- src/backend/opencl/cpu/cpu_lapack_helper.hpp | 35 ------------------ src/backend/opencl/cpu/cpu_lu.cpp | 11 ++---- src/backend/opencl/cpu/cpu_qr.cpp | 11 ++---- src/backend/opencl/cpu/cpu_solve.cpp | 5 +-- src/backend/opencl/cpu/cpu_svd.cpp | 8 +++-- src/backend/opencl/cpu/cpu_triangle.hpp | 3 ++ 11 files changed, 48 insertions(+), 86 deletions(-) delete mode 100644 src/backend/opencl/cpu/cpu_lapack_helper.hpp diff --git a/src/backend/opencl/cpu/cpu_blas.cpp b/src/backend/opencl/cpu/cpu_blas.cpp index ff7170d331..8c77fff8fd 100644 --- a/src/backend/opencl/cpu/cpu_blas.cpp +++ b/src/backend/opencl/cpu/cpu_blas.cpp @@ -7,12 +7,9 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#include #include 
-#include -#include -#include -#include -#include +#include namespace opencl { diff --git a/src/backend/opencl/cpu/cpu_blas.hpp b/src/backend/opencl/cpu/cpu_blas.hpp index 836d6e02de..908742471d 100644 --- a/src/backend/opencl/cpu/cpu_blas.hpp +++ b/src/backend/opencl/cpu/cpu_blas.hpp @@ -7,7 +7,7 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace opencl { diff --git a/src/backend/opencl/cpu/cpu_cholesky.cpp b/src/backend/opencl/cpu/cpu_cholesky.cpp index 234df2b242..74bbf594ae 100644 --- a/src/backend/opencl/cpu/cpu_cholesky.cpp +++ b/src/backend/opencl/cpu/cpu_cholesky.cpp @@ -7,16 +7,10 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include #include -#include - -#include -#include -#include -#include - #include +#include namespace opencl { diff --git a/src/backend/opencl/cpu/cpu_helper.hpp b/src/backend/opencl/cpu/cpu_helper.hpp index afc60d3b9f..d407bb83cc 100644 --- a/src/backend/opencl/cpu/cpu_helper.hpp +++ b/src/backend/opencl/cpu/cpu_helper.hpp @@ -7,22 +7,38 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#ifndef AF_OPENCL_CPU +#define AF_OPENCL_CPU + #include -#include -#include #include #include +#include +#include +#include + +#define lapack_complex_float opencl::cfloat +#define lapack_complex_double opencl::cdouble +#define LAPACK_PREFIX LAPACKE_ +#define ORDER_TYPE int +#define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR +#define LAPACK_NAME(fn) LAPACKE_##fn #ifdef __APPLE__ -#include -#else -#ifdef USE_MKL -#include + #include + #include + #undef AF_LAPACK_COL_MAJOR + #define AF_LAPACK_COL_MAJOR 0 #else -extern "C" { -#include -} -#endif + #ifdef USE_MKL + #include + #include + #else + extern "C" { + #include + } + #include + #endif #endif // TODO: Ask upstream for a more official way to detect it @@ -44,3 +60,4 @@ namespace cpu } } +#endif diff 
--git a/src/backend/opencl/cpu/cpu_inverse.cpp b/src/backend/opencl/cpu/cpu_inverse.cpp index f1418b23bd..24b4a670fd 100644 --- a/src/backend/opencl/cpu/cpu_inverse.cpp +++ b/src/backend/opencl/cpu/cpu_inverse.cpp @@ -7,11 +7,10 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include #include -#include -#include #include +#include namespace opencl { diff --git a/src/backend/opencl/cpu/cpu_lapack_helper.hpp b/src/backend/opencl/cpu/cpu_lapack_helper.hpp deleted file mode 100644 index 174022e772..0000000000 --- a/src/backend/opencl/cpu/cpu_lapack_helper.hpp +++ /dev/null @@ -1,35 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#ifndef AFCPU_LAPACK -#define AFCPU_LAPACK - -#include - -#define lapack_complex_float opencl::cfloat -#define lapack_complex_double opencl::cdouble -#define LAPACK_PREFIX LAPACKE_ -#define ORDER_TYPE int -#define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR -#define LAPACK_NAME(fn) LAPACKE_##fn - -#ifdef __APPLE__ -#include -#include -#undef AF_LAPACK_COL_MAJOR -#define AF_LAPACK_COL_MAJOR 0 -#else -#ifdef USE_MKL -#include -#else // NETLIB LAPACKE -#include -#endif -#endif - -#endif diff --git a/src/backend/opencl/cpu/cpu_lu.cpp b/src/backend/opencl/cpu/cpu_lu.cpp index f415cb3983..293cb8af86 100644 --- a/src/backend/opencl/cpu/cpu_lu.cpp +++ b/src/backend/opencl/cpu/cpu_lu.cpp @@ -7,15 +7,10 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include #include -#include - -#include -#include -#include -#include - +#include +#include #include namespace opencl diff --git a/src/backend/opencl/cpu/cpu_qr.cpp 
b/src/backend/opencl/cpu/cpu_qr.cpp index 080ebb6b69..24a915a5d1 100644 --- a/src/backend/opencl/cpu/cpu_qr.cpp +++ b/src/backend/opencl/cpu/cpu_qr.cpp @@ -7,17 +7,10 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include #include -#include -#include - -#include -#include -#include -#include - #include +#include namespace opencl { diff --git a/src/backend/opencl/cpu/cpu_solve.cpp b/src/backend/opencl/cpu/cpu_solve.cpp index 824bce2173..522454aa81 100644 --- a/src/backend/opencl/cpu/cpu_solve.cpp +++ b/src/backend/opencl/cpu/cpu_solve.cpp @@ -7,14 +7,11 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include #include -#include #include #include -#include - namespace opencl { namespace cpu diff --git a/src/backend/opencl/cpu/cpu_svd.cpp b/src/backend/opencl/cpu/cpu_svd.cpp index 85b9ee8280..66e4c0a7c5 100644 --- a/src/backend/opencl/cpu/cpu_svd.cpp +++ b/src/backend/opencl/cpu/cpu_svd.cpp @@ -7,10 +7,8 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include #include -#include - #include namespace opencl @@ -82,6 +80,10 @@ namespace cpu svd_func()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, iPtr, in.strides()[1], sPtr, uPtr, u.strides()[1], vPtr, vt.strides()[1], &superb[0]); #endif + unmapPtr(s.get() , sPtr); + unmapPtr(u.get() , uPtr); + unmapPtr(vt.get(), vPtr); + unmapPtr(in.get(), iPtr); } template diff --git a/src/backend/opencl/cpu/cpu_triangle.hpp b/src/backend/opencl/cpu/cpu_triangle.hpp index 5e40f929b9..f953d58507 100644 --- a/src/backend/opencl/cpu/cpu_triangle.hpp +++ b/src/backend/opencl/cpu/cpu_triangle.hpp @@ -9,6 +9,9 @@ #ifndef CPU_LAPACK_TRIANGLE #define CPU_LAPACK_TRIANGLE + +#include + namespace opencl { namespace cpu From e08d41bcced48eadca8c7f83e01eee616b0dc62a Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 17:10:34 -0500 
Subject: [PATCH 157/288] Update environment variables doc --- .../configuring_arrayfire_environment.md | 53 +++++++++++++++---- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/docs/pages/configuring_arrayfire_environment.md b/docs/pages/configuring_arrayfire_environment.md index 054068e224..3de8fbe295 100644 --- a/docs/pages/configuring_arrayfire_environment.md +++ b/docs/pages/configuring_arrayfire_environment.md @@ -18,6 +18,16 @@ This is the path with ArrayFire gets installed, ie. the includes and libs are present in this directory. You can use this variable to add include paths and libraries to your projects. +AF_PRINT_ERRORS {#af_print_errors} +------------------------------------------------------------------------------- + +When AF_PRINT_ERRORS is set to 1, the exceptions thrown are more verbose and +detailed. This helps in locating the exact failure. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +AF_PRINT_ERRORS=1 ./myprogram_opencl +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + AF_CUDA_DEFAULT_DEVICE {#af_cuda_default_device} ------------------------------------------------------------------------------- @@ -44,25 +54,48 @@ AF_OPENCL_DEFAULT_DEVICE=1 ./myprogram_opencl Note: af::setDevice call in the source code will take precedence over this variable. +AF_OPENCL_CPU_OFFLOAD {#af_opencl_cpu_offload} +------------------------------------------------------------------------------- + +When this variable is set to 1, and the selected OpenCL device has unified +memory with the host (ie. `CL_DEVICE_HOST_UNIFIED_MEMORY` is true for device), +then certain functions are offloaded to run on the CPU using mapped buffers. + +This takes advantage of fast libraries such as MKL while spending no time +copying memory from device to host. The device memory is mapped to a host +pointer which can be used in the offloaded functions. 
+ +AF_OPENCL_SHOW_BUILD_INFO {#af_opencl_show_build_info} +------------------------------------------------------------------------------- + +This variable is useful when debugging OpenCL kernel compilation failures. When +this variable is set to 1, and an error occurs during an OpenCL kernel +compilation, then the log and kernel are printed to screen. + AF_DISABLE_GRAPHICS {#af_disable_graphics} ------------------------------------------------------------------------------- -Setting this variable will disable window creation when graphics functions are -being called. Simply setting this variable will disable functionality, any -value will suffice. Disabling window creation will disable all other graphics -calls at runtime as well. +Setting this variable to 1 will disable window creation when graphics +functions are being called. Disabling window creation will disable all other +graphics calls at runtime as well. This is a useful enviornment variable when running code on servers and systems without displays. When graphics calls are run on such machines, they will print warning about window creation failing. To suppress those calls, set this variable. -AF_PRINT_ERRORS {#af_print_errors} +AF_SYNCHRONOUS_CALLS {#af_synchronous_calls} ------------------------------------------------------------------------------- -When AF_PRINT_ERRORS is set to 1, the exceptions thrown are more verbose and -detailed. This helps in locating the exact failure. +When this environment variable is set to 1, ArrayFire will execute all +functions synchronously. 
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -AF_PRINT_ERRORS=1 ./myprogram_opencl -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +AF_SHOW_LOAD_PATH {#af_show_load_path} +------------------------------------------------------------------------------- + +When using the Unified backend, if this variable is set to 1, it will show the +path where the ArrayFire backend libraries are loaded from. + +If the libraries are loaded from system paths, such as PATH or LD_LIBRARY_PATH +etc, then it will print "system path". If the libraries are loaded from other +paths, then those paths are shown in full. From 685dccd363e8da0e95b00b8ac73f70254f68d072 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 18:31:55 -0500 Subject: [PATCH 158/288] Update boost compute release tag --- CMakeModules/build_boost_compute.cmake | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/CMakeModules/build_boost_compute.cmake b/CMakeModules/build_boost_compute.cmake index c0de1cb291..03c20435a8 100644 --- a/CMakeModules/build_boost_compute.cmake +++ b/CMakeModules/build_boost_compute.cmake @@ -1,6 +1,9 @@ -SET(VER 79aa8f9086fdf6ef6db78e889de0273b0eb7bd19) -SET(URL https://github.com/boostorg/compute/archive/${VER}.tar.gz) -SET(MD5 dba3318cbdac912dddce71f2a38ffa43) +# If using a commit, remove the v prefix to VER in URL. 
+# If using a tag, don't use v in VER +# This is because of how github handles it's release tar balls +SET(VER 0.5) +SET(URL https://github.com/boostorg/compute/archive/v${VER}.tar.gz) +SET(MD5 69a52598ac539d3b7f6005a3dd2b6f58) SET(thirdPartyDir "${CMAKE_BINARY_DIR}/third_party") SET(srcDir "${thirdPartyDir}/compute-${VER}") From 6b7b1ce4ac32ea4dc9442a343822f90ada95cd37 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 18:32:11 -0500 Subject: [PATCH 159/288] Update clFFT release tag --- CMakeModules/build_clFFT.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeModules/build_clFFT.cmake b/CMakeModules/build_clFFT.cmake index e1dbb3fe1c..961347f913 100644 --- a/CMakeModules/build_clFFT.cmake +++ b/CMakeModules/build_clFFT.cmake @@ -14,7 +14,7 @@ ENDIF() ExternalProject_Add( clFFT-ext GIT_REPOSITORY https://github.com/arrayfire/clFFT.git - GIT_TAG 1597f0f35a644789c7ad77efe79014236cca2fab + GIT_TAG arrayfire-release-test PREFIX "${prefix}" INSTALL_DIR "${prefix}" UPDATE_COMMAND "" From b36d003e56222a2888184e985442339e8e5af567 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 22:11:41 -0500 Subject: [PATCH 160/288] Move MappedPtr into the Array class --- src/backend/opencl/Array.hpp | 43 +++++++++++++++++++++++++ src/backend/opencl/cpu/cpu_blas.cpp | 22 ++++++------- src/backend/opencl/cpu/cpu_cholesky.cpp | 14 ++++---- src/backend/opencl/cpu/cpu_inverse.cpp | 12 +++---- src/backend/opencl/cpu/cpu_lu.cpp | 37 +++++++++++---------- src/backend/opencl/cpu/cpu_qr.cpp | 27 ++++++---------- src/backend/opencl/cpu/cpu_solve.cpp | 42 +++++++++--------------- src/backend/opencl/cpu/cpu_svd.cpp | 29 ++++++++++------- src/backend/opencl/memory.cpp | 27 +--------------- src/backend/opencl/memory.hpp | 3 -- 10 files changed, 128 insertions(+), 128 deletions(-) diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp index 5f86d6d0b6..abce5b9166 100644 --- a/src/backend/opencl/Array.hpp +++ 
b/src/backend/opencl/Array.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace opencl { @@ -209,6 +210,48 @@ namespace opencl JIT::Node_ptr getNode() const; + private: + bool is_const() const + { + return true; + } + + bool is_const() + { + return false; + } + + public: + std::shared_ptr getMappedPtr() const + { + auto func = [=] (void* ptr) { + try { + if(ptr != nullptr) + getQueue().enqueueUnmapMemObject(*data, ptr); + ptr = nullptr; + } catch(cl::Error err) { + CL_TO_AF_ERROR(err); + } + }; + + T *ptr = nullptr; + try { + if(ptr == nullptr) { + if(is_const()) { + ptr = (T*)getQueue().enqueueMapBuffer(*const_cast(get()), true, CL_MAP_READ, + getOffset(), getDataDims().elements() * sizeof(T)); + } else { + ptr = (T*)getQueue().enqueueMapBuffer(*(get()), true, CL_MAP_READ|CL_MAP_WRITE, + getOffset(), getDataDims().elements() * sizeof(T)); + } + } + } catch(cl::Error err) { + CL_TO_AF_ERROR(err); + } + + return std::shared_ptr(ptr, func); + } + friend Array createValueArray(const af::dim4 &size, const T& value); friend Array createHostDataArray(const af::dim4 &size, const T * const data); friend Array createDeviceDataArray(const af::dim4 &size, const void *data); diff --git a/src/backend/opencl/cpu/cpu_blas.cpp b/src/backend/opencl/cpu/cpu_blas.cpp index 8c77fff8fd..1ff7e145d6 100644 --- a/src/backend/opencl/cpu/cpu_blas.cpp +++ b/src/backend/opencl/cpu/cpu_blas.cpp @@ -167,9 +167,9 @@ Array matmul(const Array &lhs, const Array &rhs, using BT = typename blas_base::type; // get host pointers from mapped memory - BT *lPtr = getMappedPtr(lhs.get()); - BT *rPtr = getMappedPtr(rhs.get()); - BT *oPtr = getMappedPtr(out.get()); + std::shared_ptr lPtr = lhs.getMappedPtr(); + std::shared_ptr rPtr = rhs.getMappedPtr(); + std::shared_ptr oPtr = out.getMappedPtr(); if(rDims[bColDim] == 1) { N = lDims[aColDim]; @@ -177,25 +177,21 @@ Array matmul(const Array &lhs, const Array &rhs, CblasColMajor, lOpts, lDims[0], lDims[1], alpha, - lPtr, lStrides[1], - rPtr, 
rStrides[0], + lPtr.get(), lStrides[1], + rPtr.get(), rStrides[0], beta, - oPtr, 1); + oPtr.get(), 1); } else { gemm_func()( CblasColMajor, lOpts, rOpts, M, N, K, alpha, - lPtr, lStrides[1], - rPtr, rStrides[1], + lPtr.get(), lStrides[1], + rPtr.get(), rStrides[1], beta, - oPtr, out.dims()[0]); + oPtr.get(), out.dims()[0]); } - unmapPtr(lhs.get(), lPtr); - unmapPtr(rhs.get(), rPtr); - unmapPtr(out.get(), oPtr); - return out; } diff --git a/src/backend/opencl/cpu/cpu_cholesky.cpp b/src/backend/opencl/cpu/cpu_cholesky.cpp index 74bbf594ae..bd871d7518 100644 --- a/src/backend/opencl/cpu/cpu_cholesky.cpp +++ b/src/backend/opencl/cpu/cpu_cholesky.cpp @@ -42,10 +42,10 @@ Array cholesky(int *info, const Array &in, const bool is_upper) Array out = copyArray(in); *info = cholesky_inplace(out, is_upper); - T* oPtr = getMappedPtr(out.get()); - if (is_upper) triangle(oPtr, oPtr, out.dims(), out.strides(), out.strides()); - else triangle(oPtr, oPtr, out.dims(), out.strides(), out.strides()); - unmapPtr(out.get(), oPtr); + std::shared_ptr oPtr = out.getMappedPtr(); + + if (is_upper) triangle(oPtr.get(), oPtr.get(), out.dims(), out.strides(), out.strides()); + else triangle(oPtr.get(), oPtr.get(), out.dims(), out.strides(), out.strides()); return out; } @@ -60,10 +60,10 @@ int cholesky_inplace(Array &in, const bool is_upper) if(is_upper) uplo = 'U'; - T* inPtr = getMappedPtr(in.get()); + std::shared_ptr inPtr = in.getMappedPtr(); + int info = potrf_func()(AF_LAPACK_COL_MAJOR, uplo, - N, inPtr, in.strides()[1]); - unmapPtr(in.get(), inPtr); + N, inPtr.get(), in.strides()[1]); return info; } diff --git a/src/backend/opencl/cpu/cpu_inverse.cpp b/src/backend/opencl/cpu/cpu_inverse.cpp index 24b4a670fd..fee171929a 100644 --- a/src/backend/opencl/cpu/cpu_inverse.cpp +++ b/src/backend/opencl/cpu/cpu_inverse.cpp @@ -51,15 +51,13 @@ Array inverse(const Array &in) Array pivot = cpu::lu_inplace(A, false); - T *aPtr = getMappedPtr(A.get()); - int *pPtr = getMappedPtr(pivot.get()); - 
getri_func()(AF_LAPACK_COL_MAJOR, M, - aPtr, A.strides()[1], - pPtr); + std::shared_ptr aPtr = A.getMappedPtr(); + std::shared_ptr pPtr = pivot.getMappedPtr(); - unmapPtr(A.get(), aPtr); - unmapPtr(pivot.get(), pPtr); + getri_func()(AF_LAPACK_COL_MAJOR, M, + aPtr.get(), A.strides()[1], + pPtr.get()); return A; } diff --git a/src/backend/opencl/cpu/cpu_lu.cpp b/src/backend/opencl/cpu/cpu_lu.cpp index 293cb8af86..3eb574e743 100644 --- a/src/backend/opencl/cpu/cpu_lu.cpp +++ b/src/backend/opencl/cpu/cpu_lu.cpp @@ -40,9 +40,13 @@ LU_FUNC(getrf , cdouble, z) template void lu_split(Array &lower, Array &upper, const Array &in) { - T *l = getMappedPtr(lower.get()); - T *u = getMappedPtr(upper.get()); - T *i = getMappedPtr(in.get()); + std::shared_ptr ls = lower.getMappedPtr(); + std::shared_ptr us = upper.getMappedPtr(); + std::shared_ptr is = in.getMappedPtr(); + + T *l = ls.get(); + T *u = us.get(); + T *i = is.get(); dim4 ldm = lower.dims(); dim4 udm = upper.dims(); @@ -91,18 +95,17 @@ void lu_split(Array &lower, Array &upper, const Array &in) } } } - - unmapPtr(lower.get(), l); - unmapPtr(upper.get(), u); - unmapPtr(in.get(), i); } void convertPivot(Array &pivot, int out_sz) { Array p = range(dim4(out_sz), 0); // Runs opencl - int *d_pi = getMappedPtr(pivot.get()); - int *d_po = getMappedPtr(p.get()); + std::shared_ptr pi = pivot.getMappedPtr(); + std::shared_ptr po = p.getMappedPtr(); + + int *d_pi = pi.get(); + int *d_po = po.get(); dim_t d0 = pivot.dims()[0]; @@ -111,8 +114,8 @@ void convertPivot(Array &pivot, int out_sz) std::swap(d_po[j], d_po[d_pi[j] - 1]); } - unmapPtr(pivot.get(), d_pi); - unmapPtr(p.get(), d_po); + pi.reset(); + po.reset(); pivot = p; } @@ -145,15 +148,15 @@ Array lu_inplace(Array &in, const bool convert_pivot) Array pivot = createEmptyArray(af::dim4(min(M, N), 1, 1, 1)); - T *inPtr = getMappedPtr(in.get()); - int *pivotPtr = getMappedPtr(pivot.get()); + std::shared_ptr inPtr = in.getMappedPtr(); + std::shared_ptr piPtr = pivot.getMappedPtr(); 
getrf_func()(AF_LAPACK_COL_MAJOR, M, N, - inPtr, in.strides()[1], - pivotPtr); + inPtr.get(), in.strides()[1], + piPtr.get()); - unmapPtr(in.get(), inPtr); - unmapPtr(pivot.get(), pivotPtr); + inPtr.reset(); + piPtr.reset(); if(convert_pivot) convertPivot(pivot, M); diff --git a/src/backend/opencl/cpu/cpu_qr.cpp b/src/backend/opencl/cpu/cpu_qr.cpp index 24a915a5d1..32eca92963 100644 --- a/src/backend/opencl/cpu/cpu_qr.cpp +++ b/src/backend/opencl/cpu/cpu_qr.cpp @@ -70,20 +70,16 @@ void qr(Array &q, Array &r, Array &t, const Array &in) dim4 rdims(M, N); r = createEmptyArray(rdims); - T *qPtr = getMappedPtr(q.get()); - T *rPtr = getMappedPtr(r.get()); - T *tPtr = getMappedPtr(t.get()); + std::shared_ptr qPtr = q.getMappedPtr(); + std::shared_ptr rPtr = r.getMappedPtr(); + std::shared_ptr tPtr = t.getMappedPtr(); - triangle(rPtr, qPtr, rdims, r.strides(), q.strides()); + triangle(rPtr.get(), qPtr.get(), rdims, r.strides(), q.strides()); gqr_func()(AF_LAPACK_COL_MAJOR, M, M, min(M, N), - qPtr, q.strides()[1], - tPtr); - - unmapPtr(q.get(), qPtr); - unmapPtr(r.get(), rPtr); - unmapPtr(t.get(), tPtr); + qPtr.get(), q.strides()[1], + tPtr.get()); q.resetDims(dim4(M, M)); } @@ -97,15 +93,12 @@ Array qr_inplace(Array &in) Array t = createEmptyArray(af::dim4(min(M, N), 1, 1, 1)); - T *iPtr = getMappedPtr(in.get()); - T *tPtr = getMappedPtr(t.get()); + std::shared_ptr iPtr = in.getMappedPtr(); + std::shared_ptr tPtr = t.getMappedPtr(); geqrf_func()(AF_LAPACK_COL_MAJOR, M, N, - iPtr, in.strides()[1], - tPtr); - - unmapPtr(in.get(), iPtr); - unmapPtr(t.get(), tPtr); + iPtr.get(), in.strides()[1], + tPtr.get()); return t; } diff --git a/src/backend/opencl/cpu/cpu_solve.cpp b/src/backend/opencl/cpu/cpu_solve.cpp index 522454aa81..9e4f0932ac 100644 --- a/src/backend/opencl/cpu/cpu_solve.cpp +++ b/src/backend/opencl/cpu/cpu_solve.cpp @@ -85,19 +85,15 @@ Array solveLU(const Array &A, const Array &pivot, Array B = copyArray(b); - T *aPtr = getMappedPtr(A.get()); - T *bPtr = 
getMappedPtr(B.get()); - int *pPtr = getMappedPtr(pivot.get()); + std::shared_ptr aPtr = A.getMappedPtr(); + std::shared_ptr bPtr = B.getMappedPtr(); + std::shared_ptr pPtr = pivot.getMappedPtr(); getrs_func()(AF_LAPACK_COL_MAJOR, 'N', N, NRHS, - aPtr, A.strides()[1], - pPtr, - bPtr, B.strides()[1]); - - unmapPtr(A.get(), aPtr); - unmapPtr(B.get(), bPtr); - unmapPtr(pivot.get(), pPtr); + aPtr.get(), A.strides()[1], + pPtr.get(), + bPtr.get(), B.strides()[1]); return B; } @@ -109,19 +105,16 @@ Array triangleSolve(const Array &A, const Array &b, const af_mat_prop o int N = B.dims()[0]; int NRHS = B.dims()[1]; - T *aPtr = getMappedPtr(A.get()); - T *bPtr = getMappedPtr(B.get()); + std::shared_ptr aPtr = A.getMappedPtr(); + std::shared_ptr bPtr = B.getMappedPtr(); trtrs_func()(AF_LAPACK_COL_MAJOR, options & AF_MAT_UPPER ? 'U' : 'L', 'N', // transpose flag options & AF_MAT_DIAG_UNIT ? 'U' : 'N', N, NRHS, - aPtr, A.strides()[1], - bPtr, B.strides()[1]); - - unmapPtr(A.get(), aPtr); - unmapPtr(B.get(), bPtr); + aPtr.get(), A.strides()[1], + bPtr.get(), B.strides()[1]); return B; } @@ -143,29 +136,26 @@ Array solve(const Array &a, const Array &b, const af_mat_prop options) Array A = copyArray(a); Array B = padArray(b, dim4(max(M, N), K), scalar(0)); - T *aPtr = getMappedPtr(A.get()); - T *bPtr = getMappedPtr(B.get()); + std::shared_ptr aPtr = A.getMappedPtr(); + std::shared_ptr bPtr = B.getMappedPtr(); if(M == N) { std::vector pivot(N); gesv_func()(AF_LAPACK_COL_MAJOR, N, K, - aPtr, A.strides()[1], + aPtr.get(), A.strides()[1], &pivot.front(), - bPtr, B.strides()[1]); + bPtr.get(), B.strides()[1]); } else { int sM = a.strides()[1]; int sN = a.strides()[2] / sM; gels_func()(AF_LAPACK_COL_MAJOR, 'N', M, N, K, - aPtr, A.strides()[1], - bPtr, max(sM, sN)); + aPtr.get(), A.strides()[1], + bPtr.get(), max(sM, sN)); B.resetDims(dim4(N, K)); } - unmapPtr(A.get(), aPtr); - unmapPtr(B.get(), bPtr); - return B; } diff --git a/src/backend/opencl/cpu/cpu_svd.cpp 
b/src/backend/opencl/cpu/cpu_svd.cpp index 66e4c0a7c5..c53df8ae78 100644 --- a/src/backend/opencl/cpu/cpu_svd.cpp +++ b/src/backend/opencl/cpu/cpu_svd.cpp @@ -67,23 +67,28 @@ namespace cpu int M = iDims[0]; int N = iDims[1]; - Tr *sPtr = getMappedPtr(s.get()); - T *uPtr = getMappedPtr(u.get()); - T *vPtr = getMappedPtr(vt.get()); - T *iPtr = getMappedPtr(in.get()); + std::shared_ptr sPtr = s.getMappedPtr(); + std::shared_ptr uPtr = u.getMappedPtr(); + std::shared_ptr vPtr = vt.getMappedPtr(); + std::shared_ptr iPtr = in.getMappedPtr(); #if defined(USE_MKL) || defined(__APPLE__) - svd_func()(AF_LAPACK_COL_MAJOR, 'A', M, N, iPtr, in.strides()[1], - sPtr, uPtr, u.strides()[1], vPtr, vt.strides()[1]); + svd_func()(AF_LAPACK_COL_MAJOR, 'A', + M, N, + iPtr.get(), in.strides()[1], + sPtr.get(), + uPtr.get(), u.strides()[1], + vPtr.get(), vt.strides()[1]); #else std::vector superb(std::min(M, N)); - svd_func()(AF_LAPACK_COL_MAJOR, 'A', 'A', M, N, iPtr, in.strides()[1], - sPtr, uPtr, u.strides()[1], vPtr, vt.strides()[1], &superb[0]); + svd_func()(AF_LAPACK_COL_MAJOR, 'A', 'A', + M, N, + iPtr.get(), in.strides()[1], + sPtr.get(), + uPtr.get(), u.strides()[1], + vPtr.get(), vt.strides()[1], + &superb[0]); #endif - unmapPtr(s.get() , sPtr); - unmapPtr(u.get() , uPtr); - unmapPtr(vt.get(), vPtr); - unmapPtr(in.get(), iPtr); } template diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index 924e370a64..cf3f4ccc4e 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace opencl { @@ -281,29 +282,6 @@ namespace opencl return bufferPush((cl::Buffer *)ptr); } - template - T *getMappedPtr(const cl::Buffer *buf) - { - int n = getActiveDeviceId(); - mem_iter iter = memory_maps[n].find(const_cast(buf)); - - if (iter == memory_maps[n].end()) { - // Buffer not found in memory manager - // Very Very Bad - return NULL; - } - size_t alloc_bytes = iter->second.bytes; - - T *ptr = 
(T*)getQueue().enqueueMapBuffer( - *buf, true, CL_MAP_READ, 0, alloc_bytes); - return ptr; - } - - void unmapPtr(const cl::Buffer *buf, void *ptr) - { - getQueue().enqueueUnmapMemObject(*buf, ptr); - } - // pinned memory manager typedef struct { cl::Buffer *buf; @@ -426,7 +404,6 @@ namespace opencl template void memPush(const T* ptr); \ template T* pinnedAlloc(const size_t &elements); \ template void pinnedFree(T* ptr); \ - template T* getMappedPtr(const cl::Buffer *buf); \ INSTANTIATE(float) INSTANTIATE(cfloat) @@ -440,6 +417,4 @@ namespace opencl INSTANTIATE(uintl) INSTANTIATE(short) INSTANTIATE(ushort) - - template void* getMappedPtr(const cl::Buffer *buf); } diff --git a/src/backend/opencl/memory.hpp b/src/backend/opencl/memory.hpp index f337a7a1bd..96292cdfac 100644 --- a/src/backend/opencl/memory.hpp +++ b/src/backend/opencl/memory.hpp @@ -32,9 +32,6 @@ namespace opencl template void memPop(const T *ptr); template void memPush(const T *ptr); - template T *getMappedPtr(const cl::Buffer *buf); - void unmapPtr(const cl::Buffer *buf, void *ptr); - template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); From 56f9140d880b5816817aa8e3bd78c114492de56b Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 23:21:01 -0500 Subject: [PATCH 161/288] FEAT Add getActiveBackend function --- docs/details/backend.dox | 9 +++++++++ include/af/backend.h | 20 ++++++++++++++++++++ src/api/c/device.cpp | 6 ++++++ src/api/cpp/device.cpp | 7 +++++++ src/api/unified/device.cpp | 6 ++++++ test/backend.cpp | 15 +++++++++++++++ 6 files changed, 63 insertions(+) diff --git a/docs/details/backend.dox b/docs/details/backend.dox index 4d9cdf6f53..146cc14313 100644 --- a/docs/details/backend.dox +++ b/docs/details/backend.dox @@ -71,5 +71,14 @@ The return value specifies which backend the array was created on. 
======================================================================= +\defgroup unified_func_getactivebackend getActiveBackend + +\brief Gets the backend enum for the active backend + +\ingroup unified_func +\ingroup arrayfire_func + +======================================================================= + @} */ diff --git a/include/af/backend.h b/include/af/backend.h index 93d8d8de58..0342ef0ade 100644 --- a/include/af/backend.h +++ b/include/af/backend.h @@ -55,6 +55,17 @@ AFAPI af_err af_get_available_backends(int* backends); AFAPI af_err af_get_backend_id(af_backend *backend, const af_array in); #endif +#if AF_API_VERSION >= 33 +/** + \param[out] backend takes one of the values of enum \ref af_backend + from the backend that is currently set to active + \returns \ref af_err error code + + \ingroup unified_func_getactivebackend + */ +AFAPI af_err af_get_active_backend(af_backend *backend); +#endif + #ifdef __cplusplus } #endif @@ -101,5 +112,14 @@ AFAPI int getAvailableBackends(); AFAPI af::Backend getBackendId(const array &in); #endif +#if AF_API_VERSION >= 33 +/** + \returns \ref af_backend which is the backend that is currently active + + \ingroup unified_func_getactivebackend + */ +AFAPI af::Backend getActiveBackend(); +#endif + } #endif diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index 8f332994e7..d782211367 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -53,6 +53,12 @@ af_err af_get_backend_id(af_backend *result, const af_array in) return AF_SUCCESS; } +af_err af_get_active_backend(af_backend *result) +{ + *result = (af_backend)getBackend(); + return AF_SUCCESS; +} + af_err af_init() { try { diff --git a/src/api/cpp/device.cpp b/src/api/cpp/device.cpp index 3b1609b9d4..5e4b0f7bf0 100644 --- a/src/api/cpp/device.cpp +++ b/src/api/cpp/device.cpp @@ -42,6 +42,13 @@ namespace af return result; } + af::Backend getActiveBackend() + { + af::Backend result = (af::Backend)0; + AF_THROW(af_get_active_backend(&result)); + return result; + 
} + void info() { AF_THROW(af_info()); diff --git a/src/api/unified/device.cpp b/src/api/unified/device.cpp index f7e95569c9..fbd8e32f90 100644 --- a/src/api/unified/device.cpp +++ b/src/api/unified/device.cpp @@ -35,6 +35,12 @@ af_err af_get_backend_id(af_backend *result, const af_array in) return CALL(result, in); } +af_err af_get_active_backend(af_backend *result) +{ + *result = unified::AFSymbolManager::getInstance().getActiveBackend(); + return AF_SUCCESS; +} + af_err af_info() { return CALL_NO_PARAMS(); diff --git a/test/backend.cpp b/test/backend.cpp index 7b8dbddade..4bb5cdf7fe 100644 --- a/test/backend.cpp +++ b/test/backend.cpp @@ -21,11 +21,26 @@ using std::string; using std::vector; +const char *getActiveBackendString() +{ + af_backend active = (af_backend)0; + af_get_active_backend(&active); + + switch(active) { + case AF_BACKEND_CPU : return "AF_BACKEND_CPU"; + case AF_BACKEND_CUDA : return "AF_BACKEND_CUDA"; + case AF_BACKEND_OPENCL: return "AF_BACKEND_OPENCL"; + default : return "AF_BACKEND_DEFAULT"; + } +} + template void testFunction() { af_info(); + printf("Active Backend Enum = %s\n", getActiveBackendString()); + af_array outArray = 0; dim_t dims[] = {32, 32}; ASSERT_EQ(AF_SUCCESS, af_randu(&outArray, 2, dims, (af_dtype) af::dtype_traits::af_type)); From 3047acd599e86136578e80f5f35ca706456ffe7a Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Sat, 9 Jan 2016 04:59:39 -0500 Subject: [PATCH 162/288] Add the ability to sort OpenCL devices Using the following criterion 1. GPUs > Accelerators > CPUs. 2. IN GPUs: a. Discreet preferred to integrated b. AMD > NVIDIA > APPLE > Intel / BEIGNET 3. IN CPUs Intel > AMD > POCL > other 4. While everything above is the same: a. Higher OpenCL compute version preferred b. 
Higher amount of memory preferred --- src/backend/opencl/platform.cpp | 215 +++++++++++++++++++++++--------- 1 file changed, 155 insertions(+), 60 deletions(-) diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 0cd46d25f6..3bf13c0690 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -110,6 +110,94 @@ void DeviceManager::setContext(int device) mActiveCtxId = device; } +static inline bool verify_present(std::string pname, const char *ref) +{ + return pname.find(ref) != std::string::npos; +} + +static inline bool compare_default(const Device *ldev, const Device *rdev) +{ + const cl_device_type device_types[] = {CL_DEVICE_TYPE_GPU, + CL_DEVICE_TYPE_ACCELERATOR}; + + auto l_dev_type = ldev->getInfo(); + auto r_dev_type = rdev->getInfo(); + + // This ensures GPU > ACCELERATOR > CPU + for (auto current_type : device_types) { + auto is_l_curr_type = l_dev_type == current_type; + auto is_r_curr_type = r_dev_type == current_type; + + if ( is_l_curr_type && !is_r_curr_type) return true; + if (!is_l_curr_type && is_r_curr_type) return false; + } + + // For GPUs, this ensures discreet > integrated + auto is_l_integrared = ldev->getInfo(); + auto is_r_integrared = rdev->getInfo(); + + if (!is_l_integrared && is_r_integrared) return true; + if ( is_l_integrared && !is_r_integrared) return false; + + // At this point, the devices are of same type. 
+ // Sort based on emperical evidence of preferred platforms + + // Prefer AMD first + std::string lPlatName = getPlatformName(*ldev); + std::string rPlatName = getPlatformName(*rdev); + + if (l_dev_type == CL_DEVICE_TYPE_GPU && + r_dev_type == CL_DEVICE_TYPE_GPU ) { + // If GPU, prefer AMD > NVIDIA > Beignet / Intel > APPLE + const char *platforms[] = {"AMD", "NVIDIA", "APPLE", "INTEL", "BEIGNET"}; + + for (auto ref_name : platforms) { + if ( verify_present(lPlatName, ref_name) && + !verify_present(rPlatName, ref_name)) return true; + + if (!verify_present(lPlatName, ref_name) && + verify_present(rPlatName, ref_name)) return false; + } + + // Intel falls back to compare based on memory + } else { + // If CPU, prefer Intel > AMD > POCL > APPLE + const char *platforms[] = {"INTEL", "AMD", "POCL", "APPLE"}; + + for (auto ref_name : platforms) { + if ( verify_present(lPlatName, ref_name) && + !verify_present(rPlatName, ref_name)) return true; + + if (!verify_present(lPlatName, ref_name) && + verify_present(rPlatName, ref_name)) return false; + } + } + + + // Compare device compute versions + + { + // Check Device OpenCL Version + auto lversion = ldev->getInfo(); + auto rversion = rdev->getInfo(); + + auto lres = (lversion[7] > rversion[7]) || + ((lversion[7] == rversion[7]) && (lversion[9] > rversion[9])); + + auto rres = (lversion[7] < rversion[7]) || + ((lversion[7] == rversion[7]) && (lversion[9] < rversion[9])); + + if (lres > 0) return true; + if (rres < 0) return false; + } + + // Default crietria, sort based on memory + // Sort based on memory + auto l_mem = ldev->getInfo(); + auto r_mem = rdev->getInfo(); + return l_mem >= r_mem; +} + DeviceManager::DeviceManager() : mUserDeviceOffset(0), mActiveCtxId(0), mActiveQId(0) { @@ -117,41 +205,46 @@ DeviceManager::DeviceManager() std::vector platforms; Platform::get(&platforms); - cl_device_type DEVC_TYPES[] = { - CL_DEVICE_TYPE_GPU, -#ifndef OS_MAC - CL_DEVICE_TYPE_ACCELERATOR, - CL_DEVICE_TYPE_CPU + // This is all 
we need because the sort takes care of the order of devices +#ifdef OS_MAC + cl_device_type DEVICE_TYPES = CL_DEVICE_TYPE_GPU; +#else + cl_device_type DEVICE_TYPES = CL_DEVICE_TYPE_ALL; #endif - }; - - unsigned nDevices = 0; - for (auto devType : DEVC_TYPES) { - for (auto &platform : platforms) { - - cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, - (cl_context_properties)(platform()), - 0}; - - std::vector devs; - try { - platform.getDevices(devType, &devs); - } catch(const cl::Error &err) { - if (err.err() != CL_DEVICE_NOT_FOUND) { - throw; - } - } - for (auto dev : devs) { - nDevices++; - Context *ctx = new Context(dev, cps); - CommandQueue *cq = new CommandQueue(*ctx, dev); - mDevices.push_back(new Device(dev)); - mContexts.push_back(ctx); - mQueues.push_back(cq); - mIsGLSharingOn.push_back(false); + // Iterate through platforms, get all available devices and store them + for (auto &platform : platforms) { + std::vector current_devices; + + try { + platform.getDevices(DEVICE_TYPES, ¤t_devices); + } catch(const cl::Error &err) { + if (err.err() != CL_DEVICE_NOT_FOUND) { + throw; } } + + for (auto dev : current_devices) { + mDevices.push_back(new Device(dev)); + } + } + + // Sort OpenCL devices based on default criteria + std::stable_sort(mDevices.begin(), mDevices.end(), compare_default); + + // Create contexts and queues once the sort is done + int nDevices = mDevices.size(); + for (int i = 0; i < nDevices; i++) { + cl_platform_id device_platform = mDevices[i]->getInfo(); + cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, + (cl_context_properties)(device_platform), + 0}; + + Context *ctx = new Context(*mDevices[i], cps); + CommandQueue *cq = new CommandQueue(*ctx, *mDevices[i]); + mContexts.push_back(ctx); + mQueues.push_back(cq); + mIsGLSharingOn.push_back(false); } std::string deviceENV = getEnvVar("AF_OPENCL_DEFAULT_DEVICE"); @@ -204,11 +297,12 @@ static std::string platformMap(std::string &platStr) typedef std::map strmap_t; static strmap_t 
platMap; if (isFirst) { - platMap["NVIDIA CUDA"] = "NVIDIA "; - platMap["Intel(R) OpenCL"] = "INTEL "; + platMap["NVIDIA CUDA"] = "NVIDIA "; + platMap["Intel(R) OpenCL"] = "INTEL "; platMap["AMD Accelerated Parallel Processing"] = "AMD "; - platMap["Intel Gen OCL Driver"] = "BEIGNET "; - platMap["Apple"] = "APPLE "; + platMap["Intel Gen OCL Driver"] = "BEIGNET "; + platMap["Apple"] = "APPLE "; + platMap["Portable Computing Language"] = "POCL "; isFirst = false; } @@ -228,38 +322,37 @@ std::string getInfo() << " (OpenCL, " << get_system() << ", build " << AF_REVISION << ")" << std::endl; unsigned nDevices = 0; - for (auto context : DeviceManager::getInstance().mContexts) { - vector devices = context->getInfo(); + for(auto &device: DeviceManager::getInstance().mDevices) { + const Platform platform(device->getInfo()); - for(auto &device:devices) { - const Platform platform(device.getInfo()); + string dstr = device->getInfo(); - string platStr = platform.getInfo(); - string dstr = device.getInfo(); + // Remove null termination character from the strings + dstr.pop_back(); + + bool show_braces = ((unsigned)getActiveDeviceId() == nDevices); - // Remove null termination character from the strings - platStr.pop_back(); - dstr.pop_back(); + string id = + (show_braces ? string("[") : "-") + + std::to_string(nDevices) + + (show_braces ? string("]") : "-"); - bool show_braces = ((unsigned)getActiveDeviceId() == nDevices); - string id = (show_braces ? string("[") : "-") + std::to_string(nDevices) + - (show_braces ? string("]") : "-"); - info << id << " " << platformMap(platStr) << ": " << ltrim(dstr) << " "; + info << id << " " << getPlatformName(*device) << ": " << ltrim(dstr); #ifndef NDEBUG - string devVersion = device.getInfo(); - string driVersion = device.getInfo(); - devVersion.pop_back(); - driVersion.pop_back(); - info << devVersion; - info << " Device driver " << driVersion; - info << " FP64 Support(" - << (device.getInfo()>0 ? 
"True" : "False") - << ")"; + info << " -- "; + string devVersion = device->getInfo(); + string driVersion = device->getInfo(); + devVersion.pop_back(); + driVersion.pop_back(); + info << devVersion; + info << " -- Device driver " << driVersion; + info << " -- FP64 Support: " + << (device->getInfo()>0 ? "True" : "False") + << ""; #endif - info << std::endl; + info << std::endl; - nDevices++; - } + nDevices++; } return info.str(); } @@ -268,6 +361,8 @@ std::string getPlatformName(const cl::Device &device) { const Platform platform(device.getInfo()); std::string platStr = platform.getInfo(); + // Remove null termination character from the strings + platStr.pop_back(); return platformMap(platStr); } From 8873ed244abaabb3bedf8781a44f096d790fa4b5 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Sun, 10 Jan 2016 02:45:28 -0500 Subject: [PATCH 163/288] Using proper offsets for loadImageNative and saveImageNative --- src/api/c/imageio2.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/api/c/imageio2.cpp b/src/api/c/imageio2.cpp index adc4244953..aed793e64a 100644 --- a/src/api/c/imageio2.cpp +++ b/src/api/c/imageio2.cpp @@ -61,9 +61,9 @@ static af_err readImage_t(af_array *rImage, const uchar* pSrcLine, const int nSr } else { // Non 8-bit types do not use ordering // See Pixel Access Functions Chapter in FreeImage Doc - pDst0[indx] = (T) *(src + (x * step + 0)); - pDst1[indx] = (T) *(src + (x * step + 1)); - pDst2[indx] = (T) *(src + (x * step + 2)); + pDst0[indx] = (T) *(src + (x * step + FI_RGBA_RED)); + pDst1[indx] = (T) *(src + (x * step + FI_RGBA_GREEN)); + pDst2[indx] = (T) *(src + (x * step + FI_RGBA_BLUE)); } if (fi_color == 4) pDst3[indx] = (T) *(src + (x * step + FI_RGBA_ALPHA)); } @@ -239,15 +239,15 @@ static void save_t(T* pDstLine, const af_array in, const dim4 dims, uint nDstPit *(pDstLine + x * step + FI_RGBA_RED) = (T) pSrc0[indx]; // r -> 0 } else if(channels >=3) { if((af_dtype) af::dtype_traits::af_type == 
u8) { - *(pDstLine + x * step + FI_RGBA_BLUE) = (T) pSrc2[indx]; // b -> 0 + *(pDstLine + x * step + FI_RGBA_RED ) = (T) pSrc0[indx]; // r -> 0 *(pDstLine + x * step + FI_RGBA_GREEN) = (T) pSrc1[indx]; // g -> 1 - *(pDstLine + x * step + FI_RGBA_RED) = (T) pSrc0[indx]; // r -> 2 + *(pDstLine + x * step + FI_RGBA_BLUE ) = (T) pSrc2[indx]; // b -> 2 } else { // Non 8-bit types do not use ordering // See Pixel Access Functions Chapter in FreeImage Doc - *(pDstLine + x * step + 0) = (T) pSrc0[indx]; // r -> 0 - *(pDstLine + x * step + 1) = (T) pSrc1[indx]; // g -> 1 - *(pDstLine + x * step + 2) = (T) pSrc2[indx]; // b -> 2 + *(pDstLine + x * step + FI_RGBA_RED ) = (T) pSrc0[indx]; // r -> 0 + *(pDstLine + x * step + FI_RGBA_GREEN) = (T) pSrc1[indx]; // g -> 1 + *(pDstLine + x * step + FI_RGBA_BLUE ) = (T) pSrc2[indx]; // b -> 2 } } if(channels >= 4) *(pDstLine + x * step + FI_RGBA_ALPHA) = (T) pSrc3[indx]; // a From 14230d21b36ddbc40f59e33011dc8153861ed92a Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Sun, 10 Jan 2016 02:46:12 -0500 Subject: [PATCH 164/288] Adding environment variables to choose OpenCL device 1. AF_OPENCL_DEFAULT_DEVICE_TYPE - Can be one of CPU, GPU and ACC - When not set, defaults to first available device - Chooses what the default device should be - Does not disable other devices 2. 
AF_OPENCL_DEVICE_TYPE - Can be one of CPU, GPU, ACC, ALL - When not set defaults to ALL - Only chooses devices of given type --- src/backend/opencl/platform.cpp | 54 ++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 3bf13c0690..822fdfceb7 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -212,6 +212,18 @@ DeviceManager::DeviceManager() cl_device_type DEVICE_TYPES = CL_DEVICE_TYPE_ALL; #endif + std::string deviceENV = getEnvVar("AF_OPENCL_DEVICE_TYPE"); + + if (deviceENV.compare("GPU") == 0) { + DEVICE_TYPES = CL_DEVICE_TYPE_GPU; + } else if (deviceENV.compare("CPU") == 0) { + DEVICE_TYPES = CL_DEVICE_TYPE_CPU; + } else if (deviceENV.compare("ACC") >= 0) { + DEVICE_TYPES = CL_DEVICE_TYPE_ACCELERATOR; + } + + + // Iterate through platforms, get all available devices and store them for (auto &platform : platforms) { std::vector current_devices; @@ -229,11 +241,14 @@ DeviceManager::DeviceManager() } } + int nDevices = mDevices.size(); + + if (nDevices == 0) AF_ERROR("No OpenCL devices found", AF_ERR_RUNTIME); + // Sort OpenCL devices based on default criteria std::stable_sort(mDevices.begin(), mDevices.end(), compare_default); // Create contexts and queues once the sort is done - int nDevices = mDevices.size(); for (int i = 0; i < nDevices; i++) { cl_platform_id device_platform = mDevices[i]->getInfo(); cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, @@ -247,7 +262,8 @@ DeviceManager::DeviceManager() mIsGLSharingOn.push_back(false); } - std::string deviceENV = getEnvVar("AF_OPENCL_DEFAULT_DEVICE"); + bool default_device_set = false; + deviceENV = getEnvVar("AF_OPENCL_DEFAULT_DEVICE"); if(!deviceENV.empty()) { std::stringstream s(deviceENV); int def_device = -1; @@ -257,18 +273,48 @@ DeviceManager::DeviceManager() printf("Setting default device as 0\n"); } else { setContext(def_device); + default_device_set = 
true; } } + + deviceENV = getEnvVar("AF_OPENCL_DEFAULT_DEVICE_TYPE"); + if (!default_device_set && !deviceENV.empty()) + { + cl_device_type default_device_type = CL_DEVICE_TYPE_GPU; + if (deviceENV.compare("CPU") == 0) { + default_device_type = CL_DEVICE_TYPE_CPU; + } else if (deviceENV.compare("ACC") >= 0) { + default_device_type = CL_DEVICE_TYPE_ACCELERATOR; + } + + bool default_device_set = false; + for (int i = 0; i < nDevices; i++) { + if (mDevices[i]->getInfo() == default_device_type) { + default_device_set = true; + setContext(i); + break; + } + } + + if (!default_device_set) { + printf("WARNING: AF_OPENCL_DEFAULT_DEVICE_TYPE=%s is not available\n", + deviceENV.c_str()); + printf("Using default device as 0\n"); + } + } + } catch (const cl::Error &error) { CL_TO_AF_ERROR(error); } - /* loop over devices and replace contexts with - * OpenGL shared contexts whereever applicable */ + + #if defined(WITH_GRAPHICS) // Define AF_DISABLE_GRAPHICS with any value to disable initialization std::string noGraphicsENV = getEnvVar("AF_DISABLE_GRAPHICS"); if(noGraphicsENV.empty()) { // If AF_DISABLE_GRAPHICS is not defined try { + /* loop over devices and replace contexts with + * OpenGL shared contexts whereever applicable */ int devCount = mDevices.size(); fg::Window* wHandle = graphics::ForgeManager::getInstance().getMainWindow(); for(int i=0; i Date: Sun, 10 Jan 2016 03:22:38 -0500 Subject: [PATCH 165/288] Cleaning up exception handling in src/api/c --- src/api/c/assign.cpp | 4 ++-- src/api/c/device.cpp | 18 ++++++++++++------ src/api/c/flip.cpp | 2 +- src/api/c/image.cpp | 2 +- src/api/c/index.cpp | 6 ++---- 5 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/api/c/assign.cpp b/src/api/c/assign.cpp index 50224d32a6..bf2c185a10 100644 --- a/src/api/c/assign.cpp +++ b/src/api/c/assign.cpp @@ -350,10 +350,10 @@ af_err af_assign_gen(af_array *out, throw; } if (is_vector) { AF_CHECK(af_release_array(rhs)); } + + std::swap(*out, output); } CATCHALL; - 
std::swap(*out, output); - return AF_SUCCESS; } diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index 8f332994e7..731e98efec 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -39,7 +39,9 @@ af_err af_get_backend_count(unsigned* num_backends) af_err af_get_available_backends(int* result) { - *result = getBackend(); + try { + *result = getBackend(); + } CATCHALL; return AF_SUCCESS; } @@ -67,7 +69,9 @@ af_err af_init() af_err af_info() { - printf("%s", getInfo().c_str()); + try { + printf("%s", getInfo().c_str()); + } CATCHALL; return AF_SUCCESS; } @@ -326,7 +330,6 @@ af_err af_free_pinned(void *ptr) af_err af_alloc_host(void **ptr, const dim_t bytes) { try { - AF_CHECK(af_init()); *ptr = malloc(bytes); } CATCHALL; return AF_SUCCESS; @@ -335,7 +338,6 @@ af_err af_alloc_host(void **ptr, const dim_t bytes) af_err af_free_host(void *ptr) { try { - AF_CHECK(af_init()); free(ptr); } CATCHALL; return AF_SUCCESS; @@ -376,12 +378,16 @@ af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers, af_err af_set_mem_step_size(const size_t step_bytes) { - detail::setMemStepSize(step_bytes); + try{ + detail::setMemStepSize(step_bytes); + } CATCHALL; return AF_SUCCESS; } af_err af_get_mem_step_size(size_t *step_bytes) { - *step_bytes = detail::getMemStepSize(); + try { + *step_bytes = detail::getMemStepSize(); + } CATCHALL; return AF_SUCCESS; } diff --git a/src/api/c/flip.cpp b/src/api/c/flip.cpp index 3d5bf53da8..09cbaf75e4 100644 --- a/src/api/c/flip.cpp +++ b/src/api/c/flip.cpp @@ -74,9 +74,9 @@ af_err af_flip(af_array *result, const af_array in, const unsigned dim) case u8: out = flipArray (in, dim); break; default: TYPE_ERROR(1, in_type); } + swap(*result, out); } CATCHALL - swap(*result, out); return AF_SUCCESS; } diff --git a/src/api/c/image.cpp b/src/api/c/image.cpp index 1d3e0970ba..db40934e50 100644 --- a/src/api/c/image.cpp +++ b/src/api/c/image.cpp @@ -141,9 +141,9 @@ af_err af_create_window(af_window *out, const int width, const int 
height, const wnd = new fg::Window(width, height, title, mainWnd); wnd->setFont(fgMngr.getFont()); + *out = reinterpret_cast(wnd); } CATCHALL; - *out = reinterpret_cast(wnd); return AF_SUCCESS; #else AF_RETURN_ERROR("ArrayFire compiled without graphics support", AF_ERR_NO_GFX); diff --git a/src/api/c/index.cpp b/src/api/c/index.cpp index 2f5b06aa07..f5a214f8e5 100644 --- a/src/api/c/index.cpp +++ b/src/api/c/index.cpp @@ -67,10 +67,10 @@ af_err af_index(af_array *result, const af_array in, const unsigned ndims, const case u8: indexArray (out, in, ndims, index); break; default: TYPE_ERROR(1, in_type); } + swap(*result, out); } CATCHALL - swap(*result, out); return AF_SUCCESS; } @@ -127,11 +127,9 @@ af_err af_lookup(af_array *out, const af_array in, const af_array indices, const case u8: output = lookup(in, indices, dim); break; default : TYPE_ERROR(1, idxType); } + std::swap(*out, output); } CATCHALL; - - std::swap(*out, output); - return AF_SUCCESS; } From b42cbebd971bdb5b51e7710d870ff4e644225291 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Sun, 10 Jan 2016 03:47:36 -0500 Subject: [PATCH 166/288] Updating docs for new AF_OPENCL_*_TYPE environment variables --- .../configuring_arrayfire_environment.md | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/docs/pages/configuring_arrayfire_environment.md b/docs/pages/configuring_arrayfire_environment.md index 054068e224..7e197e4954 100644 --- a/docs/pages/configuring_arrayfire_environment.md +++ b/docs/pages/configuring_arrayfire_environment.md @@ -44,6 +44,37 @@ AF_OPENCL_DEFAULT_DEVICE=1 ./myprogram_opencl Note: af::setDevice call in the source code will take precedence over this variable. +AF_OPENCL_DEFAULT_DEVICE_TYPE {#af_opencl_default_device_type} +------------------------------------------------------------------------------- + +Use this variable to set the default OpenCL device type. Valid values for this +variable are: CPU, GPU, ACC (Accelerators). 
+ +When set, the first device of the specified type is chosen as default device. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +AF_OPENCL_DEFAULT_DEVICE_TYPE=CPU ./myprogram_opencl +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Note: `AF_OPENCL_DEFAULT_DEVICE` and af::setDevice takes precedence over this variable. + +AF_OPENCL_DEVICE_TYPE {#af_opencl_device_type} +------------------------------------------------------------------------------- + +Use this variable to only choose OpenCL devices of specified type. Valid values for this +variable are: + +- ALL: All OpenCL devices. (Default behavior). +- CPU: CPU devices only. +- GPU: GPU devices only. +- ACC: Accelerator devices only. + +When set, the remaining OpenCL device types are ignored by the OpenCL backend. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +AF_OPENCL_DEVICE_TYPE=CPU ./myprogram_opencl +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + AF_DISABLE_GRAPHICS {#af_disable_graphics} ------------------------------------------------------------------------------- From 17b2600f9ba4e0d5b258213655c65053201c5ad7 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Sun, 10 Jan 2016 13:11:21 -0500 Subject: [PATCH 167/288] Freeimage only requires the flags for 24 / 32 bit images --- src/api/c/imageio.cpp | 15 ++++++++------- src/api/c/imageio2.cpp | 18 ++++++++++-------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/api/c/imageio.cpp b/src/api/c/imageio.cpp index c6a20a85a2..e372cd7e64 100644 --- a/src/api/c/imageio.cpp +++ b/src/api/c/imageio.cpp @@ -60,14 +60,15 @@ static af_err readImage(af_array *rImage, const uchar* pSrcLine, const int nSrcP pDst0[indx] = (float) *(src + (x * step + FI_RGBA_RED)); pDst1[indx] = (float) *(src + (x * step + FI_RGBA_GREEN)); pDst2[indx] = (float) *(src + (x * step + FI_RGBA_BLUE)); + if (fo_color == 4) pDst3[indx] = (float) *(src + (x * step + 
FI_RGBA_ALPHA)); } else { // Non 8-bit types do not use ordering // See Pixel Access Functions Chapter in FreeImage Doc - pDst0[indx] = (float) *(src + (x * step + FI_RGBA_RED)); - pDst1[indx] = (float) *(src + (x * step + FI_RGBA_GREEN)); - pDst2[indx] = (float) *(src + (x * step + FI_RGBA_BLUE)); + pDst0[indx] = (float) *(src + (x * step + 0)); + pDst1[indx] = (float) *(src + (x * step + 1)); + pDst2[indx] = (float) *(src + (x * step + 2)); + if (fo_color == 4) pDst3[indx] = (float) *(src + (x * step + 3)); } - if (fo_color == 4) pDst3[indx] = (float) *(src + (x * step + FI_RGBA_ALPHA)); } indx++; } @@ -104,9 +105,9 @@ static af_err readImage(af_array *rImage, const uchar* pSrcLine, const int nSrcP } else { // Non 8-bit types do not use ordering // See Pixel Access Functions Chapter in FreeImage Doc - r = (T) *(src + (x * step + FI_RGBA_RED)); - g = (T) *(src + (x * step + FI_RGBA_GREEN)); - b = (T) *(src + (x * step + FI_RGBA_BLUE)); + r = (T) *(src + (x * step + 0)); + g = (T) *(src + (x * step + 1)); + b = (T) *(src + (x * step + 2)); } pDst[indx] = r * 0.2989f + g * 0.5870f + b * 0.1140f; } diff --git a/src/api/c/imageio2.cpp b/src/api/c/imageio2.cpp index aed793e64a..a1374a2944 100644 --- a/src/api/c/imageio2.cpp +++ b/src/api/c/imageio2.cpp @@ -58,14 +58,15 @@ static af_err readImage_t(af_array *rImage, const uchar* pSrcLine, const int nSr pDst0[indx] = (T) *(src + (x * step + FI_RGBA_RED)); pDst1[indx] = (T) *(src + (x * step + FI_RGBA_GREEN)); pDst2[indx] = (T) *(src + (x * step + FI_RGBA_BLUE)); + if (fi_color == 4) pDst3[indx] = (T) *(src + (x * step + FI_RGBA_ALPHA)); } else { // Non 8-bit types do not use ordering // See Pixel Access Functions Chapter in FreeImage Doc - pDst0[indx] = (T) *(src + (x * step + FI_RGBA_RED)); - pDst1[indx] = (T) *(src + (x * step + FI_RGBA_GREEN)); - pDst2[indx] = (T) *(src + (x * step + FI_RGBA_BLUE)); + pDst0[indx] = (T) *(src + (x * step + 0)); + pDst1[indx] = (T) *(src + (x * step + 1)); + pDst2[indx] = (T) *(src + (x 
* step + 2)); + if (fi_color == 4) pDst3[indx] = (T) *(src + (x * step + 3)); } - if (fi_color == 4) pDst3[indx] = (T) *(src + (x * step + FI_RGBA_ALPHA)); } indx++; } @@ -242,15 +243,16 @@ static void save_t(T* pDstLine, const af_array in, const dim4 dims, uint nDstPit *(pDstLine + x * step + FI_RGBA_RED ) = (T) pSrc0[indx]; // r -> 0 *(pDstLine + x * step + FI_RGBA_GREEN) = (T) pSrc1[indx]; // g -> 1 *(pDstLine + x * step + FI_RGBA_BLUE ) = (T) pSrc2[indx]; // b -> 2 + if(channels >= 4) *(pDstLine + x * step + FI_RGBA_ALPHA) = (T) pSrc3[indx]; // a } else { // Non 8-bit types do not use ordering // See Pixel Access Functions Chapter in FreeImage Doc - *(pDstLine + x * step + FI_RGBA_RED ) = (T) pSrc0[indx]; // r -> 0 - *(pDstLine + x * step + FI_RGBA_GREEN) = (T) pSrc1[indx]; // g -> 1 - *(pDstLine + x * step + FI_RGBA_BLUE ) = (T) pSrc2[indx]; // b -> 2 + *(pDstLine + x * step + 0) = (T) pSrc0[indx]; // r -> 0 + *(pDstLine + x * step + 1) = (T) pSrc1[indx]; // g -> 1 + *(pDstLine + x * step + 2) = (T) pSrc2[indx]; // b -> 2 + if(channels >= 4) *(pDstLine + x * step + 3) = (T) pSrc3[indx]; // a } } - if(channels >= 4) *(pDstLine + x * step + FI_RGBA_ALPHA) = (T) pSrc3[indx]; // a ++indx; } pDstLine = (T*)(((uchar*)pDstLine) - nDstPitch); From 777abcb786cce8be378ed2207f83525350a28cff Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 11 Jan 2016 10:27:09 -0500 Subject: [PATCH 168/288] Moving dispatch.hpp / dispatch.cpp to src/backend/ --- src/{api/c => backend}/dispatch.cpp | 0 src/{api/c => backend}/dispatch.hpp | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename src/{api/c => backend}/dispatch.cpp (100%) rename src/{api/c => backend}/dispatch.hpp (100%) diff --git a/src/api/c/dispatch.cpp b/src/backend/dispatch.cpp similarity index 100% rename from src/api/c/dispatch.cpp rename to src/backend/dispatch.cpp diff --git a/src/api/c/dispatch.hpp b/src/backend/dispatch.hpp similarity index 100% rename from src/api/c/dispatch.hpp rename to 
src/backend/dispatch.hpp From 828138c60b1a3a05536650fb59a7b53d62fbc43c Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 11 Jan 2016 10:39:12 -0500 Subject: [PATCH 169/288] Renaming a few internal functions - memPush --> memLock - memPop --> memUnlock --- src/api/c/device.cpp | 4 ++-- src/backend/cpu/Array.hpp | 2 +- src/backend/cpu/memory.cpp | 24 ++++++++++++------------ src/backend/cpu/memory.hpp | 6 +++--- src/backend/cuda/Array.hpp | 2 +- src/backend/cuda/memory.cpp | 30 +++++++++++++++--------------- src/backend/cuda/memory.hpp | 6 +++--- src/backend/opencl/Array.hpp | 2 +- src/backend/opencl/memory.cpp | 28 ++++++++++++++-------------- src/backend/opencl/memory.hpp | 8 ++++---- 10 files changed, 56 insertions(+), 56 deletions(-) diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index 731e98efec..c37e2934ae 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -221,7 +221,7 @@ af_err af_get_device_ptr(void **data, const af_array arr) template inline void lockArray(const af_array arr) { - memPop((const T *)getArray(arr).get()); + memLock((const T *)getArray(arr).get()); } af_err af_lock_device_ptr(const af_array arr) @@ -258,7 +258,7 @@ af_err af_lock_array(const af_array arr) template inline void unlockArray(const af_array arr) { - memPush((const T *)getArray(arr).get()); + memUnlock((const T *)getArray(arr).get()); } af_err af_unlock_device_ptr(const af_array arr) diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp index e0709d36d3..9cd154ec50 100644 --- a/src/backend/cpu/Array.hpp +++ b/src/backend/cpu/Array.hpp @@ -84,7 +84,7 @@ namespace cpu void *getDevicePtr(const Array& arr) { T *ptr = arr.device(); - memPop(ptr); + memLock(ptr); return (void *)ptr; } diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 625f9b2416..5eebf18a43 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -196,7 +196,7 @@ T* memAlloc(const size_t &elements) } template -void memFreeLocked(T 
*ptr, bool freeLocked) +void memFreeLocked(T *ptr, bool user_unlock) { std::lock_guard lock(memory_map_mutex); @@ -205,7 +205,7 @@ void memFreeLocked(T *ptr, bool freeLocked) if (iter != memory_map.end()) { iter->second.mngr_lock = false; - if ((iter->second).user_lock && !freeLocked) return; + if ((iter->second).user_lock && !user_unlock) return; iter->second.user_lock = false; used_bytes -= iter->second.bytes; @@ -223,7 +223,7 @@ void memFree(T *ptr) } template -void memPop(const T *ptr) +void memLock(const T *ptr) { std::lock_guard lock(memory_map_mutex); @@ -241,7 +241,7 @@ void memPop(const T *ptr) } template -void memPush(const T *ptr) +void memUnlock(const T *ptr) { std::lock_guard lock(memory_map_mutex); mem_iter iter = memory_map.find((void *)ptr); @@ -273,14 +273,14 @@ void pinnedFree(T* ptr) memFree(ptr); } -#define INSTANTIATE(T) \ - template T* memAlloc(const size_t &elements); \ - template void memFree(T* ptr); \ - template void memFreeLocked(T* ptr, bool freeLocked); \ - template void memPop(const T* ptr); \ - template void memPush(const T* ptr); \ - template T* pinnedAlloc(const size_t &elements); \ - template void pinnedFree(T* ptr); \ +#define INSTANTIATE(T) \ + template T* memAlloc(const size_t &elements); \ + template void memFree(T* ptr); \ + template void memFreeLocked(T* ptr, bool user_unlock); \ + template void memLock(const T* ptr); \ + template void memUnlock(const T* ptr); \ + template T* pinnedAlloc(const size_t &elements); \ + template void pinnedFree(T* ptr); \ INSTANTIATE(float) INSTANTIATE(cfloat) diff --git a/src/backend/cpu/memory.hpp b/src/backend/cpu/memory.hpp index 19846c46bf..6524fe6f94 100644 --- a/src/backend/cpu/memory.hpp +++ b/src/backend/cpu/memory.hpp @@ -17,10 +17,10 @@ namespace cpu // This is because it is used as the deleter in shared pointer // which cannot support default arguments template void memFree(T* ptr); - template void memFreeLocked(T* ptr, bool freeLocked); + template void memFreeLocked(T* ptr, bool 
user_unlock); - template void memPop(const T *ptr); - template void memPush(const T *ptr); + template void memLock(const T *ptr); + template void memUnlock(const T *ptr); template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); diff --git a/src/backend/cuda/Array.hpp b/src/backend/cuda/Array.hpp index 638b745d09..ad4396b48c 100644 --- a/src/backend/cuda/Array.hpp +++ b/src/backend/cuda/Array.hpp @@ -86,7 +86,7 @@ namespace cuda void *getDevicePtr(const Array& arr) { T *ptr = arr.device(); - memPop(ptr); + memLock(ptr); return (void *)ptr; } diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index 8152c8a25d..f37a0fe19a 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -68,19 +68,19 @@ namespace cuda } template - void memFreeLocked(T *ptr, bool freeLocked) + void memFreeLocked(T *ptr, bool user_unlock) { cudaFreeWrapper(ptr); // Free it because we are not sure what the size is } template - void memPop(const T *ptr) + void memLock(const T *ptr) { return; } template - void memPush(const T *ptr) + void memUnlock(const T *ptr) { return; } @@ -283,7 +283,7 @@ namespace cuda } template - void memFreeLocked(T *ptr, bool freeLocked) + void memFreeLocked(T *ptr, bool user_unlock) { int n = getActiveDeviceId(); mem_iter iter = memory_maps[n].find((void *)ptr); @@ -291,7 +291,7 @@ namespace cuda if (iter != memory_maps[n].end()) { iter->second.mngr_lock = false; - if ((iter->second.user_lock) && !freeLocked) return; + if ((iter->second.user_lock) && !user_unlock) return; iter->second.user_lock = false; @@ -310,7 +310,7 @@ namespace cuda } template - void memPop(const T *ptr) + void memLock(const T *ptr) { int n = getActiveDeviceId(); mem_iter iter = memory_maps[n].find((void *)ptr); @@ -328,7 +328,7 @@ namespace cuda } template - void memPush(const T *ptr) + void memUnlock(const T *ptr) { int n = getActiveDeviceId(); mem_iter iter = memory_maps[n].find((void *)ptr); @@ -427,14 +427,14 @@ namespace cuda 
#endif -#define INSTANTIATE(T) \ - template T* memAlloc(const size_t &elements); \ - template void memFree(T* ptr); \ - template void memFreeLocked(T* ptr, bool freeLocked); \ - template void memPop(const T* ptr); \ - template void memPush(const T* ptr); \ - template T* pinnedAlloc(const size_t &elements); \ - template void pinnedFree(T* ptr); \ +#define INSTANTIATE(T) \ + template T* memAlloc(const size_t &elements); \ + template void memFree(T* ptr); \ + template void memFreeLocked(T* ptr, bool user_unlock); \ + template void memLock(const T* ptr); \ + template void memUnlock(const T* ptr); \ + template T* pinnedAlloc(const size_t &elements); \ + template void pinnedFree(T* ptr); \ INSTANTIATE(float) INSTANTIATE(cfloat) diff --git a/src/backend/cuda/memory.hpp b/src/backend/cuda/memory.hpp index 5644a52371..29e4e76597 100644 --- a/src/backend/cuda/memory.hpp +++ b/src/backend/cuda/memory.hpp @@ -17,9 +17,9 @@ namespace cuda // This is because it is used as the deleter in shared pointer // which cannot support default arguments template void memFree(T* ptr); - template void memFreeLocked(T* ptr, bool freeLocked); - template void memPop(const T *ptr); - template void memPush(const T *ptr); + template void memFreeLocked(T* ptr, bool user_unlock); + template void memLock(const T *ptr); + template void memUnlock(const T *ptr); template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp index 5f86d6d0b6..a6d3f4f869 100644 --- a/src/backend/opencl/Array.hpp +++ b/src/backend/opencl/Array.hpp @@ -77,7 +77,7 @@ namespace opencl void *getDevicePtr(const Array& arr) { cl::Buffer *buf = arr.device(); - memPop((T *)buf); + memLock((T *)buf); return (void *)((*buf)()); } diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index 141610d71f..b75955efd9 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -194,7 +194,7 @@ namespace 
opencl bufferFreeLocked(ptr, false); } - void bufferFreeLocked(cl::Buffer *ptr, bool freeLocked) + void bufferFreeLocked(cl::Buffer *ptr, bool user_unlock) { int n = getActiveDeviceId(); mem_iter iter = memory_maps[n].find(ptr); @@ -202,7 +202,7 @@ namespace opencl if (iter != memory_maps[n].end()) { iter->second.mngr_lock = false; - if ((iter->second).user_lock && !freeLocked) return; + if ((iter->second).user_lock && !user_unlock) return; iter->second.user_lock = false; @@ -264,19 +264,19 @@ namespace opencl } template - void memFreeLocked(T *ptr, bool freeLocked) + void memFreeLocked(T *ptr, bool user_unlock) { - return bufferFreeLocked((cl::Buffer *)ptr, freeLocked); + return bufferFreeLocked((cl::Buffer *)ptr, user_unlock); } template - void memPop(const T *ptr) + void memLock(const T *ptr) { return bufferPop((cl::Buffer *)ptr); } template - void memPush(const T *ptr) + void memUnlock(const T *ptr) { return bufferPush((cl::Buffer *)ptr); } @@ -395,14 +395,14 @@ namespace opencl return pinnedBufferFree((void *) ptr); } -#define INSTANTIATE(T) \ - template T* memAlloc(const size_t &elements); \ - template void memFree(T* ptr); \ - template void memFreeLocked(T* ptr, bool freeLocked); \ - template void memPop(const T* ptr); \ - template void memPush(const T* ptr); \ - template T* pinnedAlloc(const size_t &elements); \ - template void pinnedFree(T* ptr); \ +#define INSTANTIATE(T) \ + template T* memAlloc(const size_t &elements); \ + template void memFree(T* ptr); \ + template void memFreeLocked(T* ptr, bool user_unlock); \ + template void memLock(const T* ptr); \ + template void memUnlock(const T* ptr); \ + template T* pinnedAlloc(const size_t &elements); \ + template void pinnedFree(T* ptr); \ INSTANTIATE(float) INSTANTIATE(cfloat) diff --git a/src/backend/opencl/memory.hpp b/src/backend/opencl/memory.hpp index 96292cdfac..dce142805a 100644 --- a/src/backend/opencl/memory.hpp +++ b/src/backend/opencl/memory.hpp @@ -20,7 +20,7 @@ namespace opencl // This is 
because it is used as the deleter in shared pointer // which cannot support default arguments void bufferFree(cl::Buffer *buf); - void bufferFreeLocked(cl::Buffer *buf, bool freeLocked); + void bufferFreeLocked(cl::Buffer *buf, bool user_unlock); template T *memAlloc(const size_t &elements); @@ -28,9 +28,9 @@ namespace opencl // This is because it is used as the deleter in shared pointer // which cannot support default arguments template void memFree(T* ptr); - template void memFreeLocked(T* ptr, bool freeLocked); - template void memPop(const T *ptr); - template void memPush(const T *ptr); + template void memFreeLocked(T* ptr, bool user_unlock); + template void memLock(const T *ptr); + template void memUnlock(const T *ptr); template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); From c8cd29b1267580a851da82ac3fcb2c2762119f3a Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 11 Jan 2016 12:47:11 -0500 Subject: [PATCH 170/288] Adding a unified memory manager for all backends --- src/backend/MemoryManager.cpp | 250 +++++++++++++++++ src/backend/MemoryManager.hpp | 99 +++++++ src/backend/cpu/memory.cpp | 247 ++++------------- src/backend/cpu/memory.hpp | 1 + src/backend/cuda/memory.cpp | 506 +++++++++------------------------- src/backend/cuda/memory.hpp | 1 + src/backend/opencl/memory.cpp | 499 +++++++++++---------------------- src/backend/opencl/memory.hpp | 5 - 8 files changed, 710 insertions(+), 898 deletions(-) create mode 100644 src/backend/MemoryManager.cpp create mode 100644 src/backend/MemoryManager.hpp diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp new file mode 100644 index 0000000000..621ce624e7 --- /dev/null +++ b/src/backend/MemoryManager.cpp @@ -0,0 +1,250 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include "MemoryManager.hpp" +#include "dispatch.hpp" +#include "err_common.hpp" +#include "util.hpp" + +namespace common +{ + +MemoryManager::MemoryManager(int num_devices, unsigned MAX_BUFFERS, unsigned MAX_BYTES, bool debug): + mem_step_size(1024), + max_buffers(MAX_BUFFERS), + max_bytes(MAX_BYTES), + memory(num_devices), + debug_mode(debug) +{ + std::string env_var = getEnvVar("AF_MEM_DEBUG"); + if (!env_var.empty()) { + this->debug_mode = env_var[0] != '0'; + } + if (this->debug_mode) mem_step_size = 1; +} + +void MemoryManager::garbageCollect() +{ + if (this->debug_mode) return; + + memory_info& current = this->getCurrentMemoryInfo(); + + for(buffer_iter iter = current.map.begin(); + iter != current.map.end(); ++iter) { + + if (!(iter->second).manager_lock) { + + if (!(iter->second).user_lock) { + if ((iter->second).bytes > 0) { + this->nativeFree(iter->first); + } + current.total_bytes -= iter->second.bytes; + } + } + } + + buffer_iter memory_curr = current.map.begin(); + buffer_iter memory_end = current.map.end(); + + while(memory_curr != memory_end) { + if (memory_curr->second.manager_lock || memory_curr->second.user_lock) { + ++memory_curr; + } else { + current.map.erase(memory_curr++); + } + } +} + +void MemoryManager::unlock(void *ptr, bool user_unlock) +{ + memory_info& current = this->getCurrentMemoryInfo(); + lock_guard_t lock(this->memory_mutex); + + buffer_iter iter = current.map.find((void *)ptr); + + if (iter != current.map.end()) { + + iter->second.manager_lock = false; + if ((iter->second).user_lock && !user_unlock) return; + + iter->second.user_lock = false; + current.lock_bytes -= iter->second.bytes; + current.lock_buffers--; + + if (this->debug_mode) { + if ((iter->second).bytes > 0) { + this->nativeFree(iter->first); + } + } + + } else { + 
this->nativeFree(ptr); // Free it because we are not sure what the size is + } +} + +void *MemoryManager::alloc(const size_t bytes) +{ + memory_info& current = this->getCurrentMemoryInfo(); + + void *ptr = NULL; + size_t alloc_bytes = this->debug_mode ? bytes : (divup(bytes, mem_step_size) * mem_step_size); + + if (bytes > 0) { + + lock_guard_t lock(this->memory_mutex); + + // There is no memory cache in debug mode + if (!this->debug_mode) { + + // FIXME: Add better checks for garbage collection + // Perhaps look at total memory available as a metric + if (current.map.size() > this->max_buffers || + current.lock_bytes >= this->max_bytes) { + + this->garbageCollect(); + } + + for(buffer_iter iter = current.map.begin(); + iter != current.map.end(); ++iter) { + + buffer_info info = iter->second; + + if (!info.manager_lock && + !info.user_lock && + info.bytes == alloc_bytes) { + + iter->second.manager_lock = true; + current.lock_bytes += alloc_bytes; + current.lock_buffers++; + return iter->first; + } + } + } + + // Perform garbage collection if memory can not be allocated + ptr = this->nativeAlloc(alloc_bytes); + + if (!ptr) { + this->garbageCollect(); + ptr = this->nativeAlloc(alloc_bytes); + if (!ptr) AF_ERROR("Can not allocate memory", AF_ERR_NO_MEM); + } + + buffer_info info = {true, false, alloc_bytes}; + current.map[ptr] = info; + + current.lock_bytes += alloc_bytes; + current.lock_buffers++; + current.total_bytes += alloc_bytes; + } + return ptr; +} + +void MemoryManager::userLock(const void *ptr) +{ + memory_info& current = this->getCurrentMemoryInfo(); + + lock_guard_t lock(this->memory_mutex); + + buffer_iter iter = current.map.find(const_cast(ptr)); + + if (iter != current.map.end()) { + iter->second.user_lock = true; + } else { + buffer_info info = { true, + true, + 100 }; //This number is not relevant + + current.map[(void *)ptr] = info; + } +} + +void MemoryManager::userUnlock(const void *ptr) +{ + memory_info& current = this->getCurrentMemoryInfo(); + + 
lock_guard_t lock(this->memory_mutex); + + buffer_iter iter = current.map.find((void *)ptr); + if (iter != current.map.end()) { + iter->second.user_lock = false; + if (this->debug_mode) { + if ((iter->second).bytes > 0) { + this->nativeFree(iter->first); + } + } + } +} + +size_t MemoryManager::getMemStepSize() +{ + lock_guard_t lock(this->memory_mutex); + return this->mem_step_size; +} + +void MemoryManager::setMemStepSize(size_t new_step_size) +{ + lock_guard_t lock(this->memory_mutex); + this->mem_step_size = new_step_size; +} + +void MemoryManager::printInfo(const char *msg, const int device) +{ + lock_guard_t lock(this->memory_mutex); + memory_info& current = this->getCurrentMemoryInfo(); + + std::cout << msg << std::endl; + + static const std::string head("| POINTER | SIZE | AF LOCK | USER LOCK |"); + static const std::string line(head.size(), '-'); + std::cout << line << std::endl << head << std::endl << line << std::endl; + + for(buffer_iter iter = current.map.begin(); + iter != current.map.end(); ++iter) { + + std::string status_mngr("Unknown"); + std::string status_user("Unknown"); + + if(iter->second.manager_lock) status_mngr = "Yes"; + else status_mngr = " No"; + + if(iter->second.user_lock) status_user = "Yes"; + else status_user = " No"; + + std::string unit = "KB"; + double size = (double)(iter->second.bytes) / 1024; + if(size >= 1024) { + size = size / 1024; + unit = "MB"; + } + + std::cout << "| " << std::right << std::setw(14) << iter->first << " " + << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit + << " | " << std::setw(9) << status_mngr + << " | " << std::setw(9) << status_user + << " |" << std::endl; + } + + std::cout << line << std::endl; +} + +void MemoryManager::bufferInfo(size_t *alloc_bytes, size_t *alloc_buffers, + size_t *lock_bytes, size_t *lock_buffers) +{ + memory_info current = this->getCurrentMemoryInfo(); + lock_guard_t lock(this->memory_mutex); + if (alloc_bytes ) *alloc_bytes = current.total_bytes; + if 
(alloc_buffers ) *alloc_buffers = current.map.size(); + if (lock_bytes ) *lock_bytes = current.lock_bytes; + if (lock_buffers ) *lock_buffers = current.lock_buffers; +} +} diff --git a/src/backend/MemoryManager.hpp b/src/backend/MemoryManager.hpp new file mode 100644 index 0000000000..1f87ea2dfe --- /dev/null +++ b/src/backend/MemoryManager.hpp @@ -0,0 +1,99 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once + +#include +#include +#include + +namespace common +{ + +typedef std::mutex mutex_t; +typedef std::lock_guard lock_guard_t; + +class MemoryManager +{ + typedef struct + { + bool manager_lock; + bool user_lock; + size_t bytes; + } buffer_info; + + typedef std::map buffer_t; + typedef buffer_t::iterator buffer_iter; + + typedef struct + { + buffer_t map; + size_t lock_bytes; + size_t lock_buffers; + size_t total_bytes; + } memory_info; + + size_t mem_step_size; + unsigned max_buffers; + unsigned max_bytes; + std::vector memory; + bool debug_mode; + + memory_info& getCurrentMemoryInfo() + { + return memory[this->getActiveDeviceId()]; + } + + virtual int getActiveDeviceId() + { + return 0; + } + +public: + MemoryManager(int num_devices, unsigned MAX_BUFFERS, unsigned MAX_BYTES, bool debug); + + void *alloc(const size_t bytes); + + void unlock(void *ptr, bool user_unlock); + + void garbageCollect(); + + void printInfo(const char *msg, const int device); + + void bufferInfo(size_t *alloc_bytes, size_t *alloc_buffers, + size_t *lock_bytes, size_t *lock_buffers); + + void userLock(const void *ptr); + + void userUnlock(const void *ptr); + + size_t getMemStepSize(); + + void setMemStepSize(size_t new_step_size); + + virtual void *nativeAlloc(const size_t bytes) + { + 
return malloc(bytes); + } + + virtual void nativeFree(void *ptr) + { + return free((void *)ptr); + } + + virtual ~MemoryManager() + { + } + +protected: + mutex_t memory_mutex; + +}; + +} diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 5eebf18a43..2687b3018b 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -10,244 +10,111 @@ #include #include #include -#include -#include -#include -#include -#include -#include -#include #include #include +#include +#include -namespace cpu -{ +#ifndef AF_MEM_DEBUG +#define AF_MEM_DEBUG 0 +#endif -static size_t memory_resolution = 1024; //1KB +#ifndef AF_CPU_MEM_DEBUG +#define AF_CPU_MEM_DEBUG 0 +#endif -void setMemStepSize(size_t step_bytes) -{ - memory_resolution = step_bytes; -} - -size_t getMemStepSize(void) +namespace cpu { - return memory_resolution; -} -class Manager +class MemoryManager : public common::MemoryManager { - public: - static bool initialized; - Manager() + int getActiveDeviceId(); +public: + MemoryManager(); + void *nativeAlloc(const size_t bytes); + void nativeFree(void *ptr); + ~MemoryManager() { - initialized = true; - } - - ~Manager() - { - garbageCollect(); + common::lock_guard_t lock(this->memory_mutex); + this->garbageCollect(); } }; -bool Manager::initialized = false; - -static void managerInit() +int MemoryManager::getActiveDeviceId() { - if(Manager::initialized == false) - static Manager pm = Manager(); + return cpu::getActiveDeviceId(); } -typedef struct -{ - bool mngr_lock; // True if locked by memory manager, false if free - bool user_lock; // True if locked by user, false if free - size_t bytes; -} mem_info; +MemoryManager::MemoryManager() : + common::MemoryManager(getDeviceCount(), MAX_BUFFERS, MAX_BYTES, AF_MEM_DEBUG || AF_CPU_MEM_DEBUG) +{} -static size_t used_bytes = 0; -static size_t used_buffers = 0; -static size_t total_bytes = 0; -typedef std::map mem_t; -typedef mem_t::iterator mem_iter; -mem_t memory_map; -std::mutex memory_map_mutex; 
+void *MemoryManager::nativeAlloc(const size_t bytes) +{ + return malloc(bytes); +} -template -void freeWrapper(T *ptr) +void MemoryManager::nativeFree(void *ptr) { - free((void *)ptr); + return free((void *)ptr); } -void garbageCollect() +static MemoryManager &getMemoryManager() { - for(mem_iter iter = memory_map.begin(); - iter != memory_map.end(); ++iter) { + static MemoryManager instance; + return instance; +} - if (!(iter->second).mngr_lock) { +void setMemStepSize(size_t step_bytes) +{ + getMemoryManager().setMemStepSize(step_bytes); +} - if (!(iter->second).user_lock) { - freeWrapper(iter->first); - total_bytes -= iter->second.bytes; - } - } - } +size_t getMemStepSize(void) +{ + return getMemoryManager().getMemStepSize(); +} - mem_iter memory_curr = memory_map.begin(); - mem_iter memory_end = memory_map.end(); - while(memory_curr != memory_end) { - if (memory_curr->second.mngr_lock || memory_curr->second.user_lock) { - ++memory_curr; - } else { - memory_map.erase(memory_curr++); - } - } +void garbageCollect() +{ + getMemoryManager().garbageCollect(); } void printMemInfo(const char *msg, const int device) { - std::cout << msg << std::endl; - - static const std::string head("| POINTER | SIZE | AF LOCK | USER LOCK |"); - static const std::string line(head.size(), '-'); - std::cout << line << std::endl << head << std::endl << line << std::endl; - - for(mem_iter iter = memory_map.begin(); - iter != memory_map.end(); ++iter) { - - std::string status_mngr("Unknown"); - std::string status_user("Unknown"); - - if(iter->second.mngr_lock) status_mngr = "Yes"; - else status_mngr = " No"; - - if(iter->second.user_lock) status_user = "Yes"; - else status_user = " No"; - - std::string unit = "KB"; - double size = (double)(iter->second.bytes) / 1024; - if(size >= 1024) { - size = size / 1024; - unit = "MB"; - } - - std::cout << "| " << std::right << std::setw(14) << iter->first << " " - << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit - << " | " << 
std::setw(9) << status_mngr - << " | " << std::setw(9) << status_user - << " |" << std::endl; - } - - std::cout << line << std::endl; + getMemoryManager().printInfo(msg, device); } template T* memAlloc(const size_t &elements) { - managerInit(); - - T* ptr = NULL; - size_t alloc_bytes = divup(sizeof(T) * elements, memory_resolution) * memory_resolution; - - if (elements > 0) { - std::lock_guard lock(memory_map_mutex); - - // FIXME: Add better checks for garbage collection - // Perhaps look at total memory available as a metric - if (memory_map.size() > MAX_BUFFERS || - used_bytes >= MAX_BYTES) { - - garbageCollect(); - } - - for(mem_iter iter = memory_map.begin(); - iter != memory_map.end(); ++iter) { - - mem_info info = iter->second; - - if (!info.mngr_lock && - !info.user_lock && - info.bytes == alloc_bytes) { - - iter->second.mngr_lock = true; - used_bytes += alloc_bytes; - used_buffers++; - return (T *)iter->first; - } - } - - // Perform garbage collection if memory can not be allocated - ptr = (T *)malloc(alloc_bytes); - - if (ptr == NULL) { - AF_ERROR("Can not allocate memory", AF_ERR_NO_MEM); - } - - mem_info info = {true, false, alloc_bytes}; - memory_map[ptr] = info; - - used_bytes += alloc_bytes; - used_buffers++; - total_bytes += alloc_bytes; - } - return ptr; + return (T *)getMemoryManager().alloc(elements * sizeof(T)); } template -void memFreeLocked(T *ptr, bool user_unlock) +void memFree(T *ptr) { - std::lock_guard lock(memory_map_mutex); - - mem_iter iter = memory_map.find((void *)ptr); - - if (iter != memory_map.end()) { - - iter->second.mngr_lock = false; - if ((iter->second).user_lock && !user_unlock) return; - - iter->second.user_lock = false; - used_bytes -= iter->second.bytes; - used_buffers--; - - } else { - freeWrapper(ptr); // Free it because we are not sure what the size is - } + return getMemoryManager().unlock((void *)ptr, false); } template -void memFree(T *ptr) +void memFreeLocked(T *ptr, bool user_unlock) { - memFreeLocked(ptr, false); 
+ return getMemoryManager().unlock((void *)ptr, user_unlock); } template void memLock(const T *ptr) { - std::lock_guard lock(memory_map_mutex); - - mem_iter iter = memory_map.find((void *)ptr); - - if (iter != memory_map.end()) { - iter->second.user_lock = true; - } else { - mem_info info = { true, - true, - 100 }; //This number is not relevant - - memory_map[(void *)ptr] = info; - } + getMemoryManager().userLock((void *)ptr); } template void memUnlock(const T *ptr) { - std::lock_guard lock(memory_map_mutex); - mem_iter iter = memory_map.find((void *)ptr); - if (iter != memory_map.end()) { - iter->second.user_lock = false; - } + getMemoryManager().userUnlock((void *)ptr); } @@ -255,22 +122,20 @@ void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers) { getQueue().sync(); - if (alloc_bytes ) *alloc_bytes = total_bytes; - if (alloc_buffers ) *alloc_buffers = memory_map.size(); - if (lock_bytes ) *lock_bytes = used_bytes; - if (lock_buffers ) *lock_buffers = used_buffers; + getMemoryManager().bufferInfo(alloc_bytes, alloc_buffers, + lock_bytes, lock_buffers); } template T* pinnedAlloc(const size_t &elements) { - return memAlloc(elements); + return (T *)getMemoryManager().alloc(elements * sizeof(T)); } template void pinnedFree(T* ptr) { - memFree(ptr); + return getMemoryManager().unlock((void *)ptr, false); } #define INSTANTIATE(T) \ diff --git a/src/backend/cpu/memory.hpp b/src/backend/cpu/memory.hpp index 6524fe6f94..279b3dbd28 100644 --- a/src/backend/cpu/memory.hpp +++ b/src/backend/cpu/memory.hpp @@ -9,6 +9,7 @@ #pragma once #include + namespace cpu { template T* memAlloc(const size_t &elements); diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index f37a0fe19a..43c37e016f 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -20,412 +20,178 @@ #include #include #include +#include -namespace cuda -{ - static size_t memory_resolution = 1024; //1KB - - void 
setMemStepSize(size_t step_bytes) - { - memory_resolution = step_bytes; - } - - size_t getMemStepSize(void) - { - return memory_resolution; - } - - template - static void cudaFreeWrapper(T *ptr) - { - cudaError_t err = cudaFree(ptr); - if (err != cudaErrorCudartUnloading) // see issue #167 - CUDA_CHECK(err); - } - - template - static void pinnedFreeWrapper(T *ptr) - { - cudaError_t err = cudaFreeHost(ptr); - if (err != cudaErrorCudartUnloading) // see issue #167 - CUDA_CHECK(err); - } - -#ifdef AF_CUDA_MEM_DEBUG - - template - T* memAlloc(const size_t &elements) - { - T* ptr = NULL; - CUDA_CHECK(cudaMalloc(&ptr, elements * sizeof(T))); - return ptr; - } - - template - void memFree(T *ptr) - { - cudaFreeWrapper(ptr); // Free it because we are not sure what the size is - } - - template - void memFreeLocked(T *ptr, bool user_unlock) - { - cudaFreeWrapper(ptr); // Free it because we are not sure what the size is - } - - template - void memLock(const T *ptr) - { - return; - } - - template - void memUnlock(const T *ptr) - { - return; - } - - template - T* pinnedAlloc(const size_t &elements) - { - T* ptr = NULL; - CUDA_CHECK(cudaMallocHost((void **)(&ptr), elements * sizeof(T))); - return (T*)ptr; - } - template - void pinnedFree(T *ptr) - { - pinnedFreeWrapper(ptr); // Free it because we are not sure what the size is - } +#ifndef AF_MEM_DEBUG +#define AF_MEM_DEBUG 0 +#endif - void garbageCollect() - { - } +#ifndef AF_CUDA_MEM_DEBUG +#define AF_CUDA_MEM_DEBUG 0 +#endif - void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, - size_t *lock_bytes, size_t *lock_buffers) - { - } +namespace cuda +{ - void printMemInfo(const char *msg, const int device) +class MemoryManager : public common::MemoryManager +{ + int getActiveDeviceId(); +public: + MemoryManager(); + void *nativeAlloc(const size_t bytes); + void nativeFree(void *ptr); + ~MemoryManager() { - std::cout << "printMemInfo() disabled in AF_CUDA_MEM_DEBUG Mode" << std::endl; + common::lock_guard_t 
lock(this->memory_mutex); + this->garbageCollect(); } -#else +}; - // Manager Class - // Dummy used to call garbage collection at the end of the program - class Manager - { - public: - static bool initialized; - Manager() - { - initialized = true; - } - - ~Manager() - { - // Destructors should not through exceptions - try { - for(int i = 0; i < getDeviceCount(); i++) { - setDevice(i); - garbageCollect(); - } - pinnedGarbageCollect(); - - } catch (AfError &ex) { - - std::string perr = getEnvVar("AF_PRINT_ERRORS"); - if(!perr.empty()) { - if(perr != "0") - fprintf(stderr, "%s\n", ex.what()); - } - } - } - }; - - bool Manager::initialized = false; - - static void managerInit() +class MemoryManagerPinned : public common::MemoryManager +{ + int getActiveDeviceId(); +public: + MemoryManagerPinned(); + void *nativeAlloc(const size_t bytes); + void nativeFree(void *ptr); + ~MemoryManagerPinned() { - if(Manager::initialized == false) - static Manager pm = Manager(); + common::lock_guard_t lock(this->memory_mutex); + this->garbageCollect(); } +}; - typedef struct - { - bool mngr_lock; - bool user_lock; - size_t bytes; - } mem_info; +int MemoryManager::getActiveDeviceId() +{ + return cuda::getActiveDeviceId(); +} - static size_t used_bytes[DeviceManager::MAX_DEVICES] = {0}; - static size_t used_buffers[DeviceManager::MAX_DEVICES] = {0}; - static size_t total_bytes[DeviceManager::MAX_DEVICES] = {0}; - typedef std::map mem_t; - typedef mem_t::iterator mem_iter; +MemoryManager::MemoryManager() : + common::MemoryManager(getDeviceCount(), MAX_BUFFERS, MAX_BYTES, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) +{} - mem_t memory_maps[DeviceManager::MAX_DEVICES]; +void *MemoryManager::nativeAlloc(const size_t bytes) +{ + void *ptr = NULL; + CUDA_CHECK(cudaMalloc(&ptr, bytes)); + return ptr; +} - void garbageCollect() - { - int n = getActiveDeviceId(); - - for(mem_iter iter = memory_maps[n].begin(); - iter != memory_maps[n].end(); ++iter) { - - if (!(iter->second.mngr_lock)) { - - if 
(!(iter->second.user_lock)) { - cudaFreeWrapper(iter->first); - total_bytes[n] -= iter->second.bytes; - } - } - } - - mem_iter memory_curr = memory_maps[n].begin(); - mem_iter memory_end = memory_maps[n].end(); - - while(memory_curr != memory_end) { - if (memory_curr->second.mngr_lock || memory_curr->second.user_lock) { - ++memory_curr; - } else { - memory_maps[n].erase(memory_curr++); - } - } +void MemoryManager::nativeFree(void *ptr) +{ + cudaError_t err = cudaFree(ptr); + if (err != cudaErrorCudartUnloading) { + CUDA_CHECK(err); } +} - void printMemInfo(const char *msg, const int device) - { - std::cout << msg << std::endl; - std::cout << "Memory Map for Device: " << device << std::endl; - - static const std::string head("| POINTER | SIZE | AF LOCK | USER LOCK |"); - static const std::string line(head.size(), '-'); - std::cout << line << std::endl << head << std::endl << line << std::endl; - - for(mem_iter iter = memory_maps[device].begin(); - iter != memory_maps[device].end(); ++iter) { - - std::string status_mngr("Unknown"); - std::string status_user("Unknown"); - - if(iter->second.mngr_lock) status_mngr = "Yes"; - else status_mngr = " No"; - - if(iter->second.user_lock) status_user = "Yes"; - else status_user = " No"; +static MemoryManager &getMemoryManager() +{ + static MemoryManager instance; + return instance; +} - std::string unit = "KB"; - double size = (double)(iter->second.bytes) / 1024; - if(size >= 1024) { - size = size / 1024; - unit = "MB"; - } +int MemoryManagerPinned::getActiveDeviceId() +{ + return cuda::getActiveDeviceId(); +} - std::cout << "| " << std::right << std::setw(14) << iter->first << " " - << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit - << " | " << std::setw(9) << status_mngr - << " | " << std::setw(9) << status_user - << " |" << std::endl; - } +MemoryManagerPinned::MemoryManagerPinned() : + common::MemoryManager(getDeviceCount(), MAX_BUFFERS, MAX_BYTES, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) +{} - std::cout << 
line << std::endl; - } +void *MemoryManagerPinned::nativeAlloc(const size_t bytes) +{ + void *ptr; + CUDA_CHECK(cudaMallocHost(&ptr, bytes)); + return ptr; +} - template - T* memAlloc(const size_t &elements) - { - managerInit(); - int n = getActiveDeviceId(); - T* ptr = NULL; - size_t alloc_bytes = divup(sizeof(T) * elements, memory_resolution) * memory_resolution; - - if (elements > 0) { - - // FIXME: Add better checks for garbage collection - // Perhaps look at total memory available as a metric - if (memory_maps[n].size() >= MAX_BUFFERS || used_bytes[n] >= MAX_BYTES) { - garbageCollect(); - } - - for(mem_iter iter = memory_maps[n].begin(); - iter != memory_maps[n].end(); ++iter) { - - mem_info info = iter->second; - - if (!info.mngr_lock && - !info.user_lock && - info.bytes == alloc_bytes) { - - iter->second.mngr_lock = true; - used_bytes[n] += alloc_bytes; - used_buffers[n]++; - return (T *)iter->first; - } - } - - // Perform garbage collection if memory can not be allocated - if (cudaMalloc((void **)&ptr, alloc_bytes) != cudaSuccess) { - garbageCollect(); - CUDA_CHECK(cudaMalloc((void **)(&ptr), alloc_bytes)); - } - - mem_info info = {true, false, alloc_bytes}; - memory_maps[n][ptr] = info; - used_bytes[n] += alloc_bytes; - used_buffers[n]++; - total_bytes[n] += alloc_bytes; - } - return ptr; +void MemoryManagerPinned::nativeFree(void *ptr) +{ + cudaError_t err = cudaFreeHost(ptr); + if (err != cudaErrorCudartUnloading) { + CUDA_CHECK(err); } +} - template - void memFreeLocked(T *ptr, bool user_unlock) - { - int n = getActiveDeviceId(); - mem_iter iter = memory_maps[n].find((void *)ptr); - - if (iter != memory_maps[n].end()) { - - iter->second.mngr_lock = false; - if ((iter->second.user_lock) && !user_unlock) return; - - iter->second.user_lock = false; +static MemoryManagerPinned &getMemoryManagerPinned() +{ + static MemoryManagerPinned instance; + return instance; +} - used_bytes[n] -= iter->second.bytes; - used_buffers[n]--; +void setMemStepSize(size_t 
step_bytes) +{ + getMemoryManager().setMemStepSize(step_bytes); +} - } else { - cudaFreeWrapper(ptr); // Free it because we are not sure what the size is - } - } +size_t getMemStepSize(void) +{ + return getMemoryManager().getMemStepSize(); +} - template - void memFree(T *ptr) - { - memFreeLocked(ptr, false); - } - template - void memLock(const T *ptr) - { - int n = getActiveDeviceId(); - mem_iter iter = memory_maps[n].find((void *)ptr); +void garbageCollect() +{ + getMemoryManager().garbageCollect(); +} - if (iter != memory_maps[n].end()) { - iter->second.user_lock = true; - } else { +void printMemInfo(const char *msg, const int device) +{ + getMemoryManager().printInfo(msg, device); +} - mem_info info = { true, - true, - 100 }; //This number is not relevant +template +T* memAlloc(const size_t &elements) +{ + return (T *)getMemoryManager().alloc(elements * sizeof(T)); +} - memory_maps[n][(void *)ptr] = info; - } - } +template +void memFree(T *ptr) +{ + return getMemoryManager().unlock((void *)ptr, false); +} - template - void memUnlock(const T *ptr) - { - int n = getActiveDeviceId(); - mem_iter iter = memory_maps[n].find((void *)ptr); - if (iter != memory_maps[n].end()) { - iter->second.user_lock = false; - } - } +template +void memFreeLocked(T *ptr, bool user_unlock) +{ + return getMemoryManager().unlock((void *)ptr, user_unlock); +} - void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, - size_t *lock_bytes, size_t *lock_buffers) - { - int n = getActiveDeviceId(); - if (alloc_bytes ) *alloc_bytes = total_bytes[n]; - if (alloc_buffers ) *alloc_buffers = memory_maps[n].size(); - if (lock_bytes ) *lock_bytes = used_bytes[n]; - if (lock_buffers ) *lock_buffers = used_buffers[n]; - } +template +void memLock(const T *ptr) +{ + getMemoryManager().userLock((void *)ptr); +} - ////////////////////////////////////////////////////////////////////////////// - mem_t pinned_maps; - static size_t pinned_used_bytes = 0; +template +void memUnlock(const T *ptr) +{ + 
getMemoryManager().userUnlock((void *)ptr); +} - void pinnedGarbageCollect() - { - for(mem_iter iter = pinned_maps.begin(); iter != pinned_maps.end(); ++iter) { - if (!(iter->second.mngr_lock)) { - pinnedFreeWrapper(iter->first); - } - } - - mem_iter memory_curr = pinned_maps.begin(); - mem_iter memory_end = pinned_maps.end(); - - while(memory_curr != memory_end) { - if (memory_curr->second.mngr_lock) { - ++memory_curr; - } else { - pinned_maps.erase(memory_curr++); - } - } - } - template - T* pinnedAlloc(const size_t &elements) - { - managerInit(); - T* ptr = NULL; - // Allocate the higher megabyte. Overhead of creating pinned memory is - // more so we want more resuable memory. - size_t alloc_bytes = divup(sizeof(T) * elements, 1048576) * 1048576; - - if (elements > 0) { - - // FIXME: Add better checks for garbage collection - // Perhaps look at total memory available as a metric - if (pinned_maps.size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) { - pinnedGarbageCollect(); - } - - for(mem_iter iter = pinned_maps.begin(); - iter != pinned_maps.end(); ++iter) { - - mem_info info = iter->second; - if (!info.mngr_lock && info.bytes == alloc_bytes) { - iter->second.mngr_lock = true; - pinned_used_bytes += alloc_bytes; - return (T *)iter->first; - } - } - - // Perform garbage collection if memory can not be allocated - if (cudaMallocHost((void **)&ptr, alloc_bytes) != cudaSuccess) { - pinnedGarbageCollect(); - CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes)); - } - - mem_info info = {true, false, alloc_bytes}; - pinned_maps[ptr] = info; - pinned_used_bytes += alloc_bytes; - } - return (T*)ptr; - } +void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, + size_t *lock_bytes, size_t *lock_buffers) +{ + getMemoryManager().bufferInfo(alloc_bytes, alloc_buffers, + lock_bytes, lock_buffers); +} - template - void pinnedFree(T *ptr) - { - mem_iter iter = pinned_maps.find((void *)ptr); - - if (iter != pinned_maps.end()) { - iter->second.mngr_lock = 
false; - pinned_used_bytes -= iter->second.bytes; - } else { - pinnedFreeWrapper(ptr); // Free it because we are not sure what the size is - } - } +template +T* pinnedAlloc(const size_t &elements) +{ + return (T *)getMemoryManagerPinned().alloc(elements * sizeof(T)); +} -#endif +template +void pinnedFree(T* ptr) +{ + return getMemoryManagerPinned().unlock((void *)ptr, false); +} #define INSTANTIATE(T) \ template T* memAlloc(const size_t &elements); \ diff --git a/src/backend/cuda/memory.hpp b/src/backend/cuda/memory.hpp index 29e4e76597..5b362cd587 100644 --- a/src/backend/cuda/memory.hpp +++ b/src/backend/cuda/memory.hpp @@ -9,6 +9,7 @@ #pragma once #include + namespace cuda { template T* memAlloc(const size_t &elements); diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index b75955efd9..45b8e96ba4 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -14,386 +14,221 @@ #include #include #include +#include "err_opencl.hpp" -namespace opencl -{ - static size_t memory_resolution = 1024; //1KB - - void setMemStepSize(size_t step_bytes) - { - memory_resolution = step_bytes; - } +#include - size_t getMemStepSize(void) - { - return memory_resolution; - } - - // Manager Class - // Dummy used to call garbage collection at the end of the program - class Manager - { - public: - static bool initialized; - Manager() - { - initialized = true; - } - - ~Manager() - { - for(int i = 0; i < (int)getDeviceCount(); i++) { - setDevice(i); - garbageCollect(); - pinnedGarbageCollect(); - } - } - }; - - bool Manager::initialized = false; - - static void managerInit() - { - if(Manager::initialized == false) - static Manager pm = Manager(); - } - - typedef struct - { - bool mngr_lock; - bool user_lock; - size_t bytes; - } mem_info; +#ifndef AF_MEM_DEBUG +#define AF_MEM_DEBUG 0 +#endif - static size_t used_bytes[DeviceManager::MAX_DEVICES] = {0}; - static size_t used_buffers[DeviceManager::MAX_DEVICES] = {0}; - static size_t 
total_bytes[DeviceManager::MAX_DEVICES] = {0}; +#ifndef AF_OPENCL_MEM_DEBUG +#define AF_OPENCL_MEM_DEBUG 0 +#endif - typedef std::map mem_t; - typedef mem_t::iterator mem_iter; - mem_t memory_maps[DeviceManager::MAX_DEVICES]; +namespace opencl +{ - static void destroy(cl::Buffer *ptr) +class MemoryManager : public common::MemoryManager +{ + int getActiveDeviceId(); +public: + MemoryManager(); + void *nativeAlloc(const size_t bytes); + void nativeFree(void *ptr); + ~MemoryManager() { - delete ptr; + common::lock_guard_t lock(this->memory_mutex); + this->garbageCollect(); } +}; - void garbageCollect() - { - int n = getActiveDeviceId(); - for(mem_iter iter = memory_maps[n].begin(); - iter != memory_maps[n].end(); ++iter) { - - if (!(iter->second).mngr_lock) { +class MemoryManagerPinned : public common::MemoryManager +{ + std::vector< + std::map + > pinned_maps; + int getActiveDeviceId(); - if (!(iter->second).user_lock) { - destroy(iter->first); - total_bytes[n] -= iter->second.bytes; - } - } - } +public: - mem_iter memory_curr = memory_maps[n].begin(); - mem_iter memory_end = memory_maps[n].end(); + MemoryManagerPinned(); - while(memory_curr != memory_end) { - if (memory_curr->second.mngr_lock || memory_curr->second.user_lock) { - ++memory_curr; - } else { - memory_maps[n].erase(memory_curr++); - } - } - } + void *nativeAlloc(const size_t bytes); + void nativeFree(void *ptr); - void printMemInfo(const char *msg, const int device) + ~MemoryManagerPinned() { - std::cout << msg << std::endl; - std::cout << "Memory Map for Device: " << device << std::endl; - - static const std::string head("| POINTER | SIZE | AF LOCK | USER LOCK |"); - static const std::string line(head.size(), '-'); - std::cout << line << std::endl << head << std::endl << line << std::endl; - - for(mem_iter iter = memory_maps[device].begin(); - iter != memory_maps[device].end(); ++iter) { - - std::string status_mngr("Unknown"); - std::string status_user("Unknown"); - - if(iter->second.mngr_lock) 
status_mngr = "Yes"; - else status_mngr = " No"; - - if(iter->second.user_lock) status_user = "Yes"; - else status_user = " No"; - - std::string unit = "KB"; - double size = (double)(iter->second.bytes) / 1024; - if(size >= 1024) { - size = size / 1024; - unit = "MB"; + common::lock_guard_t lock(this->memory_mutex); + this->garbageCollect(); + for (int n = 0; n < (int)pinned_maps.size(); n++) { + auto pinned_curr_iter = pinned_maps[n].begin(); + auto pinned_end_iter = pinned_maps[n].end(); + while (pinned_curr_iter != pinned_end_iter) { + pinned_maps[n].erase(pinned_curr_iter++); } - - std::cout << "| " << std::right << std::setw(14) << iter->first << " " - << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit - << " | " << std::setw(9) << status_mngr - << " | " << std::setw(9) << status_user - << " |" << std::endl; } - - std::cout << line << std::endl; } +}; - cl::Buffer *bufferAlloc(const size_t &bytes) - { - int n = getActiveDeviceId(); - cl::Buffer *ptr = NULL; - size_t alloc_bytes = divup(bytes, memory_resolution) * memory_resolution; - - if (bytes > 0) { - - // FIXME: Add better checks for garbage collection - // Perhaps look at total memory available as a metric - if (memory_maps[n].size() >= MAX_BUFFERS || used_bytes[n] >= MAX_BYTES) { - garbageCollect(); - } - - for(mem_iter iter = memory_maps[n].begin(); - iter != memory_maps[n].end(); ++iter) { - - mem_info info = iter->second; - - if (!info.mngr_lock && - !info.user_lock && - info.bytes == alloc_bytes) { - - iter->second.mngr_lock = true; - used_bytes[n] += alloc_bytes; - used_buffers[n]++; - return iter->first; - } - } +int MemoryManager::getActiveDeviceId() +{ + return opencl::getActiveDeviceId(); +} - try { - ptr = new cl::Buffer(getContext(), CL_MEM_READ_WRITE, alloc_bytes); - } catch(...) 
{ - garbageCollect(); - ptr = new cl::Buffer(getContext(), CL_MEM_READ_WRITE, alloc_bytes); - } +MemoryManager::MemoryManager() : + common::MemoryManager(getDeviceCount(), MAX_BUFFERS, MAX_BYTES, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG) +{} - mem_info info = {true, false, alloc_bytes}; - memory_maps[n][ptr] = info; - used_bytes[n] += alloc_bytes; - used_buffers[n]++; - total_bytes[n] += alloc_bytes; - } - return ptr; - } - - void bufferFree(cl::Buffer *ptr) - { - bufferFreeLocked(ptr, false); +void *MemoryManager::nativeAlloc(const size_t bytes) +{ + try { + return (void *)(new cl::Buffer(getContext(), CL_MEM_READ_WRITE, bytes)); + } catch(cl::Error err) { + CL_TO_AF_ERROR(err); } +} - void bufferFreeLocked(cl::Buffer *ptr, bool user_unlock) - { - int n = getActiveDeviceId(); - mem_iter iter = memory_maps[n].find(ptr); - - if (iter != memory_maps[n].end()) { - - iter->second.mngr_lock = false; - if ((iter->second).user_lock && !user_unlock) return; - - iter->second.user_lock = false; - - used_bytes[n] -= iter->second.bytes; - used_buffers[n]--; - } else { - destroy(ptr); // Free it because we are not sure what the size is - } +void MemoryManager::nativeFree(void *ptr) +{ + try { + delete (cl::Buffer *)ptr; + } catch(cl::Error err) { + CL_TO_AF_ERROR(err); } +} - void bufferPop(cl::Buffer *ptr) - { - int n = getActiveDeviceId(); - mem_iter iter = memory_maps[n].find(ptr); +static MemoryManager &getMemoryManager() +{ + static MemoryManager instance; + return instance; +} - if (iter != memory_maps[n].end()) { - iter->second.user_lock = true; - } else { +int MemoryManagerPinned::getActiveDeviceId() +{ + return opencl::getActiveDeviceId(); +} - mem_info info = { true, - true, - 100 }; //This number is not relevant +MemoryManagerPinned::MemoryManagerPinned() : + common::MemoryManager(getDeviceCount(), MAX_BUFFERS, MAX_BYTES, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG), + pinned_maps(getDeviceCount()) +{} - memory_maps[n][ptr] = info; - } - } +void 
*MemoryManagerPinned::nativeAlloc(const size_t bytes) +{ + void *ptr = NULL; + try { + cl::Buffer buf= cl::Buffer(getContext(), CL_MEM_ALLOC_HOST_PTR, bytes); + ptr = getQueue().enqueueMapBuffer(buf, true, CL_MAP_READ | CL_MAP_WRITE, 0, bytes); + pinned_maps[opencl::getActiveDeviceId()][ptr] = buf; + } catch(cl::Error err) { + CL_TO_AF_ERROR(err); + } + return ptr; +} - void bufferPush(cl::Buffer *ptr) - { - int n = getActiveDeviceId(); - mem_iter iter = memory_maps[n].find(ptr); +void MemoryManagerPinned::nativeFree(void *ptr) +{ + try { + int n = opencl::getActiveDeviceId(); + auto iter = pinned_maps[n].find(ptr); - if (iter != memory_maps[n].end()) { - iter->second.user_lock = false; + if (iter != pinned_maps[n].end()) { + getQueue().enqueueUnmapMemObject(pinned_maps[n][ptr], ptr); + pinned_maps[n].erase(iter); } - } - - void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, - size_t *lock_bytes, size_t *lock_buffers) - { - int n = getActiveDeviceId(); - if (alloc_bytes ) *alloc_bytes = total_bytes[n]; - if (alloc_buffers ) *alloc_buffers = memory_maps[n].size(); - if (lock_bytes ) *lock_bytes = used_bytes[n]; - if (lock_buffers ) *lock_buffers = used_buffers[n]; - } - template - T *memAlloc(const size_t &elements) - { - managerInit(); - return (T *)bufferAlloc(elements * sizeof(T)); - } - - template - void memFree(T *ptr) - { - return bufferFreeLocked((cl::Buffer *)ptr, false); - } - - template - void memFreeLocked(T *ptr, bool user_unlock) - { - return bufferFreeLocked((cl::Buffer *)ptr, user_unlock); - } - - template - void memLock(const T *ptr) - { - return bufferPop((cl::Buffer *)ptr); - } - - template - void memUnlock(const T *ptr) - { - return bufferPush((cl::Buffer *)ptr); + } catch(cl::Error err) { + CL_TO_AF_ERROR(err); } +} - // pinned memory manager - typedef struct { - cl::Buffer *buf; - mem_info info; - } pinned_info; +static MemoryManagerPinned &getMemoryManagerPinned() +{ + static MemoryManagerPinned instance; + return instance; +} - 
typedef std::map pinned_t; - typedef pinned_t::iterator pinned_iter; - pinned_t pinned_maps[DeviceManager::MAX_DEVICES]; - static size_t pinned_used_bytes = 0; +void setMemStepSize(size_t step_bytes) +{ + getMemoryManager().setMemStepSize(step_bytes); +} - static void pinnedDestroy(cl::Buffer *buf, void *ptr) - { - getQueue().enqueueUnmapMemObject(*buf, (void *)ptr); - destroy(buf); - } +size_t getMemStepSize(void) +{ + return getMemoryManager().getMemStepSize(); +} - void pinnedGarbageCollect() - { - int n = getActiveDeviceId(); - for(auto &iter : pinned_maps[n]) { - if (!(iter.second).info.mngr_lock) { - pinnedDestroy(iter.second.buf, iter.first); - } - } - pinned_iter memory_curr = pinned_maps[n].begin(); - pinned_iter memory_end = pinned_maps[n].end(); +void garbageCollect() +{ + getMemoryManager().garbageCollect(); +} - while(memory_curr != memory_end) { - if (memory_curr->second.info.mngr_lock) { - ++memory_curr; - } else { - memory_curr = pinned_maps[n].erase(memory_curr); - } - } +void printMemInfo(const char *msg, const int device) +{ + getMemoryManager().printInfo(msg, device); +} - } +template +T* memAlloc(const size_t &elements) +{ + return (T *)getMemoryManager().alloc(elements * sizeof(T)); +} - void *pinnedBufferAlloc(const size_t &bytes) - { - void *ptr = NULL; - int n = getActiveDeviceId(); - // Allocate the higher megabyte. Overhead of creating pinned memory is - // more so we want more resuable memory. 
- size_t alloc_bytes = divup(bytes, 1048576) * 1048576; - - if (bytes > 0) { - cl::Buffer *buf = NULL; - - // FIXME: Add better checks for garbage collection - // Perhaps look at total memory available as a metric - if (pinned_maps[n].size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) { - pinnedGarbageCollect(); - } +cl::Buffer *bufferAlloc(const size_t &bytes) +{ + return (cl::Buffer *)getMemoryManager().alloc(bytes); +} - for(pinned_iter iter = pinned_maps[n].begin(); - iter != pinned_maps[n].end(); ++iter) { +template +void memFree(T *ptr) +{ + return getMemoryManager().unlock((void *)ptr, false); +} - mem_info info = iter->second.info; - if (!info.mngr_lock && info.bytes == alloc_bytes) { - iter->second.info.mngr_lock = true; - pinned_used_bytes += alloc_bytes; - return iter->first; - } - } +void bufferFree(cl::Buffer *buf) +{ + return getMemoryManager().unlock((void *)buf, false); +} - try { - buf = new cl::Buffer(getContext(), CL_MEM_ALLOC_HOST_PTR, alloc_bytes); +template +void memFreeLocked(T *ptr, bool user_unlock) +{ + return getMemoryManager().unlock((void *)ptr, user_unlock); +} - ptr = getQueue().enqueueMapBuffer(*buf, true, CL_MAP_READ|CL_MAP_WRITE, - 0, alloc_bytes); - } catch(...) 
{ - pinnedGarbageCollect(); - buf = new cl::Buffer(getContext(), CL_MEM_ALLOC_HOST_PTR, alloc_bytes); +template +void memLock(const T *ptr) +{ + getMemoryManager().userLock((void *)ptr); +} - ptr = getQueue().enqueueMapBuffer(*buf, true, CL_MAP_READ|CL_MAP_WRITE, - 0, alloc_bytes); - } - mem_info info = {true, false, alloc_bytes}; - pinned_info pt = {buf, info}; - pinned_maps[n][ptr] = pt; - pinned_used_bytes += alloc_bytes; - } - return ptr; - } +template +void memUnlock(const T *ptr) +{ + getMemoryManager().userUnlock((void *)ptr); +} - void pinnedBufferFree(void *ptr) - { - int n = getActiveDeviceId(); - pinned_iter iter = pinned_maps[n].find(ptr); - if (iter != pinned_maps[n].end()) { - iter->second.info.mngr_lock = false; - pinned_used_bytes -= iter->second.info.bytes; - } else { - pinnedDestroy(iter->second.buf, ptr); // Free it because we are not sure what the size is - pinned_maps[n].erase(iter); - } - } +void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, + size_t *lock_bytes, size_t *lock_buffers) +{ + getMemoryManager().bufferInfo(alloc_bytes, alloc_buffers, + lock_bytes, lock_buffers); +} - template - T* pinnedAlloc(const size_t &elements) - { - managerInit(); - return (T *)pinnedBufferAlloc(elements * sizeof(T)); - } +template +T* pinnedAlloc(const size_t &elements) +{ + return (T *)getMemoryManagerPinned().alloc(elements * sizeof(T)); +} - template - void pinnedFree(T* ptr) - { - return pinnedBufferFree((void *) ptr); - } +template +void pinnedFree(T* ptr) +{ + return getMemoryManagerPinned().unlock((void *)ptr, false); +} #define INSTANTIATE(T) \ template T* memAlloc(const size_t &elements); \ diff --git a/src/backend/opencl/memory.hpp b/src/backend/opencl/memory.hpp index dce142805a..da27e0d8d5 100644 --- a/src/backend/opencl/memory.hpp +++ b/src/backend/opencl/memory.hpp @@ -15,12 +15,7 @@ namespace opencl { cl::Buffer *bufferAlloc(const size_t &bytes); - - // Need these as 2 separate function and not a default argument - // This is 
because it is used as the deleter in shared pointer - // which cannot support default arguments void bufferFree(cl::Buffer *buf); - void bufferFreeLocked(cl::Buffer *buf, bool user_unlock); template T *memAlloc(const size_t &elements); From a1754327e4223ebad411699e2833aaf890861846 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 11 Jan 2016 13:58:55 -0500 Subject: [PATCH 171/288] Remove unnecessary line from CMakeLists --- src/backend/cuda/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index fc9a75cb12..e13f8274f7 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -60,7 +60,6 @@ IF(UNIX) # Forcing STRICT ANSI should resolve a bunch of issues that NVIDIA seems to face with GCC compilers. ADD_DEFINITIONS(-D__STRICT_ANSI__) SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fvisibility=hidden) - REMOVE_DEFINITIONS(-std=c++0x) IF(${WITH_COVERAGE}) SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fprofile-arcs -Xcompiler -ftest-coverage -Xlinker -fprofile-arcs -Xlinker -ftest-coverage") ENDIF(${WITH_COVERAGE}) From 43d030dfd3e69082555af743b0b8b395c6ce00d1 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 11 Jan 2016 13:59:31 -0500 Subject: [PATCH 172/288] Cleaning up error messages in loading and saving files --- src/api/c/stream.cpp | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/api/c/stream.cpp b/src/api/c/stream.cpp index a7b5771ee0..17cc945520 100644 --- a/src/api/c/stream.cpp +++ b/src/api/c/stream.cpp @@ -249,12 +249,17 @@ static af_array checkVersionAndRead(const char *filename, const unsigned index) { char version = 0; - std::fstream fs(filename, std::fstream::in | std::fstream::binary); + std::string filenameStr = std::string(filename); + std::fstream fs(filenameStr, std::fstream::in | std::fstream::binary); // Throw exception if file is not open - if(!fs.is_open()) 
AF_ERROR("File failed to open", AF_ERR_ARG); + if(!fs.is_open()) { + std::string errStr = "Failed to open: " + filenameStr; + AF_ERROR(errStr.c_str(), AF_ERR_ARG); + } if(fs.peek() == std::fstream::traits_type::eof()) { - AF_ERROR("File is empty", AF_ERR_ARG); + std::string errStr = filenameStr + " is empty"; + AF_ERROR(errStr.c_str(), AF_ERR_ARG); } else { fs.read(&version, sizeof(char)); } @@ -270,13 +275,18 @@ int checkVersionAndFindIndex(const char *filename, const char *k) { char version = 0; std::string key(k); + std::string filenameStr(filename); + std::ifstream fs(filenameStr, std::ifstream::in | std::ifstream::binary); - std::ifstream fs(filename, std::ifstream::in | std::ifstream::binary); // Throw exception if file is not open - if(!fs.is_open()) AF_ERROR("File failed to open", AF_ERR_ARG); + if(!fs.is_open()) { + std::string errStr = "Failed to open: " + filenameStr; + AF_ERROR(errStr.c_str(), AF_ERR_ARG); + } if(fs.peek() == std::ifstream::traits_type::eof()) { - AF_ERROR("File is empty", AF_ERR_ARG); + std::string errStr = filenameStr + " is empty"; + AF_ERROR(errStr.c_str(), AF_ERR_ARG); } else { fs.read(&version, sizeof(char)); } From 73b7cacb0c8126ea7224062ca923e40cb2c1a9e7 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 11 Jan 2016 14:05:14 -0500 Subject: [PATCH 173/288] Fixing CUDA platform manager to sort devices in a more saner manner. --- src/backend/cuda/platform.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp index f5f6599419..72fc0bc75d 100644 --- a/src/backend/cuda/platform.cpp +++ b/src/backend/cuda/platform.cpp @@ -62,13 +62,13 @@ static inline int compute2cores(int major, int minor) return 0; } -// compare two cards based on (in order): -// 1. flops (theoretical) -// 2. total memory - +// Return true if greater, false if lesser. 
+// if equal, it continues to next comparison #define COMPARE(a,b,f) do { \ - return ((a)->f >= (b)->f); \ - } while (0); + if ((a)->f > (b)->f) return true; \ + if ((a)->f < (b)->f) return false; \ + break; \ + } while (0) static inline bool card_compare_compute(const cudaDevice_t &l, const cudaDevice_t &r) @@ -81,7 +81,7 @@ static inline bool card_compare_compute(const cudaDevice_t &l, const cudaDevice_ COMPARE(lc, rc, flops); COMPARE(lc, rc, prop.totalGlobalMem); COMPARE(lc, rc, nativeId); - return 0; + return false; } static inline bool card_compare_flops(const cudaDevice_t &l, const cudaDevice_t &r) @@ -94,7 +94,7 @@ static inline bool card_compare_flops(const cudaDevice_t &l, const cudaDevice_t COMPARE(lc, rc, prop.major); COMPARE(lc, rc, prop.minor); COMPARE(lc, rc, nativeId); - return 0; + return false; } static inline bool card_compare_mem(const cudaDevice_t &l, const cudaDevice_t &r) @@ -107,7 +107,7 @@ static inline bool card_compare_mem(const cudaDevice_t &l, const cudaDevice_t &r COMPARE(lc, rc, prop.major); COMPARE(lc, rc, prop.minor); COMPARE(lc, rc, nativeId); - return 0; + return false; } static inline bool card_compare_num(const cudaDevice_t &l, const cudaDevice_t &r) @@ -116,7 +116,7 @@ static inline bool card_compare_num(const cudaDevice_t &l, const cudaDevice_t &r const cudaDevice_t *rc = &r; COMPARE(lc, rc, nativeId); - return 0; + return false; } static const std::string get_system(void) @@ -370,16 +370,16 @@ void DeviceManager::sortDevices(sort_mode mode) { switch(mode) { case memory : - sort(cuDevices.begin(), cuDevices.end(), card_compare_mem); + std::stable_sort(cuDevices.begin(), cuDevices.end(), card_compare_mem); break; case flops : - sort(cuDevices.begin(), cuDevices.end(), card_compare_flops); + std::stable_sort(cuDevices.begin(), cuDevices.end(), card_compare_flops); break; case compute : - sort(cuDevices.begin(), cuDevices.end(), card_compare_compute); + std::stable_sort(cuDevices.begin(), cuDevices.end(), card_compare_compute); 
break; case none : default : - sort(cuDevices.begin(), cuDevices.end(), card_compare_num); + std::stable_sort(cuDevices.begin(), cuDevices.end(), card_compare_num); break; } } From d75b899d2e4dc88607fe69bbcdfdebf20a764765 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 11 Jan 2016 14:16:55 -0500 Subject: [PATCH 174/288] Adding lock to memory allocated using af_alloc_device / af::alloc --- include/af/array.h | 4 +++- include/af/device.h | 10 ++++++++++ src/api/c/device.cpp | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/af/array.h b/include/af/array.h index 03f3eeb23a..de746d9384 100644 --- a/include/af/array.h +++ b/include/af/array.h @@ -672,6 +672,8 @@ namespace af Get the device pointer from the array and lock the buffer in memory manager. @{ + The device memory returned by this function is not freed until unlock() is called. + \ingroup arrayfire_func \ingroup device_mat */ @@ -961,7 +963,7 @@ namespace af /// \brief Locks the device buffer in the memory manager. /// /// This method can be called to take control of the device pointer from the memory manager. - /// While a buffer is locked, the memory manager does not free the memory. + /// While a buffer is locked, the memory manager doesn't free the memory until unlock() is invoked. 
void lock() const; /// diff --git a/include/af/device.h b/include/af/device.h index 4a3006ffc7..28830675f8 100644 --- a/include/af/device.h +++ b/include/af/device.h @@ -108,6 +108,8 @@ namespace af /// \param[in] type is the type of the elements to allocate /// \returns the pointer to the memory /// + /// \note The device memory returned by this function is only freed if af::free() is called explicitly + AFAPI void *alloc(const size_t elements, const dtype type); /// \brief Allocates memory using ArrayFire's memory manager @@ -118,6 +120,8 @@ namespace af /// /// \note the size of the memory allocated is the number of \p elements * /// sizeof(type) + /// + /// \note The device memory returned by this function is only freed if af::free() is called explicitly template T* alloc(const size_t elements); /// @} @@ -126,6 +130,8 @@ namespace af /// /// \copydoc device_func_free /// \param[in] ptr the memory to free + /// + /// This function will free a device pointer even if it has been previously locked. AFAPI void free(const void *ptr); /// \ingroup device_func_pinned @@ -292,11 +298,15 @@ extern "C" { /** \ingroup device_func_alloc + + The device memory returned by this function can only be freed using af_free_device */ AFAPI af_err af_alloc_device(void **ptr, const dim_t bytes); /** \ingroup device_func_free + + This function will free a device pointer even if it has been previously locked. 
*/ AFAPI af_err af_free_device(void *ptr); diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index c37e2934ae..24a8ad5d00 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -298,6 +298,7 @@ af_err af_alloc_device(void **ptr, const dim_t bytes) try { AF_CHECK(af_init()); *ptr = (void *)memAlloc(bytes); + memLock((const char *)*ptr); } CATCHALL; return AF_SUCCESS; } From d5f3bf13f7c687519ec4309392051b4d88f44a4b Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 11 Jan 2016 15:00:25 -0500 Subject: [PATCH 175/288] Adding documentation for AF_MEM_DEBUG --- docs/pages/configuring_arrayfire_environment.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/pages/configuring_arrayfire_environment.md b/docs/pages/configuring_arrayfire_environment.md index 7e197e4954..37327ac93c 100644 --- a/docs/pages/configuring_arrayfire_environment.md +++ b/docs/pages/configuring_arrayfire_environment.md @@ -97,3 +97,15 @@ detailed. This helps in locating the exact failure. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ AF_PRINT_ERRORS=1 ./myprogram_opencl ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +AF_MEM_DEBUG (#af_mem_debug) +------------------------------------------------------------------------------- + +When AF_MEM_DEBUG is set to 1 (or anything not equal to 0), the caching mechanism in the memory manager is disabled. +The device buffers are allocated using native functions as needed and freed when going out of scope. + +When the environment variable is not set, it is treated to be zero. 
+ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +AF_MEM_DEBUG=1 ./myprogram +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From f9a83360e2443476357a50763010f476ac11fb48 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 11 Jan 2016 15:01:09 -0500 Subject: [PATCH 176/288] Additional sanitizing for mutex locks Use std::recursive_mutex instead of std::mutex for the cases when a mutex lock is called from within another call. Make lock_guard the first call to all the functions --- src/backend/MemoryManager.cpp | 11 ++++++----- src/backend/MemoryManager.hpp | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp index 621ce624e7..696c9af621 100644 --- a/src/backend/MemoryManager.cpp +++ b/src/backend/MemoryManager.cpp @@ -25,6 +25,7 @@ MemoryManager::MemoryManager(int num_devices, unsigned MAX_BUFFERS, unsigned MAX memory(num_devices), debug_mode(debug) { + lock_guard_t lock(this->memory_mutex); std::string env_var = getEnvVar("AF_MEM_DEBUG"); if (!env_var.empty()) { this->debug_mode = env_var[0] != '0'; @@ -36,6 +37,7 @@ void MemoryManager::garbageCollect() { if (this->debug_mode) return; + lock_guard_t lock(this->memory_mutex); memory_info& current = this->getCurrentMemoryInfo(); for(buffer_iter iter = current.map.begin(); @@ -66,8 +68,8 @@ void MemoryManager::garbageCollect() void MemoryManager::unlock(void *ptr, bool user_unlock) { - memory_info& current = this->getCurrentMemoryInfo(); lock_guard_t lock(this->memory_mutex); + memory_info& current = this->getCurrentMemoryInfo(); buffer_iter iter = current.map.find((void *)ptr); @@ -93,14 +95,13 @@ void MemoryManager::unlock(void *ptr, bool user_unlock) void *MemoryManager::alloc(const size_t bytes) { - memory_info& current = this->getCurrentMemoryInfo(); + lock_guard_t lock(this->memory_mutex); void *ptr = NULL; size_t alloc_bytes = this->debug_mode ? 
bytes : (divup(bytes, mem_step_size) * mem_step_size); if (bytes > 0) { - - lock_guard_t lock(this->memory_mutex); + memory_info& current = this->getCurrentMemoryInfo(); // There is no memory cache in debug mode if (!this->debug_mode) { @@ -240,8 +241,8 @@ void MemoryManager::printInfo(const char *msg, const int device) void MemoryManager::bufferInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers) { - memory_info current = this->getCurrentMemoryInfo(); lock_guard_t lock(this->memory_mutex); + memory_info current = this->getCurrentMemoryInfo(); if (alloc_bytes ) *alloc_bytes = current.total_bytes; if (alloc_buffers ) *alloc_buffers = current.map.size(); if (lock_bytes ) *lock_bytes = current.lock_bytes; diff --git a/src/backend/MemoryManager.hpp b/src/backend/MemoryManager.hpp index 1f87ea2dfe..cfcc60f2bb 100644 --- a/src/backend/MemoryManager.hpp +++ b/src/backend/MemoryManager.hpp @@ -16,8 +16,8 @@ namespace common { -typedef std::mutex mutex_t; -typedef std::lock_guard lock_guard_t; +typedef std::recursive_mutex mutex_t; +typedef std::lock_guard lock_guard_t; class MemoryManager { From 0638f3f0d99ac0297031ffb00006e2e72fc9d297 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 11 Jan 2016 15:28:02 -0500 Subject: [PATCH 177/288] Removing unnecessary returns from void functions --- src/backend/MemoryManager.hpp | 2 +- src/backend/cpu/memory.cpp | 2 +- src/backend/cuda/memory.cpp | 2 +- src/backend/opencl/memory.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/MemoryManager.hpp b/src/backend/MemoryManager.hpp index cfcc60f2bb..5de8e4d823 100644 --- a/src/backend/MemoryManager.hpp +++ b/src/backend/MemoryManager.hpp @@ -84,7 +84,7 @@ class MemoryManager virtual void nativeFree(void *ptr) { - return free((void *)ptr); + free((void *)ptr); } virtual ~MemoryManager() diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 2687b3018b..4af348692f 100644 --- 
a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -102,7 +102,7 @@ void memFree(T *ptr) template void memFreeLocked(T *ptr, bool user_unlock) { - return getMemoryManager().unlock((void *)ptr, user_unlock); + getMemoryManager().unlock((void *)ptr, user_unlock); } template diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index 43c37e016f..a3e995f2e7 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -158,7 +158,7 @@ void memFree(T *ptr) template void memFreeLocked(T *ptr, bool user_unlock) { - return getMemoryManager().unlock((void *)ptr, user_unlock); + getMemoryManager().unlock((void *)ptr, user_unlock); } template diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index 45b8e96ba4..9e1344d7ba 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -195,7 +195,7 @@ void bufferFree(cl::Buffer *buf) template void memFreeLocked(T *ptr, bool user_unlock) { - return getMemoryManager().unlock((void *)ptr, user_unlock); + getMemoryManager().unlock((void *)ptr, user_unlock); } template From 1520dc3bea5185d5ac000b18d93f8a51cd1d6e39 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 11 Jan 2016 16:36:18 -0500 Subject: [PATCH 178/288] Fixing issue where garbageCollect was only called on current device --- src/backend/cpu/memory.cpp | 5 ++++- src/backend/cuda/memory.cpp | 10 ++++++++-- src/backend/opencl/memory.cpp | 10 +++++++--- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 4af348692f..c387b68b71 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -36,7 +36,10 @@ class MemoryManager : public common::MemoryManager ~MemoryManager() { common::lock_guard_t lock(this->memory_mutex); - this->garbageCollect(); + for (int n = 0; n < getDeviceCount(); n++) { + cpu::setDevice(n); + this->garbageCollect(); + } } }; diff --git a/src/backend/cuda/memory.cpp 
b/src/backend/cuda/memory.cpp index a3e995f2e7..0e3fb5afde 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -44,7 +44,10 @@ class MemoryManager : public common::MemoryManager ~MemoryManager() { common::lock_guard_t lock(this->memory_mutex); - this->garbageCollect(); + for (int n = 0; n < getDeviceCount(); n++) { + cuda::setDevice(n); + this->garbageCollect(); + } } }; @@ -58,7 +61,10 @@ class MemoryManagerPinned : public common::MemoryManager ~MemoryManagerPinned() { common::lock_guard_t lock(this->memory_mutex); - this->garbageCollect(); + for (int n = 0; n < getDeviceCount(); n++) { + cuda::setDevice(n); + this->garbageCollect(); + } } }; diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index 9e1344d7ba..8a48a48c02 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -39,7 +39,10 @@ class MemoryManager : public common::MemoryManager ~MemoryManager() { common::lock_guard_t lock(this->memory_mutex); - this->garbageCollect(); + for (int n = 0; n < getDeviceCount(); n++) { + opencl::setDevice(n); + this->garbageCollect(); + } } }; @@ -60,8 +63,9 @@ class MemoryManagerPinned : public common::MemoryManager ~MemoryManagerPinned() { common::lock_guard_t lock(this->memory_mutex); - this->garbageCollect(); - for (int n = 0; n < (int)pinned_maps.size(); n++) { + for (int n = 0; n < getDeviceCount(); n++) { + opencl::setDevice(n); + this->garbageCollect(); auto pinned_curr_iter = pinned_maps[n].begin(); auto pinned_end_iter = pinned_maps[n].end(); while (pinned_curr_iter != pinned_end_iter) { From aaf554e7162df65bb90412cf08a95a30ebbc43d3 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 11 Jan 2016 17:04:20 -0500 Subject: [PATCH 179/288] BUGFIX: Initialize buffer counts to 0 --- src/backend/MemoryManager.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp index 696c9af621..cea4ae6b76 100644 --- 
a/src/backend/MemoryManager.cpp +++ b/src/backend/MemoryManager.cpp @@ -31,6 +31,12 @@ MemoryManager::MemoryManager(int num_devices, unsigned MAX_BUFFERS, unsigned MAX this->debug_mode = env_var[0] != '0'; } if (this->debug_mode) mem_step_size = 1; + + for (int n = 0; n < num_devices; n++) { + memory[n].total_bytes = 0; + memory[n].lock_bytes = 0; + memory[n].lock_buffers = 0; + } } void MemoryManager::garbageCollect() From 9d0c159d249e512f86c9c8a79f483213772abaa5 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 11 Jan 2016 17:04:38 -0500 Subject: [PATCH 180/288] af_set_device now only warns when device > 0 on CPU --- src/backend/cpu/platform.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/cpu/platform.cpp b/src/backend/cpu/platform.cpp index 19942f0312..0039b208d9 100644 --- a/src/backend/cpu/platform.cpp +++ b/src/backend/cpu/platform.cpp @@ -237,11 +237,11 @@ int getDeviceCount() int setDevice(int device) { static bool flag; - if(!flag) { - printf("WARNING: af_set_device not supported for CPU\n"); + if(!flag && device != 0) { + printf("WARNING af_set_device(device): device can only be 0 for CPU\n"); flag = 1; } - return 1; + return 0; } int getActiveDeviceId() From db14451e7e2ac0784fc7ac475b65feb8697bf48a Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Sun, 10 Jan 2016 17:26:46 -0500 Subject: [PATCH 181/288] Re-enable disabled sort tests from issue #995 --- test/sort_by_key.cpp | 5 ++--- test/sort_index.cpp | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/test/sort_by_key.cpp b/test/sort_by_key.cpp index 3d82b9fd90..289e407ad9 100644 --- a/test/sort_by_key.cpp +++ b/test/sort_by_key.cpp @@ -116,9 +116,8 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const SORT_INIT(Sort10x10False, sort_by_key_2D, false, 2, 3); SORT_INIT(Sort1000True, sort_by_key_1000, true, 0, 1); SORT_INIT(SortMedTrue, sort_by_key_med, true, 0, 1); - // FIXME: below two tests are 
disabled temporarily until issue#995 is fixed - //SORT_INIT(Sort1000False, sort_by_key_1000, false, 2, 3); - //SORT_INIT(SortMedFalse, sort_by_key_med, false, 2, 3); + SORT_INIT(Sort1000False, sort_by_key_1000, false, 2, 3); + SORT_INIT(SortMedFalse, sort_by_key_med, false, 2, 3); // Takes too much time in current implementation. Enable when everything is parallel //SORT_INIT(SortLargeTrue, sort_by_key_large, true, 0, 1); //SORT_INIT(SortLargeFalse, sort_by_key_large, false, 2, 3); diff --git a/test/sort_index.cpp b/test/sort_index.cpp index 0711e8b494..abe7910a58 100644 --- a/test/sort_index.cpp +++ b/test/sort_index.cpp @@ -117,9 +117,8 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const SORT_INIT(Sort10x10False, sort_10x10, false, 2, 3); SORT_INIT(Sort1000True, sort_1000, true, 0, 1); SORT_INIT(SortMedTrue, sort_med1, true, 0, 1); - // FIXME: below two tests are disabled temporarily until issue#995 is fixed - //SORT_INIT(Sort1000False, sort_1000, false, 2, 3); - //SORT_INIT(SortMedFalse, sort_med1, false, 2, 3); + SORT_INIT(Sort1000False, sort_1000, false, 2, 3); + SORT_INIT(SortMedFalse, sort_med1, false, 2, 3); // Takes too much time in current implementation. 
Enable when everything is parallel //SORT_INIT(SortMed5True, sort_med, true, 0, 1); //SORT_INIT(SortMed5False, sort_med, false, 2, 3); From 6da71e59db4d2839585c63187ec7dc7a7d4dec2d Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 11 Jan 2016 14:22:28 -0500 Subject: [PATCH 182/288] BUGFIX Handle 16-bit data in saveImage --- src/api/c/imageio.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/api/c/imageio.cpp b/src/api/c/imageio.cpp index e372cd7e64..5471305cee 100644 --- a/src/api/c/imageio.cpp +++ b/src/api/c/imageio.cpp @@ -299,9 +299,16 @@ af_err af_save_image(const char* filename, const af_array in_) AF_CHECK(af_mul(&in, in_, c255, false)); AF_CHECK(af_release_array(c255)); free_in = true; - } else { + } else if(max_real < 256) { in = in_; } + else if (max_real < 65536) { + af_array c255 = 0; + AF_CHECK(af_constant(&c255, 257.0, info.ndims(), info.dims().get(), f32)); + AF_CHECK(af_div(&in, in_, c255, false)); + AF_CHECK(af_release_array(c255)); + free_in = true; + } // FI = row major | AF = column major uint nDstPitch = FreeImage_GetPitch(pResultBitmap); From b14ae20f39c1bb41aa6298da6f6cc4a628b10c5f Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 11 Jan 2016 14:26:27 -0500 Subject: [PATCH 183/288] Fix saveImageNative for 1-channel images --- src/api/c/imageio2.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api/c/imageio2.cpp b/src/api/c/imageio2.cpp index a1374a2944..ff7a4a8d34 100644 --- a/src/api/c/imageio2.cpp +++ b/src/api/c/imageio2.cpp @@ -237,7 +237,7 @@ static void save_t(T* pDstLine, const af_array in, const dim4 dims, uint nDstPit for (uint y = 0; y < fi_h; ++y) { for (uint x = 0; x < fi_w; ++x) { if(channels == 1) { - *(pDstLine + x * step + FI_RGBA_RED) = (T) pSrc0[indx]; // r -> 0 + *(pDstLine + x * step) = (T) pSrc0[indx]; // r -> 0 } else if(channels >=3) { if((af_dtype) af::dtype_traits::af_type == u8) { *(pDstLine + x * step + FI_RGBA_RED ) = (T) pSrc0[indx]; // r 
-> 0 From a6a4cdbc1c0688e9eb72fb09332962a2ac2beaf5 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 11 Jan 2016 14:31:18 -0500 Subject: [PATCH 184/288] Update test data submodule commit --- test/data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data b/test/data index 4a735db351..d134732012 160000 --- a/test/data +++ b/test/data @@ -1 +1 @@ -Subproject commit 4a735db3515db3f8f914e0b69fa2e11add9cd50f +Subproject commit d1347320125a0315a4ef03e63630b5b3249d189d From 88cf4713e3040450e0200faa1d65ac9079296243 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 11 Jan 2016 14:31:57 -0500 Subject: [PATCH 185/288] Add tests for 16-bit images for ImageIO+Native --- test/imageio.cpp | 140 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 137 insertions(+), 3 deletions(-) diff --git a/test/imageio.cpp b/test/imageio.cpp index d19aac346c..4029de5a1b 100644 --- a/test/imageio.cpp +++ b/test/imageio.cpp @@ -36,8 +36,6 @@ typedef ::testing::Types TestTypes; // register the type list TYPED_TEST_CASE(ImageIO, TestTypes); -// Disable tests if FreeImage is not found -#if defined(WITH_FREEIMAGE) void loadImageTest(string pTestFile, string pImageFile, const bool isColor) { if (noDoubleTests()) return; @@ -251,4 +249,140 @@ TEST(ImageMem, SaveMemBMP) af::deleteImageMem(savedMem); } -#endif // WITH_FREEIMAGE +TEST(ImageIO, LoadImage16CPP) +{ + if (noImageIOTests()) return; + + vector numDims; + + vector > in; + vector > tests; + readTests(string(TEST_DIR"/imageio/color_seq_16.test"),numDims,in,tests); + + af::dim4 dims = numDims[0]; + + af::array img = af::loadImage(string(TEST_DIR"/imageio/color_seq_16.png").c_str(), true); + ASSERT_EQ(img.type(), f32); // loadImage should always return float + + // Get result + float *imgData = new float[dims.elements()]; + img.host((void*)imgData); + + // Compare result + size_t nElems = in[0].size(); + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(in[0][elIter], 
imgData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] imgData; +} + +TEST(ImageIO, SaveImage16CPP) +{ + if (noImageIOTests()) return; + + af::dim4 dims(16, 24, 3); + + af::array input = af::randu(dims, u16); + af::array input_255 = (input / 257).as(u16); + + af::saveImage("saveImage16CPP.png", input); + + af::array img = af::loadImage("saveImage16CPP.png", true); + ASSERT_EQ(img.type(), f32); // loadImage should always return float + + ASSERT_FALSE(af::anyTrue(abs(img - input_255))); +} + +//////////////////////////////////////////////////////////////////////////////// +// Image IO Native Tests +//////////////////////////////////////////////////////////////////////////////// + +template +void loadImageNativeCPPTest(string pTestFile, string pImageFile) +{ + if (noImageIOTests()) return; + + vector numDims; + + vector > in; + vector > tests; + readTests(pTestFile,numDims,in,tests); + + af::dim4 dims = numDims[0]; + af::array img = af::loadImageNative(pImageFile.c_str()); + ASSERT_EQ(img.type(), (af_dtype)af::dtype_traits::af_type); + + // Get result + T *imgData = new T[dims.elements()]; + img.host((void*)imgData); + + // Compare result + size_t nElems = in[0].size(); + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(in[0][elIter], imgData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] imgData; +} + +TEST(ImageIONative, LoadImageNative8CPP) +{ + loadImageNativeCPPTest(string(TEST_DIR"/imageio/color_small.test"), + string(TEST_DIR"/imageio/color_small.png")); +} + +TEST(ImageIONative, LoadImageNative16SmallCPP) +{ + loadImageNativeCPPTest(string(TEST_DIR"/imageio/color_small_16.test"), + string(TEST_DIR"/imageio/color_small_16.png")); +} + +TEST(ImageIONative, LoadImageNative16ColorCPP) +{ + loadImageNativeCPPTest(string(TEST_DIR"/imageio/color_seq_16.test"), + string(TEST_DIR"/imageio/color_seq_16.png")); +} + +TEST(ImageIONative, LoadImageNative16GrayCPP) +{ + 
loadImageNativeCPPTest(string(TEST_DIR"/imageio/gray_seq_16.test"), + string(TEST_DIR"/imageio/gray_seq_16.png")); +} + +template +void saveLoadImageNativeCPPTest(af::dim4 dims) +{ + if (noImageIOTests()) return; + + af::array input = af::randu(dims, (af_dtype)af::dtype_traits::af_type); + + af::saveImageNative("saveImageNative.png", input); + + af::array loaded = af::loadImageNative("saveImageNative.png"); + ASSERT_EQ(loaded.type(), input.type()); + + ASSERT_FALSE(af::anyTrue(input - loaded)); +} + +TEST(ImageIONative, SaveLoadImageNative8CPP) +{ + saveLoadImageNativeCPPTest(af::dim4(480, 720, 3, 1)); +} + +TEST(ImageIONative, SaveLoadImageNative16SmallCPP) +{ + saveLoadImageNativeCPPTest(af::dim4(8, 12, 3, 1)); +} + +TEST(ImageIONative, SaveLoadImageNative16ColorCPP) +{ + saveLoadImageNativeCPPTest(af::dim4(480, 720, 3, 1)); +} + +TEST(ImageIONative, SaveLoadImageNative16GrayCPP) +{ + saveLoadImageNativeCPPTest(af::dim4(24, 32, 1, 1)); +} From 968ae4e80ce8e6263fdc3f4381ae8b895df44bc4 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 11 Jan 2016 17:11:44 -0500 Subject: [PATCH 186/288] Handle CUDA devices locked in exclusive mode * When the default device 0 is exclusively locked, ArrayFire will try to pick a different device * When the user uses setDevice to set a device that is locked, then ArrayFire will error out * Handle such a case when freeing memory in memory manager destructor Signed-off-by: Shehzan Mohammed --- src/backend/cuda/err_cuda.hpp | 37 +++++++++++----------- src/backend/cuda/memory.cpp | 12 ++++++-- src/backend/cuda/platform.cpp | 58 +++++++++++++++++++++++++++++------ 3 files changed, 77 insertions(+), 30 deletions(-) diff --git a/src/backend/cuda/err_cuda.hpp b/src/backend/cuda/err_cuda.hpp index a975fb5336..dd87bdfc2b 100644 --- a/src/backend/cuda/err_cuda.hpp +++ b/src/backend/cuda/err_cuda.hpp @@ -17,22 +17,23 @@ __AF_FILENAME__, __LINE__, "CUDA"); \ } while(0) -#define CUDA_CHECK(fn) do { \ - cudaError_t _cuda_error = fn; \ - if 
(_cuda_error != cudaSuccess) { \ - char cuda_err_msg[1024]; \ - snprintf(cuda_err_msg, \ - sizeof(cuda_err_msg), \ - "CUDA Error (%d): %s\n", \ - (int)(_cuda_error), \ - cudaGetErrorString( \ - cudaGetLastError())); \ - \ - if (_cuda_error == cudaErrorMemoryAllocation) { \ - AF_ERROR(cuda_err_msg, AF_ERR_NO_MEM); \ - } else { \ - AF_ERROR(cuda_err_msg, \ - AF_ERR_INTERNAL); \ - } \ - } \ +#define CUDA_CHECK(fn) do { \ + cudaError_t _cuda_error = fn; \ + if (_cuda_error != cudaSuccess) { \ + char cuda_err_msg[1024]; \ + snprintf(cuda_err_msg, \ + sizeof(cuda_err_msg), \ + "CUDA Error (%d): %s\n", \ + (int)(_cuda_error), \ + cudaGetErrorString( \ + cudaGetLastError())); \ + \ + if (_cuda_error == cudaErrorMemoryAllocation) { \ + AF_ERROR(cuda_err_msg, AF_ERR_NO_MEM); \ + } else if (_cuda_error == cudaErrorDevicesUnavailable) {\ + AF_ERROR(cuda_err_msg, AF_ERR_DRIVER); \ + } else { \ + AF_ERROR(cuda_err_msg, AF_ERR_INTERNAL); \ + } \ + } \ } while(0) diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index 0e3fb5afde..20e25475cf 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -45,8 +45,16 @@ class MemoryManager : public common::MemoryManager { common::lock_guard_t lock(this->memory_mutex); for (int n = 0; n < getDeviceCount(); n++) { - cuda::setDevice(n); - this->garbageCollect(); + try { + cuda::setDevice(n); + this->garbageCollect(); + } catch(AfError err) { + if(err.getError() == AF_ERR_DRIVER) { // Can happen from cudaErrorDevicesUnavailable + continue; + } else { + throw err; + } + } } } }; diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp index 72fc0bc75d..6919a04158 100644 --- a/src/backend/cuda/platform.cpp +++ b/src/backend/cuda/platform.cpp @@ -386,20 +386,58 @@ void DeviceManager::sortDevices(sort_mode mode) int DeviceManager::setActiveDevice(int device, int nId) { - if(device > (int)cuDevices.size()) { - return -1; - } else { - int old = activeDev; - if(nId == -1) nId = 
getDeviceNativeId(device); - CUDA_CHECK(cudaSetDevice(nId)); - activeDev = device; + static bool first = true; - if(!streams[device]) { - CUDA_CHECK(cudaStreamCreate(&streams[device])); - } + int numDevices = cuDevices.size(); + + if(device > numDevices) return -1; + int old = activeDev; + if(nId == -1) nId = getDeviceNativeId(device); + CUDA_CHECK(cudaSetDevice(nId)); + cudaError_t err = cudaStreamCreate(&streams[device]); + activeDev = device; + + if (err == cudaSuccess) return old; + + // Comes when user sets device + // If success, return. Else throw error + if (!first) { + CUDA_CHECK(err); return old; } + + // Comes only when first is true. Set it to false + first = false; + + while(device < numDevices) { + // Check for errors other than DevicesUnavailable + // If success, return. Else throw error + // If DevicesUnavailable, try other devices (while loop below) + if (err != cudaErrorDevicesUnavailable) { + CUDA_CHECK(err); + activeDev = device; + return old; + } + cudaGetLastError(); // Reset error stack + printf("Warning: Device %d is unavailable. Incrementing to next device \n", device); + + // Comes here if the device is in exclusive mode or + // otherwise fails streamCreate with this error.
+ // All other errors will error out + device++; + + // Can't call getNativeId here as it will cause an infinite loop with the constructor + nId = cuDevices[device].nativeId; + + CUDA_CHECK(cudaSetDevice(nId)); + err = cudaStreamCreate(&streams[device]); + } + + // If all devices fail with DevicesUnavailable, then throw this error + CUDA_CHECK(err); + + return old; } void sync(int device) From cc9018e402e2370438c1558c19e7ff21447f36d7 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Tue, 12 Jan 2016 10:53:37 -0500 Subject: [PATCH 187/288] Add try/catch around cuda::setDevice in Pinned Memory Manager --- src/backend/cuda/memory.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index 20e25475cf..69eb8f8895 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -70,8 +70,16 @@ class MemoryManagerPinned : public common::MemoryManager { common::lock_guard_t lock(this->memory_mutex); for (int n = 0; n < getDeviceCount(); n++) { - cuda::setDevice(n); - this->garbageCollect(); + try { + cuda::setDevice(n); + this->garbageCollect(); + } catch(AfError err) { + if(err.getError() == AF_ERR_DRIVER) { // Can happen from cudaErrorDevicesUnavailable + continue; + } else { + throw err; + } + } } } }; From 904d3e0b8d8d85f07d650010662ba9cebf1c0c6b Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Tue, 12 Jan 2016 11:53:14 -0500 Subject: [PATCH 188/288] Using device independent vector for cuda Pinned Memory Manager --- src/api/c/imageio.cpp | 5 +++-- src/backend/cuda/memory.cpp | 21 +++++++-------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/api/c/imageio.cpp b/src/api/c/imageio.cpp index 5471305cee..748ddbc58e 100644 --- a/src/api/c/imageio.cpp +++ b/src/api/c/imageio.cpp @@ -301,13 +301,14 @@ af_err af_save_image(const char* filename, const af_array in_) free_in = true; } else if(max_real < 256) { in = in_; - } - else if (max_real < 
65536) { + } else if (max_real < 65536) { af_array c255 = 0; AF_CHECK(af_constant(&c255, 257.0, info.ndims(), info.dims().get(), f32)); AF_CHECK(af_div(&in, in_, c255, false)); AF_CHECK(af_release_array(c255)); free_in = true; + } else { + in = in_; } // FI = row major | AF = column major diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index 69eb8f8895..15786d9498 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -59,6 +59,10 @@ class MemoryManager : public common::MemoryManager } }; +// CUDA Pinned Memory does not depend on device +// So we pass 1 as numDevices to the constructor so that it creates 1 vector +// of memory_info +// When allocating and freeing, it doesn't really matter which device is active class MemoryManagerPinned : public common::MemoryManager { int getActiveDeviceId(); @@ -69,18 +73,7 @@ class MemoryManagerPinned : public common::MemoryManager ~MemoryManagerPinned() { common::lock_guard_t lock(this->memory_mutex); - for (int n = 0; n < getDeviceCount(); n++) { - try { - cuda::setDevice(n); - this->garbageCollect(); - } catch(AfError err) { - if(err.getError() == AF_ERR_DRIVER) { // Can happen from cudaErrorDevicesUnavailable - continue; - } else { - throw err; - } - } - } + this->garbageCollect(); } }; @@ -116,11 +109,11 @@ static MemoryManager &getMemoryManager() int MemoryManagerPinned::getActiveDeviceId() { - return cuda::getActiveDeviceId(); + return 0; // pinned uses a single vector } MemoryManagerPinned::MemoryManagerPinned() : - common::MemoryManager(getDeviceCount(), MAX_BUFFERS, MAX_BYTES, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) + common::MemoryManager(1, MAX_BUFFERS, MAX_BYTES, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) {} void *MemoryManagerPinned::nativeAlloc(const size_t bytes) From a8b831b5022c6c8a840fccbaa0d5e12107aecd81 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Wed, 13 Jan 2016 11:15:15 -0500 Subject: [PATCH 189/288] Cleanup for opencl-cpu offload --- src/backend/cuda/blas.cpp | 
22 ---------------------- src/backend/opencl/Array.hpp | 21 +++------------------ src/backend/opencl/blas.cpp | 27 --------------------------- test/backend.cpp | 16 +++++++++++----- test/blas.cpp | 1 - 5 files changed, 14 insertions(+), 73 deletions(-) diff --git a/src/backend/cuda/blas.cpp b/src/backend/cuda/blas.cpp index 1e5dd5de39..9d3b9ca7b7 100644 --- a/src/backend/cuda/blas.cpp +++ b/src/backend/cuda/blas.cpp @@ -200,28 +200,6 @@ Array matmul(const Array &lhs, const Array &rhs, } -// Keeping this around for future reference -//template -//Array dot_(const Array &lhs, const Array &rhs, -// af_mat_prop optLhs, af_mat_prop optRhs) -//{ -// int N = lhs.dims()[0]; -// -// T out; -// -// CUBLAS_CHECK((dot_func()( -// getHandle(), -// N, -// lhs.get(), lhs.strides()[0], -// rhs.get(), rhs.strides()[0], -// &out))); -// -// if(both_conjugate) -// return createValueArray(af::dim4(1), conj(out)); -// else -// return createValueArray(af::dim4(1), out); -//} - template Array dot(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp index ce9c3c7fbb..2793d5e099 100644 --- a/src/backend/opencl/Array.hpp +++ b/src/backend/opencl/Array.hpp @@ -210,17 +210,6 @@ namespace opencl JIT::Node_ptr getNode() const; - private: - bool is_const() const - { - return true; - } - - bool is_const() - { - return false; - } - public: std::shared_ptr getMappedPtr() const { @@ -237,13 +226,9 @@ namespace opencl T *ptr = nullptr; try { if(ptr == nullptr) { - if(is_const()) { - ptr = (T*)getQueue().enqueueMapBuffer(*const_cast(get()), true, CL_MAP_READ, - getOffset(), getDataDims().elements() * sizeof(T)); - } else { - ptr = (T*)getQueue().enqueueMapBuffer(*(get()), true, CL_MAP_READ|CL_MAP_WRITE, - getOffset(), getDataDims().elements() * sizeof(T)); - } + ptr = (T*)getQueue().enqueueMapBuffer(*const_cast(get()), + true, CL_MAP_READ|CL_MAP_WRITE, + getOffset(), getDataDims().elements() * sizeof(T)); } 
} catch(cl::Error err) { CL_TO_AF_ERROR(err); diff --git a/src/backend/opencl/blas.cpp b/src/backend/opencl/blas.cpp index 15e2373783..97a5c1ab70 100644 --- a/src/backend/opencl/blas.cpp +++ b/src/backend/opencl/blas.cpp @@ -177,33 +177,6 @@ Array matmul(const Array &lhs, const Array &rhs, return out; } -// Keeping this around for future reference -//template -//Array dot_(const Array &lhs, const Array &rhs, -// af_mat_prop optLhs, af_mat_prop optRhs) -//{ -// initBlas(); -// -// int N = lhs.dims()[0]; -// dot_func dot; -// cl::Event event; -// Array out = createEmptyArray(af::dim4(1)); -// cl::Buffer scratch(getContext(), CL_MEM_READ_WRITE, sizeof(T) * N); -// CLBLAS_CHECK( -// dot(N, -// (*out.get())(), out.getOffset(), -// (*lhs.get())(), lhs.getOffset(), lhs.strides()[0], -// (*rhs.get())(), rhs.getOffset(), rhs.strides()[0], -// scratch(), -// 1, &getQueue()(), 0, nullptr, &event()) -// ); -// -// if(both_conjugate) -// transpose_inplace(out, true); -// -// return out; -//} - template Array dot(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) diff --git a/test/backend.cpp b/test/backend.cpp index 4bb5cdf7fe..78b64309db 100644 --- a/test/backend.cpp +++ b/test/backend.cpp @@ -21,11 +21,8 @@ using std::string; using std::vector; -const char *getActiveBackendString() +const char *getActiveBackendString(af_backend active) { - af_backend active = (af_backend)0; - af_get_active_backend(&active); - switch(active) { case AF_BACKEND_CPU : return "AF_BACKEND_CPU"; case AF_BACKEND_CUDA : return "AF_BACKEND_CUDA"; @@ -39,11 +36,20 @@ void testFunction() { af_info(); - printf("Active Backend Enum = %s\n", getActiveBackendString()); + af_backend activeBackend = (af_backend)0; + af_get_active_backend(&activeBackend); + + printf("Active Backend Enum = %s\n", getActiveBackendString(activeBackend)); af_array outArray = 0; dim_t dims[] = {32, 32}; ASSERT_EQ(AF_SUCCESS, af_randu(&outArray, 2, dims, (af_dtype) af::dtype_traits::af_type)); + + // Verify 
backends returned by array and by function are the same + af_backend arrayBackend = (af_backend)0; + af_get_backend_id(&arrayBackend, outArray); + ASSERT_EQ(arrayBackend, activeBackend); + // cleanup if(outArray != 0) ASSERT_EQ(AF_SUCCESS, af_release_array(outArray)); } diff --git a/test/blas.cpp b/test/blas.cpp index b5d92f1073..507cc6dc7b 100644 --- a/test/blas.cpp +++ b/test/blas.cpp @@ -36,7 +36,6 @@ template void MatMulCheck(string TestFile) { if (noDoubleTests()) return; - af::info(); using std::vector; vector numDims; From f6e309bfbc9a0571ae805576acca0a4e3e0c1d9b Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Tue, 12 Jan 2016 17:43:20 -0500 Subject: [PATCH 190/288] Clean up cusolver finding in cmake --- src/backend/cuda/CMakeLists.txt | 52 +++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index 601e6c9022..4c74070492 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -70,7 +70,30 @@ ENDIF() ADD_DEFINITIONS(-DAF_CUDA) -IF(${CUDA_VERSION_MAJOR} LESS 7 OR ${CUDA_COMPUTE_53}) +# CMake 3.2 Adds CUDA_cusolver_LIBRARY variable to FindCUDA +# Older version, use FIND_LIBRARY +IF(CMAKE_VERSION VERSION_LESS 3.2) + IF(${CUDA_cusolver_LIBRARY} MATCHES " ") + UNSET(CUDA_cusolver_LIBRARY CACHE) # When going from higher version to lower version + ENDIF() + FIND_LIBRARY ( + CUDA_cusolver_LIBRARY + NAMES "cusolver" + PATHS ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES "lib64" "lib/x64" "lib" + DOC "CUDA cusolver Library" + NO_DEFAULT_PATH + ) +ENDIF(CMAKE_VERSION VERSION_LESS 3.2) + +IF(${CUDA_VERSION_MAJOR} LESS 7 AND CUDA_cusolver_LIBRARY) + UNSET(CUDA_cusolver_LIBRARY CACHE) # Failsafe when going from higher version to lower version +ENDIF() + +IF(CUDA_cusolver_LIBRARY) + MESSAGE(STATUS "CUDA cusolver library available in CUDA Version ${CUDA_VERSION_STRING}") + ADD_DEFINITIONS(-DWITH_CUDA_LINEAR_ALGEBRA) 
+ELSE(CUDA_cusolver_LIBRARY) # Use CPU Lapack as fallback? OPTION(CUDA_LAPACK_CPU_FALLBACK "Use CPU LAPACK as fallback for CUDA LAPACK when cusolver is not available" OFF) MARK_AS_ADVANCED(CUDA_LAPACK_CPU_FALLBACK) @@ -96,24 +119,8 @@ IF(${CUDA_VERSION_MAJOR} LESS 7 OR ${CUDA_COMPUTE_53}) ELSE() MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cusolver library. Linear Algebra will not be available.") ENDIF() - IF(CMAKE_VERSION VERSION_LESS 3.2) - SET(CUDA_cusolver_LIBRARY) - MARK_AS_ADVANCED(CUDA_cusolver_LIBRARY) - ENDIF(CMAKE_VERSION VERSION_LESS 3.2) -ELSE(${CUDA_VERSION_MAJOR} LESS 7 OR ${CUDA_COMPUTE_53}) - MESSAGE(STATUS "CUDA cusolver library available in CUDA Version ${CUDA_VERSION_STRING}") - ADD_DEFINITIONS(-DWITH_CUDA_LINEAR_ALGEBRA) - IF(CMAKE_VERSION VERSION_LESS 3.2) - FIND_LIBRARY( - CUDA_cusolver_LIBRARY - NAMES "cusolver" - PATHS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES "lib64" "lib/x64" "lib" - DOC "CUDA cusolver Library" - NO_DEFAULT_PATH - ) - ENDIF(CMAKE_VERSION VERSION_LESS 3.2) -ENDIF(${CUDA_VERSION_MAJOR} LESS 7 OR ${CUDA_COMPUTE_53}) + UNSET(CUDA_cusolver_LIBRARY CACHE) # Failsafe when going from higher version to lower version +ENDIF(CUDA_cusolver_LIBRARY) INCLUDE_DIRECTORIES( ${CMAKE_INCLUDE_PATH} @@ -310,7 +317,6 @@ ADD_DEPENDENCIES(afcuda ${ptx_targets}) TARGET_LINK_LIBRARIES(afcuda PRIVATE ${CUDA_CUBLAS_LIBRARIES} PRIVATE ${CUDA_LIBRARIES} - PRIVATE ${CUDA_cusolver_LIBRARY} PRIVATE ${FreeImage_LIBS} PRIVATE ${CUDA_CUFFT_LIBRARIES} PRIVATE ${CUDA_NVVM_LIBRARIES} @@ -320,8 +326,10 @@ IF(FORGE_FOUND) TARGET_LINK_LIBRARIES(afcuda PRIVATE ${FORGE_LIBRARIES}) ENDIF() -IF(CUDA_LAPACK_CPU_FALLBACK) - TARGET_LINK_LIBRARIES(afcuda PRIVATE ${LAPACK_LIBRARIES}) +IF(CUDA_cusolver_LIBRARY) + TARGET_LINK_LIBRARIES(afcuda PRIVATE ${CUDA_cusolver_LIBRARY}) +ELSEIF(CUDA_LAPACK_CPU_FALLBACK) + TARGET_LINK_LIBRARIES(afcuda PRIVATE ${LAPACK_LIBRARIES}) ENDIF() SET_TARGET_PROPERTIES(afcuda PROPERTIES From 
3941550c448246601d21681202614067401ddad5 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 14 Jan 2016 13:06:41 -0500 Subject: [PATCH 191/288] Move asserts inside try/catch in indexer functions in util --- src/api/c/util.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/api/c/util.cpp b/src/api/c/util.cpp index cc9a07ac4f..9b16fe98df 100644 --- a/src/api/c/util.cpp +++ b/src/api/c/util.cpp @@ -30,45 +30,45 @@ af_err af_create_indexers(af_index_t** indexers) af_err af_set_array_indexer(af_index_t* indexer, const af_array idx, const dim_t dim) { - ARG_ASSERT(0, (indexer!=NULL)); - ARG_ASSERT(1, (idx!=NULL)); - ARG_ASSERT(2, (dim>=0 && dim<=3)); try { + ARG_ASSERT(0, (indexer!=NULL)); + ARG_ASSERT(1, (idx!=NULL)); + ARG_ASSERT(2, (dim>=0 && dim<=3)); indexer[dim].idx.arr = idx; indexer[dim].isBatch = false; indexer[dim].isSeq = false; } CATCHALL - return AF_SUCCESS; + return AF_SUCCESS; } af_err af_set_seq_indexer(af_index_t* indexer, const af_seq* idx, const dim_t dim, const bool is_batch) { - ARG_ASSERT(0, (indexer!=NULL)); - ARG_ASSERT(1, (idx!=NULL)); - ARG_ASSERT(2, (dim>=0 && dim<=3)); try { + ARG_ASSERT(0, (indexer!=NULL)); + ARG_ASSERT(1, (idx!=NULL)); + ARG_ASSERT(2, (dim>=0 && dim<=3)); indexer[dim].idx.seq = *idx; indexer[dim].isBatch = is_batch; indexer[dim].isSeq = true; } CATCHALL - return AF_SUCCESS; + return AF_SUCCESS; } af_err af_set_seq_param_indexer(af_index_t* indexer, const double begin, const double end, const double step, const dim_t dim, const bool is_batch) { - ARG_ASSERT(0, (indexer!=NULL)); - ARG_ASSERT(4, (dim>=0 && dim<=3)); try { + ARG_ASSERT(0, (indexer!=NULL)); + ARG_ASSERT(4, (dim>=0 && dim<=3)); indexer[dim].idx.seq = af_make_seq(begin, end, step); indexer[dim].isBatch = is_batch; indexer[dim].isSeq = true; } CATCHALL - return AF_SUCCESS; + return AF_SUCCESS; } af_err af_release_indexers(af_index_t* indexers) From 735b66b5916a1b4ae79e179de2f06924a397c5bf Mon Sep 17 00:00:00 
2001 From: Shehzan Mohammed Date: Thu, 14 Jan 2016 17:20:26 -0500 Subject: [PATCH 192/288] Fix OpenCL-CPU offload when OpenCL is built without lapack --- src/backend/opencl/blas.cpp | 4 ++++ src/backend/opencl/cpu/cpu_blas.cpp | 2 ++ src/backend/opencl/cpu/cpu_cholesky.cpp | 2 ++ src/backend/opencl/cpu/cpu_helper.hpp | 29 +++++++++++++++++-------- src/backend/opencl/cpu/cpu_inverse.cpp | 2 ++ src/backend/opencl/cpu/cpu_lu.cpp | 2 ++ src/backend/opencl/cpu/cpu_qr.cpp | 2 ++ src/backend/opencl/cpu/cpu_solve.cpp | 2 ++ src/backend/opencl/cpu/cpu_svd.cpp | 2 ++ src/backend/opencl/cpu/cpu_triangle.hpp | 2 ++ 10 files changed, 40 insertions(+), 9 deletions(-) diff --git a/src/backend/opencl/blas.cpp b/src/backend/opencl/blas.cpp index 97a5c1ab70..365e6e5680 100644 --- a/src/backend/opencl/blas.cpp +++ b/src/backend/opencl/blas.cpp @@ -23,7 +23,9 @@ #include #include +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) #include +#endif namespace opencl { @@ -118,9 +120,11 @@ template Array matmul(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) if(OpenCLCPUOffload()) { return cpu::matmul(lhs, rhs, optLhs, optRhs); } +#endif initBlas(); clblasTranspose lOpts = toClblasTranspose(optLhs); diff --git a/src/backend/opencl/cpu/cpu_blas.cpp b/src/backend/opencl/cpu/cpu_blas.cpp index 1ff7e145d6..fe6fe9959a 100644 --- a/src/backend/opencl/cpu/cpu_blas.cpp +++ b/src/backend/opencl/cpu/cpu_blas.cpp @@ -7,6 +7,7 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) #include #include #include @@ -206,3 +207,4 @@ INSTANTIATE_BLAS(cdouble) } } +#endif diff --git a/src/backend/opencl/cpu/cpu_cholesky.cpp b/src/backend/opencl/cpu/cpu_cholesky.cpp index bd871d7518..9acbcc4fad 100644 --- a/src/backend/opencl/cpu/cpu_cholesky.cpp +++ b/src/backend/opencl/cpu/cpu_cholesky.cpp @@ -7,6 +7,7 @@ * http://arrayfire.com/licenses/BSD-3-Clause 
********************************************************/ +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) #include #include #include @@ -80,3 +81,4 @@ INSTANTIATE_CH(cdouble) } } +#endif diff --git a/src/backend/opencl/cpu/cpu_helper.hpp b/src/backend/opencl/cpu/cpu_helper.hpp index d407bb83cc..cbdc470e19 100644 --- a/src/backend/opencl/cpu/cpu_helper.hpp +++ b/src/backend/opencl/cpu/cpu_helper.hpp @@ -17,6 +17,11 @@ #include #include +//********************************************************/ +// LAPACK +//********************************************************/ +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) + #define lapack_complex_float opencl::cfloat #define lapack_complex_double opencl::cdouble #define LAPACK_PREFIX LAPACKE_ @@ -31,13 +36,26 @@ #define AF_LAPACK_COL_MAJOR 0 #else #ifdef USE_MKL - #include #include + #else + #include + #endif +#endif //OS + +#endif // WITH_OPENCL_LINEAR_ALGEBRA + +//********************************************************/ +// BLAS +//********************************************************/ +#ifdef __APPLE__ + #include +#else + #ifdef USE_MKL + #include #else extern "C" { #include } - #include #endif #endif @@ -53,11 +71,4 @@ typedef int blasint; #endif -namespace opencl -{ -namespace cpu -{ -} -} - #endif diff --git a/src/backend/opencl/cpu/cpu_inverse.cpp b/src/backend/opencl/cpu/cpu_inverse.cpp index fee171929a..4f73a80707 100644 --- a/src/backend/opencl/cpu/cpu_inverse.cpp +++ b/src/backend/opencl/cpu/cpu_inverse.cpp @@ -7,6 +7,7 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) #include #include #include @@ -72,3 +73,4 @@ INSTANTIATE(cdouble) } } +#endif diff --git a/src/backend/opencl/cpu/cpu_lu.cpp b/src/backend/opencl/cpu/cpu_lu.cpp index 3eb574e743..e0234fb7de 100644 --- a/src/backend/opencl/cpu/cpu_lu.cpp +++ b/src/backend/opencl/cpu/cpu_lu.cpp @@ -7,6 +7,7 @@ * http://arrayfire.com/licenses/BSD-3-Clause 
********************************************************/ +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) #include #include #include @@ -174,3 +175,4 @@ INSTANTIATE_LU(cdouble) } } +#endif diff --git a/src/backend/opencl/cpu/cpu_qr.cpp b/src/backend/opencl/cpu/cpu_qr.cpp index 32eca92963..737a7aec2f 100644 --- a/src/backend/opencl/cpu/cpu_qr.cpp +++ b/src/backend/opencl/cpu/cpu_qr.cpp @@ -7,6 +7,7 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) #include #include #include @@ -114,3 +115,4 @@ INSTANTIATE_QR(cdouble) } } +#endif diff --git a/src/backend/opencl/cpu/cpu_solve.cpp b/src/backend/opencl/cpu/cpu_solve.cpp index 9e4f0932ac..1bb72f8768 100644 --- a/src/backend/opencl/cpu/cpu_solve.cpp +++ b/src/backend/opencl/cpu/cpu_solve.cpp @@ -7,6 +7,7 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) #include #include #include @@ -172,3 +173,4 @@ INSTANTIATE_SOLVE(cdouble) } } +#endif diff --git a/src/backend/opencl/cpu/cpu_svd.cpp b/src/backend/opencl/cpu/cpu_svd.cpp index c53df8ae78..3608bf69ce 100644 --- a/src/backend/opencl/cpu/cpu_svd.cpp +++ b/src/backend/opencl/cpu/cpu_svd.cpp @@ -7,6 +7,7 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) #include #include #include @@ -108,3 +109,4 @@ namespace cpu INSTANTIATE_SVD(cdouble, double) } } +#endif diff --git a/src/backend/opencl/cpu/cpu_triangle.hpp b/src/backend/opencl/cpu/cpu_triangle.hpp index f953d58507..e705420582 100644 --- a/src/backend/opencl/cpu/cpu_triangle.hpp +++ b/src/backend/opencl/cpu/cpu_triangle.hpp @@ -7,6 +7,7 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#if defined(WITH_OPENCL_LINEAR_ALGEBRA) #ifndef CPU_LAPACK_TRIANGLE #define CPU_LAPACK_TRIANGLE @@ -53,3 
+54,4 @@ void triangle(T *o, const T *i, const dim4 odm, const dim4 ost, const dim4 ist) } #endif +#endif From 323bf75b44731c11eeee3c7ba866e6803bfab9b4 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 15 Jan 2016 23:41:34 -0500 Subject: [PATCH 193/288] Added tranform coordinates functionality --- include/af/image.h | 25 +++++++ src/api/c/transform_coordinates.cpp | 96 +++++++++++++++++++++++++++ src/api/cpp/transform_coordinates.cpp | 24 +++++++ 3 files changed, 145 insertions(+) create mode 100644 src/api/c/transform_coordinates.cpp create mode 100644 src/api/cpp/transform_coordinates.cpp diff --git a/include/af/image.h b/include/af/image.h index ad56cfc081..d25f64f058 100644 --- a/include/af/image.h +++ b/include/af/image.h @@ -223,6 +223,18 @@ AFAPI array rotate(const array& in, const float theta, const bool crop=true, con */ AFAPI array transform(const array& in, const array& transform, const dim_t odim0 = 0, const dim_t odim1 = 0, const interpType method=AF_INTERP_NEAREST, const bool inverse=true); +/** + C++ Interface for transforming coordinates + + \param[in] tf is transformation matrix + \param[in] d0 is the first input dimension + \param[in] d1 is the second input dimension + \return the transformed coordinates + + \ingroup transform_func_coordinates +*/ +AFAPI array transformCoordinates(const array& tf, const float d0, const float d1); + /** C++ Interface for translating an image @@ -853,6 +865,19 @@ extern "C" { const dim_t odim0, const dim_t odim1, const af_interp_type method, const bool inverse); + /** + C Interface for transforming coordinates + using a perspective transform matrix + + \param[out] out the transformed coordinates + \param[in] tf is transformation matrix + \param[in] d0 is the first input dimension + \param[in] d1 is the second input dimension + + \ingroup transform_func_coordinates + */ + AFAPI af_err af_transform_coordinates(af_array *out, const af_array tf, const float d0, const float d1); + /** C Interface for
rotating an image diff --git a/src/api/c/transform_coordinates.cpp b/src/api/c/transform_coordinates.cpp new file mode 100644 index 0000000000..79b448db5d --- /dev/null +++ b/src/api/c/transform_coordinates.cpp @@ -0,0 +1,96 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using af::dim4; +using namespace detail; + +template +static af_array transform_coordinates(const af_array& tf, const float d0, const float d1) +{ + dim_t in_dims[2] = { 4, 3 }; + T h_in[4*3] = { (T)0, (T)0, (T)d1, (T)d1, + (T)0, (T)d0, (T)d0, (T)0, + (T)1, (T)1, (T)1, (T)1 }; + + af_array in = 0; + af_array w = 0; + af_array tmp = 0; + af_array xt = 0; + af_array yt = 0; + af_array t = 0; + + AF_CHECK(af_create_array(&in, h_in, 2, in_dims, (af_dtype) af::dtype_traits::af_type)); + + af_array tfIdx = 0; + af_index_t tfIndexs[2]; + tfIndexs[0].isSeq = true; + tfIndexs[1].isSeq = true; + tfIndexs[0].idx.seq = af_make_seq(0, 2, 1); + tfIndexs[1].idx.seq = af_make_seq(2, 2, 1); + AF_CHECK(af_index_gen(&tfIdx, tf, 2, tfIndexs)); + + AF_CHECK(af_matmul(&tmp, in, tfIdx, AF_MAT_NONE, AF_MAT_NONE)); + T h_w[4] = { 1, 1, 1, 1 }; + dim_t w_dims = 4; + AF_CHECK(af_create_array(&w, h_w, 1, &w_dims, (af_dtype) af::dtype_traits::af_type)); + AF_CHECK(af_div(&w, w, tmp, false)); + + tfIndexs[1].idx.seq = af_make_seq(0, 0, 1); + AF_CHECK(af_index_gen(&tfIdx, tf, 2, tfIndexs)); + AF_CHECK(af_matmul(&tmp, in, tfIdx, AF_MAT_NONE, AF_MAT_NONE)); + AF_CHECK(af_mul(&xt, tmp, w, false)); + + tfIndexs[1].idx.seq = af_make_seq(1, 1, 1); + AF_CHECK(af_index_gen(&tfIdx, tf, 2, tfIndexs)); + 
AF_CHECK(af_matmul(&tmp, in, tfIdx, AF_MAT_NONE, AF_MAT_NONE)); + AF_CHECK(af_mul(&yt, tmp, w, false)); + + AF_CHECK(af_join(&t, 1, xt, yt)); + + AF_CHECK(af_release_array(w)); + AF_CHECK(af_release_array(tmp)); + AF_CHECK(af_release_array(xt)); + AF_CHECK(af_release_array(yt)); + + return t; +} + +af_err af_transform_coordinates(af_array *out, const af_array tf, const float d0, const float d1) +{ + try { + ArrayInfo tfInfo = getInfo(tf); + dim4 tfDims = tfInfo.dims(); + ARG_ASSERT(1, (tfDims[0]==3 && tfDims[1]==3 && tfDims.ndims()==2)); + + af_array output; + af_dtype type = tfInfo.getType(); + switch(type) { + case f32: output = transform_coordinates(tf, d0, d1); break; + case f64: output = transform_coordinates(tf, d0, d1); break; + default : TYPE_ERROR(1, type); + } + std::swap(*out, output); + } + CATCHALL; + + return AF_SUCCESS; +} diff --git a/src/api/cpp/transform_coordinates.cpp b/src/api/cpp/transform_coordinates.cpp new file mode 100644 index 0000000000..4d896e7194 --- /dev/null +++ b/src/api/cpp/transform_coordinates.cpp @@ -0,0 +1,24 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include "error.hpp" + +namespace af +{ + +array transformCoordinates(const array& tf, const float d0, const float d1) +{ + af_array out = 0; + AF_THROW(af_transform_coordinates(&out, tf.get(), d0, d1)); + return array(out); +} + +} From ba483f19ee5500c1ce0a3f820989cdb49322084c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 15 Jan 2016 23:42:00 -0500 Subject: [PATCH 194/288] Added transform coordinates to unified backend --- src/api/unified/image.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/api/unified/image.cpp b/src/api/unified/image.cpp index 7b1159516c..0ee211d585 100644 --- a/src/api/unified/image.cpp +++ b/src/api/unified/image.cpp @@ -74,6 +74,13 @@ af_err af_transform(af_array *out, const af_array in, const af_array transform, return CALL(out, in, transform, odim0, odim1, method, inverse); } +af_err af_transform_coordinates(af_array *out, const af_array tf, + const float d0, const float d1) +{ + CHECK_ARRAYS(tf); + return CALL(out, tf, d0, d1); +} + af_err af_rotate(af_array *out, const af_array in, const float theta, const bool crop, const af_interp_type method) { From 3522f80c5d5d861788d298c75faa70366d65b89f Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 15 Jan 2016 23:42:32 -0500 Subject: [PATCH 195/288] Added transform coordinates documentation --- docs/details/image.dox | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/details/image.dox b/docs/details/image.dox index 288e4f6b0f..ef6d12a4f0 100644 --- a/docs/details/image.dox +++ b/docs/details/image.dox @@ -700,6 +700,18 @@ AF_INTERP_LOWER are allowed. Affine transforms can be used for various purposes. \ref af::translate, \ref af::scale and \ref af::skew are specializations of the transform function. 
+ +\defgroup transform_func_coordinates transformcoordinates +\ingroup transform_mat + +Transform input coordinates + +This function uses a perspective transform matrix to transform the four corner +coordinates of a rectangle (given by its two dimensions) into a coordinates matrix. + +The output is a 4x2 matrix holding the coordinates of the 4 transformed +two-dimensional points. + ======================================================================= \defgroup image_func_sat SAT From 7f3e2159537da233adcd3f66492e7e0cf11814fe Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 15 Jan 2016 23:42:52 -0500 Subject: [PATCH 196/288] Added transform coordinates unit tests --- test/transform_coordinates.cpp | 118 +++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 test/transform_coordinates.cpp diff --git a/test/transform_coordinates.cpp b/test/transform_coordinates.cpp new file mode 100644 index 0000000000..7f1ac4e893 --- /dev/null +++ b/test/transform_coordinates.cpp @@ -0,0 +1,118 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +using std::vector; +using std::string; +using std::cout; +using std::endl; + +template +class TransformCoordinates : public ::testing::Test +{ + public: + virtual void SetUp() {} +}; + +typedef ::testing::Types TestTypes; + +TYPED_TEST_CASE(TransformCoordinates, TestTypes); + +template +void transformCoordinatesTest(string pTestFile) +{ + if (noDoubleTests()) return; + + vector inDims; + vector > in; + vector > gold; + + readTests(pTestFile, inDims, in, gold); + + af_array tfArray = 0; + af_array outArray = 0; + ASSERT_EQ(AF_SUCCESS, af_create_array(&tfArray, &(in[0].front()), inDims[0].ndims(), inDims[0].get(), (af_dtype)af::dtype_traits::af_type)); + + size_t nTests = in.size(); + + for (int test = 1; test < nTests; test++) { + dim_t d0 = (dim_t)in[test][0]; + dim_t d1 = (dim_t)in[test][1]; + + ASSERT_EQ(AF_SUCCESS, af_transform_coordinates(&outArray, tfArray, d0, d1)); + + // Get result + dim_t outEl = 0; + ASSERT_EQ(AF_SUCCESS, af_get_elements(&outEl, outArray)); + T* outData = new T[outEl]; + ASSERT_EQ(AF_SUCCESS, af_get_data_ptr((void*)outData, outArray)); + + const float thr = 1.f; + + for (size_t elIter = 0; elIter < outEl; elIter++) { + ASSERT_LE(fabs(outData[elIter] - gold[test-1][elIter]), thr) << "at: " << elIter << std::endl; + } + + delete[] outData; + } + + if(tfArray != 0) af_release_array(tfArray); + if(outArray != 0) af_release_array(outArray); +} + +TYPED_TEST(TransformCoordinates, RotateMatrix) +{ + transformCoordinatesTest(string(TEST_DIR"/transformCoordinates/rotate_matrix.test")); +} + +TYPED_TEST(TransformCoordinates, 3DMatrix) +{ + transformCoordinatesTest(string(TEST_DIR"/transformCoordinates/3d_matrix.test")); +} + +///////////////////////////////////// CPP //////////////////////////////// +// 
+TEST(TransformCoordinates, CPP) +{ + vector inDims; + vector > in; + vector > gold; + + readTests(TEST_DIR"/transformCoordinates/3d_matrix.test",inDims,in,gold); + + af::array tf = af::array(inDims[0][0], inDims[0][1], &(in[0].front())); + + float d0 = in[1][0]; + float d1 = in[1][1]; + + af::array out = af::transformCoordinates(tf, d0, d1); + + af::dim4 outDims = out.dims(); + + float* h_out = new float[outDims[0] * outDims[1]]; + out.host(h_out); + + const size_t n = gold[0].size(); + + const float thr = 1.f; + + for (size_t elIter = 0; elIter < n; elIter++) { + ASSERT_LE(fabs(h_out[elIter] - gold[0][elIter]), thr) << "at: " << elIter << std::endl; + } + + delete[] h_out; +} From 230c603e1a988bda7665abaabc911ec5d4241e67 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 15 Jan 2016 23:45:21 -0500 Subject: [PATCH 197/288] Updated test data --- test/data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data b/test/data index d134732012..414f02d905 160000 --- a/test/data +++ b/test/data @@ -1 +1 @@ -Subproject commit d1347320125a0315a4ef03e63630b5b3249d189d +Subproject commit 414f02d90588ec2cde177202bd340c57be6e7d9a From 3389940d894d7425bdc30561901d06042dbf2606 Mon Sep 17 00:00:00 2001 From: Umar Arshad Date: Mon, 18 Jan 2016 13:59:30 -0500 Subject: [PATCH 198/288] Fix resize unit test. 
--- src/backend/cpu/kernel/meanshift.hpp | 9 +++------ test/resize.cpp | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/backend/cpu/kernel/meanshift.hpp b/src/backend/cpu/kernel/meanshift.hpp index f173b2fd8b..54fb1a89bf 100644 --- a/src/backend/cpu/kernel/meanshift.hpp +++ b/src/backend/cpu/kernel/meanshift.hpp @@ -34,12 +34,9 @@ void meanShift(Array out, const Array in, const float s_sigma, const dim_t radius = std::max((int)(space_ * 1.5f), 1); const float cvar = c_sigma*c_sigma; - std::vector means; - std::vector centers; - std::vector tmpclrs; - means.reserve(channels); - centers.reserve(channels); - tmpclrs.reserve(channels); + std::vector means(channels); + std::vector centers(channels); + std::vector tmpclrs(channels); T *outData = out.get(); const T * inData = in.get(); diff --git a/test/resize.cpp b/test/resize.cpp index e0f1ea0810..6c29e61cc6 100644 --- a/test/resize.cpp +++ b/test/resize.cpp @@ -65,7 +65,7 @@ TYPED_TEST(Resize, InvalidDims) { if (noDoubleTests()) return; - vector in(8,8); + vector in(8*8); af_array inArray = 0; af_array outArray = 0; From 6a34bee575de572010abe80ddf999c82da6b91b8 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 18 Jan 2016 14:20:59 -0500 Subject: [PATCH 199/288] Compile fixes for gcc 5.3 --- src/backend/opencl/cpu/cpu_blas.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/backend/opencl/cpu/cpu_blas.cpp b/src/backend/opencl/cpu/cpu_blas.cpp index 1ff7e145d6..029421b374 100644 --- a/src/backend/opencl/cpu/cpu_blas.cpp +++ b/src/backend/opencl/cpu/cpu_blas.cpp @@ -167,9 +167,9 @@ Array matmul(const Array &lhs, const Array &rhs, using BT = typename blas_base::type; // get host pointers from mapped memory - std::shared_ptr lPtr = lhs.getMappedPtr(); - std::shared_ptr rPtr = rhs.getMappedPtr(); - std::shared_ptr oPtr = out.getMappedPtr(); + auto lPtr = lhs.getMappedPtr(); + auto rPtr = rhs.getMappedPtr(); + auto oPtr = out.getMappedPtr(); 
if(rDims[bColDim] == 1) { N = lDims[aColDim]; @@ -177,19 +177,19 @@ Array matmul(const Array &lhs, const Array &rhs, CblasColMajor, lOpts, lDims[0], lDims[1], alpha, - lPtr.get(), lStrides[1], - rPtr.get(), rStrides[0], + (BT*)lPtr.get(), lStrides[1], + (BT*)rPtr.get(), rStrides[0], beta, - oPtr.get(), 1); + (BT*)oPtr.get(), 1); } else { gemm_func()( CblasColMajor, lOpts, rOpts, M, N, K, alpha, - lPtr.get(), lStrides[1], - rPtr.get(), rStrides[1], + (BT*)lPtr.get(), lStrides[1], + (BT*)rPtr.get(), rStrides[1], beta, - oPtr.get(), out.dims()[0]); + (BT*)oPtr.get(), out.dims()[0]); } return out; From 46042feed38a8871ddf7bd7fb0cf3747d70b0c35 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 18 Jan 2016 15:39:33 -0500 Subject: [PATCH 200/288] Fixing compiler warnings --- src/backend/opencl/platform.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index ef9f8f63be..d7c3e1cfc0 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -181,14 +181,14 @@ static inline bool compare_default(const Device *ldev, const Device *rdev) auto lversion = ldev->getInfo(); auto rversion = rdev->getInfo(); - auto lres = (lversion[7] > rversion[7]) || + bool lres = (lversion[7] > rversion[7]) || ((lversion[7] == rversion[7]) && (lversion[9] > rversion[9])); - auto rres = (lversion[7] < rversion[7]) || + bool rres = (lversion[7] < rversion[7]) || ((lversion[7] == rversion[7]) && (lversion[9] < rversion[9])); - if (lres > 0) return true; - if (rres < 0) return false; + if (lres) return true; + if (rres) return false; } // Default crietria, sort based on memory From 5fba37c972d1a1bf92bc8bf5870a6442a1a6db32 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 18 Jan 2016 17:36:14 -0500 Subject: [PATCH 201/288] Functions to get opencl device type and platforms - Also use this mechanism for checking for particular device type / platform --- 
include/af/opencl.h | 77 ++++++++++++++++++++++++++++-- src/backend/opencl/magma/getrs.cpp | 4 +- src/backend/opencl/platform.cpp | 44 +++++++++++++++++ src/backend/opencl/platform.hpp | 8 ++++ src/backend/opencl/solve.cpp | 8 ++-- test/ocl_ext_context.cpp | 19 ++++++++ 6 files changed, 150 insertions(+), 10 deletions(-) diff --git a/include/af/opencl.h b/include/af/opencl.h index 88e47d2b16..16b85d763f 100644 --- a/include/af/opencl.h +++ b/include/af/opencl.h @@ -7,6 +7,7 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma once #if defined(__APPLE__) || defined(__MACOSX) #include #else @@ -19,6 +20,29 @@ extern "C" { #endif +#if AF_API_VERSION >= 33 +typedef enum +{ + AFCL_DEVICE_TYPE_CPU = CL_DEVICE_TYPE_CPU, + AFCL_DEVICE_TYPE_GPU = CL_DEVICE_TYPE_GPU, + AFCL_DEVICE_TYPE_ACC = CL_DEVICE_TYPE_ACCELERATOR, + AFCL_DEVICE_TYPE_UNKNOWN = -1 +} afcl_device_type; +#endif + +#if AF_API_VERSION >= 33 +typedef enum +{ + AFCL_PLATFORM_AMD = 0, + AFCL_PLATFORM_APPLE = 1, + AFCL_PLATFORM_INTEL = 2, + AFCL_PLATFORM_NVIDIA = 3, + AFCL_PLATFORM_BEIGNET = 4, + AFCL_PLATFORM_POCL = 5, + AFCL_PLATFORM_UNKNOWN = -1 +} afcl_platform; +#endif + /** \ingroup opencl_mat @{ @@ -110,6 +134,20 @@ AFAPI af_err afcl_set_device_context(cl_device_id dev, cl_context ctx); AFAPI af_err afcl_delete_device_context(cl_device_id dev, cl_context ctx); #endif +#if AF_API_VERSION >= 33 +/** + Get the type of the current device +*/ +AFAPI af_err afcl_get_device_type(afcl_device_type *res); +#endif + +#if AF_API_VERSION >= 33 +/** + Get the platform of the current device +*/ +AFAPI af_err afcl_get_platform(afcl_platform *res); +#endif + /** @} */ @@ -253,6 +291,38 @@ static inline void deleteDevice(cl_device_id dev, cl_context ctx) } #endif + +#if AF_API_VERSION >= 33 + typedef afcl_device_type deviceType; + typedef afcl_platform platform; +#endif + +#if AF_API_VERSION >= 33 +/** + Get the type of the current device +*/ +static inline 
deviceType getDeviceType() +{ + afcl_device_type res = AFCL_DEVICE_TYPE_UNKNOWN; + af_err err = afcl_get_device_type(&res); + if (err!=AF_SUCCESS) throw af::exception("Failed to get OpenCL device type"); + return res; +} +#endif + +#if AF_API_VERSION >= 33 +/** + Get the platform of the current device +*/ +static inline platform getPlatform() +{ + afcl_platform res = AFCL_PLATFORM_UNKNOWN; + af_err err = afcl_get_platform(&res); + if (err!=AF_SUCCESS) throw af::exception("Failed to get OpenCL platform"); + return res; +} +#endif + /** Create an af::array object from an OpenCL cl_mem buffer @@ -369,15 +439,15 @@ static inline void deleteDevice(cl_device_id dev, cl_context ctx) return afcl::array(af::dim4(dim0, dim1, dim2, dim3), buf, type, retain); } - /** +/** @} - */ - +*/ } namespace af { +#if !defined(AF_OPENCL) template<> AFAPI cl_mem *array::device() const { cl_mem *mem = new cl_mem; @@ -385,6 +455,7 @@ template<> AFAPI cl_mem *array::device() const if (err != AF_SUCCESS) throw af::exception("Failed to get cl_mem from array object"); return mem; } +#endif } diff --git a/src/backend/opencl/magma/getrs.cpp b/src/backend/opencl/magma/getrs.cpp index 1dc106c0c5..eb28a5175a 100644 --- a/src/backend/opencl/magma/getrs.cpp +++ b/src/backend/opencl/magma/getrs.cpp @@ -61,6 +61,7 @@ #include #include #include +#include template magma_int_t magma_getrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs, @@ -168,8 +169,7 @@ magma_getrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs, clblasTranspose cltrans =(trans == MagmaNoTrans) ? clblasNoTrans : (trans == MagmaTrans ?
clblasTrans : clblasConjTrans); - std::string pName = opencl::getPlatformName(opencl::getDevice()); - bool cond = pName.find("NVIDIA") != std::string::npos; + bool cond = opencl::getActivePlatform() == AFCL_PLATFORM_NVIDIA; cl_mem dAT = 0; if (nrhs > 1 && cond) { magma_malloc(&dAT, n * n); diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index d7c3e1cfc0..884dca14d1 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -198,6 +198,25 @@ static inline bool compare_default(const Device *ldev, const Device *rdev) return l_mem >= r_mem; } +static afcl::deviceType getDeviceTypeEnum(cl::Device dev) +{ + return (afcl::deviceType)dev.getInfo(); +} + + +static afcl::platform getPlatformEnum(cl::Device dev) +{ + std::string pname = getPlatformName(dev); + if (verify_present(pname, "AMD")) return AFCL_PLATFORM_AMD; + if (verify_present(pname, "NVIDIA")) return AFCL_PLATFORM_NVIDIA; + if (verify_present(pname, "INTEL")) return AFCL_PLATFORM_INTEL; + if (verify_present(pname, "APPLE")) return AFCL_PLATFORM_APPLE; + if (verify_present(pname, "BEIGNET")) return AFCL_PLATFORM_BEIGNET; + if (verify_present(pname, "POCL")) return AFCL_PLATFORM_POCL; + return AFCL_PLATFORM_UNKNOWN; +} + + DeviceManager::DeviceManager() : mUserDeviceOffset(0), mActiveCtxId(0), mActiveQId(0) { @@ -260,6 +279,8 @@ DeviceManager::DeviceManager() mContexts.push_back(ctx); mQueues.push_back(cq); mIsGLSharingOn.push_back(false); + mDeviceTypes.push_back(getDeviceTypeEnum(*mDevices[i])); + mPlatforms.push_back(getPlatformEnum(*mDevices[i])); } bool default_device_set = false; @@ -437,6 +458,17 @@ int getDeviceIdFromNativeId(cl_device_id id) return devId; } +int getActiveDeviceType() +{ + DeviceManager &instance = DeviceManager::getInstance(); + return instance.mDeviceTypes[instance.mActiveQId]; +} + +int getActivePlatform() +{ + DeviceManager &instance = DeviceManager::getInstance(); + return instance.mPlatforms[instance.mActiveQId]; +} const 
Context& getContext() { DeviceManager& devMngr = DeviceManager::getInstance(); @@ -731,6 +763,18 @@ bool synchronize_calls() { using namespace opencl; +af_err afcl_get_device_type(afcl_device_type *res) +{ + *res = (afcl_device_type)getActiveDeviceType(); + return AF_SUCCESS; +} + +af_err afcl_get_platform(afcl_platform *res) +{ + *res = (afcl_platform)getActivePlatform(); + return AF_SUCCESS; +} + af_err afcl_get_context(cl_context *ctx, const bool retain) { *ctx = getContext()(); diff --git a/src/backend/opencl/platform.hpp b/src/backend/opencl/platform.hpp index 85c533fa84..d4f9f0e5ef 100644 --- a/src/backend/opencl/platform.hpp +++ b/src/backend/opencl/platform.hpp @@ -49,6 +49,9 @@ class DeviceManager friend void removeDeviceContext(cl_device_id dev, cl_context ctx); + friend int getActiveDeviceType(); + friend int getActivePlatform(); + public: static const unsigned MAX_DEVICES = 32; @@ -77,6 +80,8 @@ class DeviceManager std::vector mContexts; std::vector mQueues; std::vector mIsGLSharingOn; + std::vector mDeviceTypes; + std::vector mPlatforms; unsigned mUserDeviceOffset; unsigned mActiveCtxId; @@ -123,4 +128,7 @@ void sync(int device); bool synchronize_calls(); +int getActiveDeviceType(); +int getActivePlatform(); + } diff --git a/src/backend/opencl/solve.cpp b/src/backend/opencl/solve.cpp index 4fede07e56..93176752b5 100644 --- a/src/backend/opencl/solve.cpp +++ b/src/backend/opencl/solve.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -226,9 +227,7 @@ Array leastSquares(const Array &a, const Array &b) (*dT)(), tmp.getOffset() + NB * MN, NB, 0, queue); - - std::string pName = getPlatformName(getDevice()); - if(pName.find("NVIDIA") != std::string::npos) + if(getActivePlatform() == AFCL_PLATFORM_NVIDIA) { Array AT = transpose(A, true); cl::Buffer* AT_buf = AT.get(); @@ -268,8 +267,7 @@ Array triangleSolve(const Array &A, const Array &b, const af_mat_prop o cl_event event = 0; cl_command_queue queue = getQueue()(); - std::string 
pName = getPlatformName(getDevice()); - if(pName.find("NVIDIA") != std::string::npos && (options & AF_MAT_UPPER)) + if(getActivePlatform() == AFCL_PLATFORM_NVIDIA && (options & AF_MAT_UPPER)) { Array AT = transpose(A, true); diff --git a/test/ocl_ext_context.cpp b/test/ocl_ext_context.cpp index 0d4f89b3fc..e711c631e4 100644 --- a/test/ocl_ext_context.cpp +++ b/test/ocl_ext_context.cpp @@ -105,6 +105,25 @@ TEST(OCLExtContext, pop) printf("%d devices after afcl::deleteDevice\n", af::getDeviceCount()); af::info(); } + +TEST(OCLCheck, DeviceType) +{ + afcl::deviceType devType = afcl::getDeviceType(); + cl_device_type type = -100; + clGetDeviceInfo(afcl::getDeviceId(), + CL_DEVICE_TYPE, + sizeof(cl_device_type), + &type, + NULL); + ASSERT_EQ(type, (cl_device_type)devType); +} + +TEST(OCLCheck, DevicePlatform) +{ + afcl::platform platform = afcl::getPlatform(); + ASSERT_NE(platform, AFCL_PLATFORM_UNKNOWN); +} + #else TEST(OCLExtContext, NoopCPU) { From 34c8c97c8f2a6c8424433ddd89bc5b23c646985e Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 18 Jan 2016 19:15:33 -0500 Subject: [PATCH 202/288] Work around for a bug in AMD's clBuildProgram - Get stuck when the kernel is too large --- src/backend/opencl/Array.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp index 207a4b0de7..044a6322ab 100644 --- a/src/backend/opencl/Array.cpp +++ b/src/backend/opencl/Array.cpp @@ -16,6 +16,8 @@ #include #include #include +#include +#include using af::dim4; @@ -23,6 +25,7 @@ namespace opencl { const int MAX_JIT_LEN = 20; + const int MAX_JIT_LEN_AMD = 16; //FIXME: Change this when bug is fixed using JIT::BufferNode; using JIT::Node; using JIT::Node_ptr; @@ -153,6 +156,14 @@ namespace opencl using af::dim4; + inline bool is_max_jit_len(const unsigned &len) + { + if (getActivePlatform() == AFCL_PLATFORM_AMD) { + return len >= MAX_JIT_LEN_AMD; + } + return len >= MAX_JIT_LEN; + } + template 
Array createNodeArray(const dim4 &dims, Node_ptr node) { @@ -166,7 +177,7 @@ namespace opencl n->getInfo(length, buf_count, bytes); n->resetFlags(); - if (length > MAX_JIT_LEN || + if (is_max_jit_len(length) || buf_count >= MAX_BUFFERS || bytes >= MAX_BYTES) { out.eval(); From ffc6e7f251cb37889b5fecfc06a385db5d24b4b0 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 18 Jan 2016 22:41:21 -0500 Subject: [PATCH 203/288] Putting transform coordinates within version guards --- include/af/image.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/af/image.h b/include/af/image.h index d25f64f058..0e0c0ba901 100644 --- a/include/af/image.h +++ b/include/af/image.h @@ -223,6 +223,7 @@ AFAPI array rotate(const array& in, const float theta, const bool crop=true, con */ AFAPI array transform(const array& in, const array& transform, const dim_t odim0 = 0, const dim_t odim1 = 0, const interpType method=AF_INTERP_NEAREST, const bool inverse=true); +#if AF_API_VERSION >= 33 /** C++ Interface for transforming coordinates @@ -234,6 +235,7 @@ AFAPI array transform(const array& in, const array& transform, const dim_t odim0 \ingroup transform_func_coordinates */ AFAPI array transformCoordinates(const array& tf, const float d0, const float d1); +#endif /** C++ Interface for translating an image @@ -865,6 +867,7 @@ extern "C" { const dim_t odim0, const dim_t odim1, const af_interp_type method, const bool inverse); +#if AF_API_VERSION >= 33 /** C Interface for transforming an image C++ Interface for transforming coordinates @@ -877,6 +880,7 @@ extern "C" { \ingroup transform_func_coordinates */ AFAPI af_err af_transform_coordinates(af_array *out, const af_array tf, const float d0, const float d1); +#endif /** C Interface for rotating an image From e7e608023b4a4bc431b585caf33393f85015129b Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 8 Jan 2016 17:17:11 -0500 Subject: [PATCH 204/288] Update clBLAS release tag --- CMakeModules/build_clBLAS.cmake | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeModules/build_clBLAS.cmake b/CMakeModules/build_clBLAS.cmake index 6cb1ae8aaf..d486b31801 100644 --- a/CMakeModules/build_clBLAS.cmake +++ b/CMakeModules/build_clBLAS.cmake @@ -14,7 +14,7 @@ ENDIF() ExternalProject_Add( clBLAS-ext GIT_REPOSITORY https://github.com/arrayfire/clBLAS.git - GIT_TAG 102c832825e8e4d60ad73ca97e95668463294068 + GIT_TAG arrayfire-release-test PREFIX "${prefix}" INSTALL_DIR "${prefix}" UPDATE_COMMAND "" From 845d3b3d31d2173255d4c1e8df66f04bf501ab42 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 18 Jan 2016 14:59:02 -0500 Subject: [PATCH 205/288] Fixes in magma potrf (opencl cholesky) --- src/backend/opencl/magma/potrf.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/opencl/magma/potrf.cpp b/src/backend/opencl/magma/potrf.cpp index d048ed4dac..4f9984f325 100644 --- a/src/backend/opencl/magma/potrf.cpp +++ b/src/backend/opencl/magma/potrf.cpp @@ -199,7 +199,7 @@ magma_int_t magma_potrf_gpu( magma_getmatrix_async(jb, jb, dA(j,j), ldda, work, jb, queue, &event); // apply all previous updates to block row right of diagonal block - if (j+jb < n) { + if (j+jb < n && j > 0) { CLBLAS_CHECK(gpu_blas_gemm( transType, clblasNoTrans, jb, n-j-jb, j, @@ -259,7 +259,7 @@ magma_int_t magma_potrf_gpu( magma_getmatrix_async(jb, jb, dA(j,j), ldda, work, jb, queue, &event); // apply all previous updates to block column below diagonal block - if (j+jb < n) { + if (j+jb < n && j > 0) { CLBLAS_CHECK(gpu_blas_gemm( clblasNoTrans, transType, n-j-jb, jb, j, From 3ce49a5dd347892d9a9226dc9d31360e7aa2851c Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Wed, 20 Jan 2016 14:40:28 -0500 Subject: [PATCH 206/288] BUGFIX Fix how streams are created in setActiveDevice (CUDA) Ref 968ae4e80ce8e6263fdc3f4381ae8b895df44bc4 --- src/backend/cuda/platform.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/cuda/platform.cpp 
b/src/backend/cuda/platform.cpp index 6919a04158..d172903084 100644 --- a/src/backend/cuda/platform.cpp +++ b/src/backend/cuda/platform.cpp @@ -395,7 +395,11 @@ int DeviceManager::setActiveDevice(int device, int nId) int old = activeDev; if(nId == -1) nId = getDeviceNativeId(device); CUDA_CHECK(cudaSetDevice(nId)); - cudaError_t err = cudaStreamCreate(&streams[device]); + + cudaError_t err = cudaSuccess; + if(!streams[device]) + err = cudaStreamCreate(&streams[device]); + activeDev = device; if (err == cudaSuccess) return old; From 58fc4c8ea4869b62ad75627f24a8a274d51ce26e Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Wed, 20 Jan 2016 16:58:08 -0500 Subject: [PATCH 207/288] Fixes to getMappedPtr in OpenCL backend - Also changed CL_TO_AF_ERROR to display OpenCL error number --- src/backend/opencl/Array.hpp | 6 ++++-- src/backend/opencl/err_opencl.hpp | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp index 2793d5e099..4c8c05a231 100644 --- a/src/backend/opencl/Array.hpp +++ b/src/backend/opencl/Array.hpp @@ -227,8 +227,10 @@ namespace opencl try { if(ptr == nullptr) { ptr = (T*)getQueue().enqueueMapBuffer(*const_cast(get()), - true, CL_MAP_READ|CL_MAP_WRITE, - getOffset(), getDataDims().elements() * sizeof(T)); + true, CL_MAP_READ|CL_MAP_WRITE, + getOffset(), + (getDataDims().elements() - getOffset()) + * sizeof(T)); } } catch(cl::Error err) { CL_TO_AF_ERROR(err); diff --git a/src/backend/opencl/err_opencl.hpp b/src/backend/opencl/err_opencl.hpp index 15855f3b08..955275203a 100644 --- a/src/backend/opencl/err_opencl.hpp +++ b/src/backend/opencl/err_opencl.hpp @@ -23,8 +23,8 @@ char opencl_err_msg[1024]; \ snprintf(opencl_err_msg, \ sizeof(opencl_err_msg), \ - "OpenCL Error: %s when calling %s", \ - getErrorMessage(ERR.err()).c_str(), \ + "OpenCL Error (%d): %s when calling %s", \ + ERR.err(), getErrorMessage(ERR.err()).c_str(), \ ERR.what()); \ if (ERR.err() == 
CL_MEM_OBJECT_ALLOCATION_FAILURE) { \ AF_ERROR(opencl_err_msg, AF_ERR_NO_MEM); \ From d3d2996374de5cf4b59e60dc024ae40b72139d93 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Wed, 20 Jan 2016 17:51:37 -0500 Subject: [PATCH 208/288] Add getHostMemorySize and getDeviceMemorySize functions * Print memory size in CPU and OpenCL info * Change opencl::getDevice to accept id * Fix multi-line error strings --- src/api/c/err_common.cpp | 8 +-- src/backend/cpu/platform.cpp | 24 ++++++- src/backend/cpu/platform.hpp | 4 ++ src/backend/cuda/platform.cpp | 11 ++++ src/backend/cuda/platform.hpp | 4 ++ src/backend/host_memory.cpp | 113 ++++++++++++++++++++++++++++++++ src/backend/host_memory.hpp | 18 +++++ src/backend/opencl/platform.cpp | 22 ++++++- src/backend/opencl/platform.hpp | 10 ++- 9 files changed, 204 insertions(+), 10 deletions(-) create mode 100644 src/backend/host_memory.cpp create mode 100644 src/backend/host_memory.hpp diff --git a/src/api/c/err_common.cpp b/src/api/c/err_common.cpp index 382dac1af1..495967a891 100644 --- a/src/api/c/err_common.cpp +++ b/src/api/c/err_common.cpp @@ -198,13 +198,13 @@ const char *af_err_to_string(const af_err err) case AF_ERR_BATCH: return "Invalid batch configuration"; case AF_ERR_NOT_SUPPORTED: return "Function not supported"; case AF_ERR_NOT_CONFIGURED: return "Function not configured to build"; - case AF_ERR_NONFREE: return "Function unavailable." + case AF_ERR_NONFREE: return "Function unavailable. " "ArrayFire compiled without Non-Free algorithms support"; case AF_ERR_NO_DBL: return "Double precision not supported for this device"; - case AF_ERR_NO_GFX: return "Graphics functionality unavailable." + case AF_ERR_NO_GFX: return "Graphics functionality unavailable. " "ArrayFire compiled without Graphics support"; - case AF_ERR_LOAD_LIB: return "Failed to load dynamic library." - "See http://www.arrayfire.com/docs/unifiedbackend.htm" + case AF_ERR_LOAD_LIB: return "Failed to load dynamic library. 
" + "See http://www.arrayfire.com/docs/unifiedbackend.htm " "for instructions to set up environment for Unified backend"; case AF_ERR_LOAD_SYM: return "Failed to load symbol"; case AF_ERR_ARR_BKND_MISMATCH: return "There was a mismatch between an array and the current backend"; diff --git a/src/backend/cpu/platform.cpp b/src/backend/cpu/platform.cpp index 0039b208d9..49abda3c8d 100644 --- a/src/backend/cpu/platform.cpp +++ b/src/backend/cpu/platform.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -197,6 +198,15 @@ static const std::string get_system(void) #endif } +// http://stackoverflow.com/questions/216823/whats-the-best-way-to-trim-stdstring/217605#217605 +// trim from start +static inline std::string <rim(std::string &s) +{ + s.erase(s.begin(), std::find_if(s.begin(), s.end(), + std::not1(std::ptr_fun(std::isspace)))); + return s; +} + std::string getInfo() { std::ostringstream info; @@ -204,7 +214,9 @@ std::string getInfo() info << "ArrayFire v" << AF_VERSION << " (CPU, " << get_system() << ", build " << AF_REVISION << ")" << std::endl; - info << string("[0] ") << cinfo.vendor() <<": " << cinfo.model() << " "; + std::string model = cinfo.model(); + info << string("[0] ") << cinfo.vendor() <<": " << ltrim(model) + << ", " << (int)(getDeviceMemorySize(getActiveDeviceId()) / 1048576.0) << " MB, "; info << "Max threads("<< cinfo.threads()<<") "; #ifndef NDEBUG info << AF_COMPILER_STR; @@ -249,6 +261,16 @@ int getActiveDeviceId() return 0; } +size_t getDeviceMemorySize(int device) +{ + return common::getHostMemorySize(); +} + +size_t getHostMemorySize() +{ + return common::getHostMemorySize(); +} + static const int MAX_QUEUES = 1; diff --git a/src/backend/cpu/platform.hpp b/src/backend/cpu/platform.hpp index 0cd42ae068..9118ade8bd 100644 --- a/src/backend/cpu/platform.hpp +++ b/src/backend/cpu/platform.hpp @@ -28,6 +28,10 @@ namespace cpu { int getActiveDeviceId(); + size_t getDeviceMemorySize(int device); + + size_t 
getHostMemorySize(); + void sync(int device); queue& getQueue(int idx = 0); diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp index d172903084..46b730314f 100644 --- a/src/backend/cuda/platform.cpp +++ b/src/backend/cuda/platform.cpp @@ -23,6 +23,7 @@ #include #include #include +#include using namespace std; @@ -304,6 +305,16 @@ cudaStream_t getStream(int device) return str; } +size_t getDeviceMemorySize(int device) +{ + return getDeviceProp(device).totalGlobalMem; +} + +size_t getHostMemorySize() +{ + return common::getHostMemorySize(); +} + int setDevice(int device) { return DeviceManager::getInstance().setActiveDevice(device); diff --git a/src/backend/cuda/platform.hpp b/src/backend/cuda/platform.hpp index 20862fb886..9302f4160e 100644 --- a/src/backend/cuda/platform.hpp +++ b/src/backend/cuda/platform.hpp @@ -46,6 +46,10 @@ int getDeviceNativeId(int device); cudaStream_t getStream(int device); +size_t getDeviceMemorySize(int device); + +size_t getHostMemorySize(); + int setDevice(int device); void sync(int device); diff --git a/src/backend/host_memory.cpp b/src/backend/host_memory.cpp new file mode 100644 index 0000000000..9b4f1e5f54 --- /dev/null +++ b/src/backend/host_memory.cpp @@ -0,0 +1,113 @@ +/* + * Author: David Robert Nadeau + * Site: http://NadeauSoftware.com/ + * License: Creative Commons Attribution 3.0 Unported License + * http://creativecommons.org/licenses/by/3.0/deed.en_US + * Source: http://nadeausoftware.com/sites/NadeauSoftware.com/files/getMemorySize.c + */ + +#include "host_memory.hpp" + +#if defined(_WIN32) +#include + +#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__)) +#include +#include +#include + +#if defined(BSD) +#include +#endif + +#else +#define NOMEMORYSIZE +#endif + +namespace common +{ + +#ifdef NOMEMORYSIZE +size_t getHostMemorySize() +{ + return 0L; // Can't detect +} + +#else + +/** + * Returns the size of physical memory (RAM) in bytes. 
+ */ +size_t getHostMemorySize() +{ +#if defined(_WIN32) && (defined(__CYGWIN__) || defined(__CYGWIN32__)) + /* Cygwin under Windows. ------------------------------------ */ + /* New 64-bit MEMORYSTATUSEX isn't available. Use old 32-bit */ + MEMORYSTATUS status; + status.dwLength = sizeof(status); + GlobalMemoryStatus( &status ); + return (size_t)status.dwTotalPhys; + +#elif defined(_WIN32) + /* Windows. ------------------------------------------------- */ + /* Use new 64-bit MEMORYSTATUSEX, not old 32-bit MEMORYSTATUS */ + MEMORYSTATUSEX status; + status.dwLength = sizeof(status); + GlobalMemoryStatusEx( &status ); + return (size_t)status.ullTotalPhys; + +#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__)) + /* UNIX variants. ------------------------------------------- */ + /* Prefer sysctl() over sysconf() except sysctl() HW_REALMEM and HW_PHYSMEM */ + +#if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64)) + int mib[2]; + mib[0] = CTL_HW; +#if defined(HW_MEMSIZE) + mib[1] = HW_MEMSIZE; /* OSX. --------------------- */ +#elif defined(HW_PHYSMEM64) + mib[1] = HW_PHYSMEM64; /* NetBSD, OpenBSD. --------- */ +#endif + int64_t size = 0; /* 64-bit */ + size_t len = sizeof( size ); + if ( sysctl( mib, 2, &size, &len, NULL, 0 ) == 0 ) + return (size_t)size; + return 0L; /* Failed? */ + +#elif defined(_SC_AIX_REALMEM) + /* AIX. ----------------------------------------------------- */ + return (size_t)sysconf( _SC_AIX_REALMEM ) * (size_t)1024L; + +#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) + /* FreeBSD, Linux, OpenBSD, and Solaris. -------------------- */ + return (size_t)sysconf( _SC_PHYS_PAGES ) * + (size_t)sysconf( _SC_PAGESIZE ); + +#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGE_SIZE) + /* Legacy.
-------------------------------------------------- */ + return (size_t)sysconf( _SC_PHYS_PAGES ) * + (size_t)sysconf( _SC_PAGE_SIZE ); + +#elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM)) + /* DragonFly BSD, FreeBSD, NetBSD, OpenBSD, and OSX. -------- */ + int mib[2]; + mib[0] = CTL_HW; +#if defined(HW_REALMEM) + mib[1] = HW_REALMEM; /* FreeBSD. ----------------- */ +#elif defined(HW_PHYSMEM) + mib[1] = HW_PHYSMEM; /* Others. ------------------ */ +#endif + unsigned int size = 0; /* 32-bit */ + size_t len = sizeof( size ); + if ( sysctl( mib, 2, &size, &len, NULL, 0 ) == 0 ) + return (size_t)size; + return 0L; /* Failed? */ +#endif /* sysctl and sysconf variants */ + +#else + return 0L; /* Unknown OS. */ +#endif +} + +#endif // NOMEMORYSIZE +} // namespace common diff --git a/src/backend/host_memory.hpp b/src/backend/host_memory.hpp new file mode 100644 index 0000000000..5955cbfbd9 --- /dev/null +++ b/src/backend/host_memory.hpp @@ -0,0 +1,18 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include + +namespace common +{ + +size_t getHostMemorySize(); + +} diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 884dca14d1..94efd7a876 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -43,6 +43,7 @@ #include #include #include +#include using std::string; using std::vector; @@ -404,7 +405,9 @@ std::string getInfo() std::to_string(nDevices) + (show_braces ?
string("]") : "-"); - info << id << " " << getPlatformName(*device) << ": " << ltrim(dstr); + size_t msize = device->getInfo(); + info << id << " " << getPlatformName(*device) << ": " << ltrim(dstr) + << ", " << msize / 1048576 << " MB"; #ifndef NDEBUG info << " -- "; string devVersion = device->getInfo(); @@ -481,10 +484,23 @@ CommandQueue& getQueue() return *(devMngr.mQueues[devMngr.mActiveQId]); } -const cl::Device& getDevice() +const cl::Device& getDevice(int id) { DeviceManager& devMngr = DeviceManager::getInstance(); - return *(devMngr.mDevices[devMngr.mActiveQId]); + if(id == -1) id = devMngr.mActiveQId; + return *(devMngr.mDevices[id]); +} + +size_t getDeviceMemorySize(int device) +{ + const cl::Device& dev = getDevice(device); + size_t msize = dev.getInfo(); + return msize; +} + +size_t getHostMemorySize() +{ + return common::getHostMemorySize(); } cl_device_type getDeviceType() diff --git a/src/backend/opencl/platform.hpp b/src/backend/opencl/platform.hpp index d4f9f0e5ef..9b5377dc3c 100644 --- a/src/backend/opencl/platform.hpp +++ b/src/backend/opencl/platform.hpp @@ -33,7 +33,9 @@ class DeviceManager friend cl::CommandQueue& getQueue(); - friend const cl::Device& getDevice(); + friend const cl::Device& getDevice(int id); + + friend size_t getDeviceMemorySize(int device); friend bool isGLSharingSupported(); @@ -100,7 +102,11 @@ const cl::Context& getContext(); cl::CommandQueue& getQueue(); -const cl::Device& getDevice(); +const cl::Device& getDevice(int id = -1); + +size_t getDeviceMemorySize(int device); + +size_t getHostMemorySize(); cl_device_type getDeviceType(); From 6c306528854bafd05599ffbb928aaa171c6d0c9c Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Wed, 20 Jan 2016 18:30:03 -0500 Subject: [PATCH 209/288] Updates to Memory Manager and Garbage Collection Heuristics * Add getMaxMemorySize virtual function to MemoryManager * Used to compute the max_bytes for each memory manager instance * max_bytes * This is now a part of memory_info * Gets 
its size from the device memory rather than a generic size * For CPU and CUDA Pinned Memory, use getHostDeviceMemory * Add getMaxBytes function to fetch the max_bytes of active device * MAX_BUFFERS is now 1000 Fix missing include --- src/backend/MemoryManager.cpp | 23 ++++++++++++++++++----- src/backend/MemoryManager.hpp | 11 +++++++++-- src/backend/cpu/Array.cpp | 2 +- src/backend/cpu/memory.cpp | 12 +++++++++++- src/backend/cpu/memory.hpp | 5 +++-- src/backend/cpu/platform.cpp | 9 +++++++-- src/backend/cuda/Array.cpp | 2 +- src/backend/cuda/memory.cpp | 20 ++++++++++++++++++-- src/backend/cuda/memory.hpp | 5 +++-- src/backend/opencl/Array.cpp | 2 +- src/backend/opencl/memory.cpp | 20 ++++++++++++++++++-- src/backend/opencl/memory.hpp | 5 +++-- 12 files changed, 93 insertions(+), 23 deletions(-) diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp index cea4ae6b76..03cefe710d 100644 --- a/src/backend/MemoryManager.cpp +++ b/src/backend/MemoryManager.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include "MemoryManager.hpp" #include "dispatch.hpp" #include "err_common.hpp" @@ -18,10 +19,9 @@ namespace common { -MemoryManager::MemoryManager(int num_devices, unsigned MAX_BUFFERS, unsigned MAX_BYTES, bool debug): +MemoryManager::MemoryManager(int num_devices, unsigned MAX_BUFFERS, bool debug): mem_step_size(1024), max_buffers(MAX_BUFFERS), - max_bytes(MAX_BYTES), memory(num_devices), debug_mode(debug) { @@ -32,9 +32,16 @@ MemoryManager::MemoryManager(int num_devices, unsigned MAX_BUFFERS, unsigned MAX } if (this->debug_mode) mem_step_size = 1; + static const size_t oneGB = 1 << 30; for (int n = 0; n < num_devices; n++) { - memory[n].total_bytes = 0; - memory[n].lock_bytes = 0; + size_t memsize = getMaxMemorySize(n); + // Calls garbage collection when: + // total_bytes > memsize * 0.75 when memsize < 4GB + // total_bytes > memsize - 1 GB when memsize >= 4GB + // If memsize returned 0, then use 1GB + memory[n].max_bytes = memsize == 0
? oneGB : std::max(memsize * 0.75, (double)(memsize - oneGB)); + memory[n].total_bytes = 0; + memory[n].lock_bytes = 0; memory[n].lock_buffers = 0; } } @@ -115,7 +122,7 @@ void *MemoryManager::alloc(const size_t bytes) // FIXME: Add better checks for garbage collection // Perhaps look at total memory available as a metric if (current.map.size() > this->max_buffers || - current.lock_bytes >= this->max_bytes) { + current.lock_bytes >= current.max_bytes) { this->garbageCollect(); } @@ -204,6 +211,12 @@ void MemoryManager::setMemStepSize(size_t new_step_size) this->mem_step_size = new_step_size; } +size_t MemoryManager::getMaxBytes() +{ + lock_guard_t lock(this->memory_mutex); + return this->getCurrentMemoryInfo().max_bytes; +} + void MemoryManager::printInfo(const char *msg, const int device) { lock_guard_t lock(this->memory_mutex); diff --git a/src/backend/MemoryManager.hpp b/src/backend/MemoryManager.hpp index 5de8e4d823..8bb9941b87 100644 --- a/src/backend/MemoryManager.hpp +++ b/src/backend/MemoryManager.hpp @@ -37,11 +37,11 @@ class MemoryManager size_t lock_bytes; size_t lock_buffers; size_t total_bytes; + size_t max_bytes; } memory_info; size_t mem_step_size; unsigned max_buffers; - unsigned max_bytes; std::vector memory; bool debug_mode; @@ -55,8 +55,13 @@ class MemoryManager return 0; } + virtual size_t getMaxMemorySize(int id) + { + return 0; + } + public: - MemoryManager(int num_devices, unsigned MAX_BUFFERS, unsigned MAX_BYTES, bool debug); + MemoryManager(int num_devices, unsigned MAX_BUFFERS, bool debug); void *alloc(const size_t bytes); @@ -75,6 +80,8 @@ class MemoryManager size_t getMemStepSize(); + size_t getMaxBytes(); + void setMemStepSize(size_t new_step_size); virtual void *nativeAlloc(const size_t bytes) diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 862c576afe..891604cd27 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -161,7 +161,7 @@ createNodeArray(const dim4 &dims, Node_ptr node) if (length 
> MAX_TNJ_LEN || buf_count >= MAX_BUFFERS || - bytes >= MAX_BYTES) { + bytes >= getMaxBytes()) { out.eval(); } diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index c387b68b71..09a0e83c80 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -29,6 +29,7 @@ namespace cpu class MemoryManager : public common::MemoryManager { int getActiveDeviceId(); + size_t getMaxMemorySize(int id); public: MemoryManager(); void *nativeAlloc(const size_t bytes); @@ -48,8 +49,13 @@ int MemoryManager::getActiveDeviceId() return cpu::getActiveDeviceId(); } +size_t MemoryManager::getMaxMemorySize(int id) +{ + return cpu::getDeviceMemorySize(id); +} + MemoryManager::MemoryManager() : - common::MemoryManager(getDeviceCount(), MAX_BUFFERS, MAX_BYTES, AF_MEM_DEBUG || AF_CPU_MEM_DEBUG) + common::MemoryManager(getDeviceCount(), MAX_BUFFERS, AF_MEM_DEBUG || AF_CPU_MEM_DEBUG) {} @@ -79,6 +85,10 @@ size_t getMemStepSize(void) return getMemoryManager().getMemStepSize(); } +size_t getMaxBytes() +{ + return getMemoryManager().getMaxBytes(); +} void garbageCollect() { diff --git a/src/backend/cpu/memory.hpp b/src/backend/cpu/memory.hpp index 279b3dbd28..8f61f11f7b 100644 --- a/src/backend/cpu/memory.hpp +++ b/src/backend/cpu/memory.hpp @@ -26,8 +26,9 @@ namespace cpu template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); - static const unsigned MAX_BUFFERS = 100; - static const unsigned MAX_BYTES = 100 * (1 << 20); + static const unsigned MAX_BUFFERS = 1000; + + size_t getMaxBytes(); void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); diff --git a/src/backend/cpu/platform.cpp b/src/backend/cpu/platform.cpp index 49abda3c8d..65a5ab1faf 100644 --- a/src/backend/cpu/platform.cpp +++ b/src/backend/cpu/platform.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -215,8 +216,12 @@ std::string getInfo() info << "ArrayFire v" << AF_VERSION << " (CPU, 
" << get_system() << ", build " << AF_REVISION << ")" << std::endl; std::string model = cinfo.model(); - info << string("[0] ") << cinfo.vendor() <<": " << ltrim(model) - << ", " << (int)(getDeviceMemorySize(getActiveDeviceId()) / 1048576.0) << " MB, "; + size_t memMB = getDeviceMemorySize(getActiveDeviceId()) / 1048576; + info << string("[0] ") << cinfo.vendor() <<": " << ltrim(model); + + if(memMB) info << ", " << memMB << " MB, "; + else info << ", Unknown MB, "; + info << "Max threads("<< cinfo.threads()<<") "; #ifndef NDEBUG info << AF_COMPILER_STR; diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp index 39cd06c43b..1ca6012211 100644 --- a/src/backend/cuda/Array.cpp +++ b/src/backend/cuda/Array.cpp @@ -150,7 +150,7 @@ namespace cuda if (length > MAX_JIT_LEN || buf_count >= MAX_BUFFERS || - bytes >= MAX_BYTES) { + bytes >= getMaxBytes()) { out.eval(); } diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index 15786d9498..f5dc6ca048 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -37,6 +37,7 @@ namespace cuda class MemoryManager : public common::MemoryManager { int getActiveDeviceId(); + size_t getMaxMemorySize(int id); public: MemoryManager(); void *nativeAlloc(const size_t bytes); @@ -66,6 +67,7 @@ class MemoryManager : public common::MemoryManager class MemoryManagerPinned : public common::MemoryManager { int getActiveDeviceId(); + size_t getMaxMemorySize(int id); public: MemoryManagerPinned(); void *nativeAlloc(const size_t bytes); @@ -82,8 +84,13 @@ int MemoryManager::getActiveDeviceId() return cuda::getActiveDeviceId(); } +size_t MemoryManager::getMaxMemorySize(int id) +{ + return cuda::getDeviceMemorySize(id); +} + MemoryManager::MemoryManager() : - common::MemoryManager(getDeviceCount(), MAX_BUFFERS, MAX_BYTES, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) + common::MemoryManager(getDeviceCount(), MAX_BUFFERS, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) {} void *MemoryManager::nativeAlloc(const size_t 
bytes) @@ -112,8 +119,13 @@ int MemoryManagerPinned::getActiveDeviceId() return 0; // pinned uses a single vector } +size_t MemoryManagerPinned::getMaxMemorySize(int id) +{ + return cuda::getHostMemorySize(); +} + MemoryManagerPinned::MemoryManagerPinned() : - common::MemoryManager(1, MAX_BUFFERS, MAX_BYTES, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) + common::MemoryManager(1, MAX_BUFFERS, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) {} void *MemoryManagerPinned::nativeAlloc(const size_t bytes) @@ -147,6 +159,10 @@ size_t getMemStepSize(void) return getMemoryManager().getMemStepSize(); } +size_t getMaxBytes() +{ + return getMemoryManager().getMaxBytes(); +} void garbageCollect() { diff --git a/src/backend/cuda/memory.hpp b/src/backend/cuda/memory.hpp index 5b362cd587..590ba3b880 100644 --- a/src/backend/cuda/memory.hpp +++ b/src/backend/cuda/memory.hpp @@ -25,8 +25,9 @@ namespace cuda template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); - static const unsigned MAX_BUFFERS = 100; - static const unsigned MAX_BYTES = (1 << 30); + static const unsigned MAX_BUFFERS = 1000; + + size_t getMaxBytes(); void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp index 044a6322ab..7b6a26eb4d 100644 --- a/src/backend/opencl/Array.cpp +++ b/src/backend/opencl/Array.cpp @@ -179,7 +179,7 @@ namespace opencl if (is_max_jit_len(length) || buf_count >= MAX_BUFFERS || - bytes >= MAX_BYTES) { + bytes >= getMaxBytes()) { out.eval(); } diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index 2427581f93..7054e96479 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -32,6 +32,7 @@ namespace opencl class MemoryManager : public common::MemoryManager { int getActiveDeviceId(); + size_t getMaxMemorySize(int id); public: MemoryManager(); void *nativeAlloc(const size_t bytes); @@ -52,6 +53,7 @@ class 
MemoryManagerPinned : public common::MemoryManager std::map > pinned_maps; int getActiveDeviceId(); + size_t getMaxMemorySize(int id); public: @@ -80,8 +82,13 @@ int MemoryManager::getActiveDeviceId() return opencl::getActiveDeviceId(); } +size_t MemoryManager::getMaxMemorySize(int id) +{ + return opencl::getDeviceMemorySize(id); +} + MemoryManager::MemoryManager() : - common::MemoryManager(getDeviceCount(), MAX_BUFFERS, MAX_BYTES, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG) + common::MemoryManager(getDeviceCount(), MAX_BUFFERS, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG) {} void *MemoryManager::nativeAlloc(const size_t bytes) @@ -113,8 +120,13 @@ int MemoryManagerPinned::getActiveDeviceId() return opencl::getActiveDeviceId(); } +size_t MemoryManagerPinned::getMaxMemorySize(int id) +{ + return opencl::getDeviceMemorySize(id); +} + MemoryManagerPinned::MemoryManagerPinned() : - common::MemoryManager(getDeviceCount(), MAX_BUFFERS, MAX_BYTES, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG), + common::MemoryManager(getDeviceCount(), MAX_BUFFERS, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG), pinned_maps(getDeviceCount()) {} @@ -163,6 +175,10 @@ size_t getMemStepSize(void) return getMemoryManager().getMemStepSize(); } +size_t getMaxBytes() +{ + return getMemoryManager().getMaxBytes(); +} void garbageCollect() { diff --git a/src/backend/opencl/memory.hpp b/src/backend/opencl/memory.hpp index da27e0d8d5..ea40b4b96f 100644 --- a/src/backend/opencl/memory.hpp +++ b/src/backend/opencl/memory.hpp @@ -30,8 +30,9 @@ namespace opencl template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); - static const unsigned MAX_BUFFERS = 100; - static const unsigned MAX_BYTES = (1 << 30); + static const unsigned MAX_BUFFERS = 1000; + + size_t getMaxBytes(); void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); From 043739fd255b8a7a90bd4e7722462888b999a885 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Jan 2016 14:38:45 -0500 
Subject: [PATCH 210/288] Move ArrayFireConfig, CPack (as CPackConfig) into CMakeModules --- CMakeLists.txt | 8 ++++---- .../ArrayFireConfig.cmake.in | 0 .../ArrayFireConfigVersion.cmake.in | 0 CPack.cmake => CMakeModules/CPackConfig.cmake | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) rename ArrayFireConfig.cmake.in => CMakeModules/ArrayFireConfig.cmake.in (100%) rename ArrayFireConfigVersion.cmake.in => CMakeModules/ArrayFireConfigVersion.cmake.in (100%) rename CPack.cmake => CMakeModules/CPackConfig.cmake (98%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 61a78a635f..f54a9be748 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,7 +221,7 @@ ENDIF(FORGE_FOUND AND NOT USE_SYSTEM_FORGE) SET(INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include") SET(BACKEND_DIR "src/backend/\${lowerbackend}") CONFIGURE_FILE( - ${CMAKE_CURRENT_SOURCE_DIR}/ArrayFireConfig.cmake.in + ${CMAKE_MODULE_PATH}/ArrayFireConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/ArrayFireConfig.cmake @ONLY) @@ -231,11 +231,11 @@ STRING(REGEX REPLACE "[^/]+" ".." 
reldir "${AF_INSTALL_CMAKE_DIR}") SET(INCLUDE_DIR "\${CMAKE_CURRENT_LIST_DIR}/${reldir}/include") set(BACKEND_DIR) CONFIGURE_FILE( - ${CMAKE_CURRENT_SOURCE_DIR}/ArrayFireConfig.cmake.in + ${CMAKE_MODULE_PATH}/ArrayFireConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/Install/ArrayFireConfig.cmake @ONLY) CONFIGURE_FILE( - ${CMAKE_CURRENT_SOURCE_DIR}/ArrayFireConfigVersion.cmake.in + ${CMAKE_MODULE_PATH}/ArrayFireConfigVersion.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/ArrayFireConfigVersion.cmake @ONLY) INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/Install/ArrayFireConfig.cmake @@ -263,4 +263,4 @@ ENDIF(APPLE) ## # Packaging ## -include(${CMAKE_CURRENT_SOURCE_DIR}/CPack.cmake) +include(${CMAKE_MODULE_PATH}/CPackConfig.cmake) diff --git a/ArrayFireConfig.cmake.in b/CMakeModules/ArrayFireConfig.cmake.in similarity index 100% rename from ArrayFireConfig.cmake.in rename to CMakeModules/ArrayFireConfig.cmake.in diff --git a/ArrayFireConfigVersion.cmake.in b/CMakeModules/ArrayFireConfigVersion.cmake.in similarity index 100% rename from ArrayFireConfigVersion.cmake.in rename to CMakeModules/ArrayFireConfigVersion.cmake.in diff --git a/CPack.cmake b/CMakeModules/CPackConfig.cmake similarity index 98% rename from CPack.cmake rename to CMakeModules/CPackConfig.cmake index 2e7f1d5a03..de242a99b7 100644 --- a/CPack.cmake +++ b/CMakeModules/CPackConfig.cmake @@ -1,6 +1,6 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8) -include("${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/Version.cmake") +INCLUDE("${CMAKE_MODULE_PATH}/Version.cmake") # CPack package generation #SET(CPACK_GENERATOR "TGZ;STGZ") From cfd60f1fa85606293126f09f72f0bdc0a9ac0824 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Jan 2016 17:30:49 -0500 Subject: [PATCH 211/288] Move /bigobj definitions into main CMakeList (windows) /bigobj is now required for Debug builds for CPU. 
Since it is being used for 3 backends, it makes sense to move it into the central CMakeList --- CMakeLists.txt | 4 ++++ src/api/unified/CMakeLists.txt | 4 ---- src/backend/opencl/CMakeLists.txt | 4 ---- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f54a9be748..2cfeb18fed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,6 +152,10 @@ ELSE(${UNIX}) #Windows # http://www.kitware.com/blog/home/post/434 SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /Gm-") SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP /Gm-") + + # Builds that contain debug info require /bigobj + SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") + SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /bigobj") ENDIF(MSVC) ENDIF() diff --git a/src/api/unified/CMakeLists.txt b/src/api/unified/CMakeLists.txt index 21c9aebf97..6ed95d088c 100644 --- a/src/api/unified/CMakeLists.txt +++ b/src/api/unified/CMakeLists.txt @@ -32,10 +32,6 @@ ENDIF() # OS Definitions IF(UNIX) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -pthread -Wno-comment") -ELSE(${UNIX}) #Windows - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") - SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /bigobj") ENDIF() ADD_LIBRARY(af SHARED diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt index c9c47d0198..232b652bba 100644 --- a/src/backend/opencl/CMakeLists.txt +++ b/src/backend/opencl/CMakeLists.txt @@ -197,10 +197,6 @@ CL_KERNEL_TO_H( # OS Definitions IF(UNIX) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -pthread -Wno-comment") -ELSE(${UNIX}) #Windows - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") - SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /bigobj") ENDIF() IF(DEFINED BLAS_SYM_FILE) From cc2dda092b6b12684a42837a4784b9b9bc75cc8f Mon Sep 17 00:00:00 2001 From: Pavan
Yalamanchili Date: Thu, 21 Jan 2016 18:38:03 -0500 Subject: [PATCH 212/288] Fixes to build with MKL when INTEL_MKL_ROOT is exported --- CMakeModules/FindCBLAS.cmake | 64 ++++++++++++++++++++++++------- CMakeModules/FindLAPACKE.cmake | 56 +++++++++++++++++++-------- src/backend/cpu/CMakeLists.txt | 1 + src/backend/opencl/CMakeLists.txt | 4 ++ 4 files changed, 95 insertions(+), 30 deletions(-) diff --git a/CMakeModules/FindCBLAS.cmake b/CMakeModules/FindCBLAS.cmake index b0cd3bdca0..efef36b093 100644 --- a/CMakeModules/FindCBLAS.cmake +++ b/CMakeModules/FindCBLAS.cmake @@ -53,19 +53,40 @@ SET(CBLAS_ROOT_DIR CACHE STRING INCLUDE(CheckTypeSize) CHECK_TYPE_SIZE("void*" SIZE_OF_VOIDP) -SET(CBLAS_LIB_DIR) +IF (NOT INTEL_MKL_ROOT_DIR) + SET(INTEL_MKL_ROOT_DIR $ENV{INTEL_MKL_ROOT}) +ENDIF() -SET(CBLAS_ROOT_DIR "${INTEL_MKL_ROOT_DIR}") +IF(NOT CBLAS_ROOT_DIR) -IF(CBLAS_ROOT_DIR) - IF(INTEL_MKL_ROOT_DIR) - IF ("${SIZE_OF_VOIDP}" EQUAL 8) - SET(CBLAS_LIB_DIR "${INTEL_MKL_ROOT_DIR}/lib/intel64") - ELSE() - SET(CBLAS_LIB_DIR "${INTEL_MKL_ROOT_DIR}/lib/ia32") - ENDIF() + IF (ENV{CBLASDIR}) + SET(CBLAS_ROOT_DIR $ENV{CBLASDIR}) + IF ("${SIZE_OF_VOIDP}" EQUAL 8) + SET(CBLAS_LIB64_DIR "${INTEL_MKL_ROOT_DIR}/lib64") + ELSE() + SET(CBLAS_LIB32_DIR "${INTEL_MKL_ROOT_DIR}/lib") + ENDIF() + ENDIF() + + IF (ENV{CBLAS_ROOT_DIR}) + SET(CBLAS_ROOT_DIR $ENV{CBLAS_ROOT_DIR}) + IF ("${SIZE_OF_VOIDP}" EQUAL 8) + SET(CBLAS_LIB64_DIR "${INTEL_MKL_ROOT_DIR}/lib64") + ELSE() + SET(CBLAS_LIB32_DIR "${INTEL_MKL_ROOT_DIR}/lib") + ENDIF() + ENDIF() + + IF (INTEL_MKL_ROOT_DIR) + SET(CBLAS_ROOT_DIR ${INTEL_MKL_ROOT_DIR}) + IF ("${SIZE_OF_VOIDP}" EQUAL 8) + SET(CBLAS_LIB64_DIR "${INTEL_MKL_ROOT_DIR}/lib/intel64") + ELSE() + SET(CBLAS_LIB32_DIR "${INTEL_MKL_ROOT_DIR}/lib/ia32") ENDIF() - SET(CBLAS_INCLUDE_DIR "${INTEL_MKL_ROOT_DIR}/include") + ENDIF() + + SET(CBLAS_INCLUDE_DIR "${CBLAS_ROOT_DIR}/include") ENDIF() # Old CBLAS search @@ -116,14 +137,14 @@ MACRO(CHECK_ALL_LIBRARIES NAMES ${_library} PATHS 
/usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV DYLD_LIBRARY_PATH - "{CBLAS_LIB_DIR}" + "${CBLAS_LIB_DIR}" "${CBLAS_LIB32_DIR}" "${CBLAS_LIB64_DIR}" ) ELSE(APPLE) FIND_LIBRARY(${_prefix}_${_library}_LIBRARY NAMES ${_library} PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV LD_LIBRARY_PATH - "${CBLAS_LIB_DIR}" + "${CBLAS_LIB_DIR}" "${CBLAS_LIB32_DIR}" "${CBLAS_LIB64_DIR}" PATH_SUFFIXES atlas ) IF(NOT ${_prefix}_${library}_LIBRARY) @@ -132,7 +153,7 @@ MACRO(CHECK_ALL_LIBRARIES NAMES ${_library} PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV LD_LIBRARY_PATH - "${CBLAS_LIB_DIR}" + "${CBLAS_LIB_DIR}" "${CBLAS_LIB32_DIR}" "${CBLAS_LIB64_DIR}" PATH_SUFFIXES atlas ) ENDIF(NOT ${_prefix}_${library}_LIBRARY) @@ -194,6 +215,23 @@ MACRO(CHECK_ALL_LIBRARIES ENDIF(NOT _libraries_work) ENDMACRO(CHECK_ALL_LIBRARIES) +# MKL CBLAS library? +IF(NOT CBLAS_LIBRARIES) + CHECK_ALL_LIBRARIES( + CBLAS_LIBRARIES + CBLAS + cblas_dgemm + "" + "mkl_rt" + "mkl_cblas.h" + FALSE, + TRUE) +ENDIF(NOT CBLAS_LIBRARIES) + +IF(CBLAS_LIBRARIES) + SET(MKL_FOUND ON) +ENDIF() + # Apple CBLAS library? IF(NOT CBLAS_LIBRARIES) CHECK_ALL_LIBRARIES( diff --git a/CMakeModules/FindLAPACKE.cmake b/CMakeModules/FindLAPACKE.cmake index 3bf8a1f362..dc4a045370 100644 --- a/CMakeModules/FindLAPACKE.cmake +++ b/CMakeModules/FindLAPACKE.cmake @@ -9,15 +9,33 @@ # LAPACK_INCLUDES ... 
LAPACKE include directory # -IF(NOT LAPACKE_ROOT AND ENV{LAPACKEDIR}) - SET(LAPACKE_ROOT $ENV{LAPACKEDIR}) +SET(LAPACKE_ROOT_DIR CACHE STRING + "Root directory for custom LAPACK implementation") + +IF (NOT INTEL_MKL_ROOT_DIR) + SET(INTEL_MKL_ROOT_DIR $ENV{INTEL_MKL_ROOT}) +ENDIF() + +IF(NOT LAPACKE_ROOT_DIR) + + IF (ENV{LAPACKEDIR}) + SET(LAPACKE_ROOT_DIR $ENV{LAPACKEDIR}) + ENDIF() + + IF (ENV{LAPACKE_ROOT_DIR_DIR}) + SET(LAPACKE_ROOT_DIR $ENV{LAPACKE_ROOT_DIR}) + ENDIF() + + IF (INTEL_MKL_ROOT_DIR) + SET(LAPACKE_ROOT_DIR ${INTEL_MKL_ROOT_DIR}) + ENDIF() ENDIF() # Check if we can use PkgConfig FIND_PACKAGE(PkgConfig) #Determine from PKG -IF(PKG_CONFIG_FOUND AND NOT LAPACKE_ROOT) +IF(PKG_CONFIG_FOUND AND NOT LAPACKE_ROOT_DIR) PKG_CHECK_MODULES( PC_LAPACKE QUIET "lapacke") ENDIF() @@ -48,40 +66,41 @@ IF(PC_LAPACKE_FOUND) ELSE(PC_LAPACKE_FOUND) - IF(LAPACKE_ROOT) + IF(LAPACKE_ROOT_DIR) #find libs FIND_LIBRARY( LAPACKE_LIB - NAMES "lapacke" "LAPACKE" "liblapacke" - PATHS ${LAPACKE_ROOT} - PATH_SUFFIXES "lib" "lib64" + NAMES "lapacke" "LAPACKE" "liblapacke" "mkl_rt" + PATHS ${LAPACKE_ROOT_DIR} + PATH_SUFFIXES "lib" "lib64" "lib/ia32" "lib/intel64" DOC "LAPACKE Library" NO_DEFAULT_PATH ) FIND_LIBRARY( LAPACK_LIB - NAMES "lapack" "LAPACK" "liblapack" - PATHS ${LAPACKE_ROOT} - PATH_SUFFIXES "lib" "lib64" + NAMES "lapack" "LAPACK" "liblapack" "mkl_rt" + PATHS ${LAPACKE_ROOT_DIR} + PATH_SUFFIXES "lib" "lib64" "lib/ia32" "lib/intel64" DOC "LAPACK Library" NO_DEFAULT_PATH ) FIND_PATH( LAPACKE_INCLUDES - NAMES "lapacke.h" - PATHS ${LAPACKE_ROOT} + NAMES "lapacke.h" "mkl_lapacke.h" + PATHS ${LAPACKE_ROOT_DIR} PATH_SUFFIXES "include" DOC "LAPACKE Include Directory" NO_DEFAULT_PATH ) - ELSE() FIND_LIBRARY( LAPACKE_LIB - NAMES "lapacke" "liblapacke" "openblas" + NAMES "lapacke" "liblapacke" "openblas" "mkl_rt" PATHS ${PC_LAPACKE_LIBRARY_DIRS} ${LIB_INSTALL_DIR} + /opt/intel/mkl/lib/ia32 + /opt/intel/mkl/lib/intel64 /usr/lib64 /usr/lib /usr/local/lib64 @@ -92,10 +111,12 @@ 
ELSE(PC_LAPACKE_FOUND) ) FIND_LIBRARY( LAPACK_LIB - NAMES "lapack" "liblapack" "openblas" + NAMES "lapack" "liblapack" "openblas" "mkl_rt" PATHS ${PC_LAPACKE_LIBRARY_DIRS} ${LIB_INSTALL_DIR} + /opt/intel/mkl/lib/ia32 + /opt/intel/mkl/lib/intel64 /usr/lib64 /usr/lib /usr/local/lib64 @@ -106,17 +127,18 @@ ELSE(PC_LAPACKE_FOUND) ) FIND_PATH( LAPACKE_INCLUDES - NAMES "lapacke.h" + NAMES "lapacke.h" "mkl_lapacke.h" PATHS ${PC_LAPACKE_INCLUDE_DIRS} ${INCLUDE_INSTALL_DIR} + /opt/intel/mkl/include /usr/include /usr/local/include /sw/include /opt/local/include DOC "LAPACKE Include Directory" ) - ENDIF(LAPACKE_ROOT) + ENDIF(LAPACKE_ROOT_DIR) ENDIF(PC_LAPACKE_FOUND) SET(LAPACK_LIBRARIES ${LAPACKE_LIB} ${LAPACK_LIB}) diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt index b0ab17a616..5dee6de3f5 100644 --- a/src/backend/cpu/CMakeLists.txt +++ b/src/backend/cpu/CMakeLists.txt @@ -41,6 +41,7 @@ IF(NOT LAPACK_FOUND) MESSAGE(WARNING "LAPACK not found. Functionality will be disabled") ELSE(NOT LAPACK_FOUND) ADD_DEFINITIONS(-DWITH_CPU_LINEAR_ALGEBRA) + MESSAGE(STATUS "LAPACK libraries found: ${LAPACK_LIBRARIES}") ENDIF() IF(NOT UNIX) diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt index c9c47d0198..6731c67605 100644 --- a/src/backend/opencl/CMakeLists.txt +++ b/src/backend/opencl/CMakeLists.txt @@ -42,6 +42,10 @@ ELSE(NOT LAPACK_FOUND) ENDIF() ENDIF() +IF(${MKL_FOUND}) + ADD_DEFINITIONS(-DUSE_MKL) +ENDIF() + IF(NOT UNIX) ADD_DEFINITIONS(-DAFDLL) ENDIF() From aba1851efcad92cd9c95cfd795de97ef2a9e6dcb Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Jan 2016 22:53:02 -0500 Subject: [PATCH 213/288] BUGFIX Add/remove entries for platform when adding external device/context --- src/backend/opencl/platform.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 94efd7a876..1abb03279f 100644 --- a/src/backend/opencl/platform.cpp +++ 
b/src/backend/opencl/platform.cpp @@ -699,6 +699,7 @@ void addDeviceContext(cl_device_id dev, cl_context ctx, cl_command_queue que) devMngr.mDevices.push_back(tDevice); devMngr.mContexts.push_back(tContext); devMngr.mQueues.push_back(tQueue); + devMngr.mPlatforms.push_back(getPlatformEnum(*tDevice)); // FIXME: add OpenGL Interop for user provided contexts later devMngr.mIsGLSharingOn.push_back(false); } catch (const cl::Error &ex) { @@ -757,6 +758,7 @@ void removeDeviceContext(cl_device_id dev, cl_context ctx) devMngr.mDevices.erase(devMngr.mDevices.begin()+deleteIdx); devMngr.mContexts.erase(devMngr.mContexts.begin()+deleteIdx); devMngr.mQueues.erase(devMngr.mQueues.begin()+deleteIdx); + devMngr.mPlatforms.erase(devMngr.mPlatforms.begin()+deleteIdx); // FIXME: add OpenGL Interop for user provided contexts later devMngr.mIsGLSharingOn.erase(devMngr.mIsGLSharingOn.begin()+deleteIdx); // OTHERWISE, update(decrement) the `mActive*Id` variables From 163b5fbf8d51e7c3f7666b3a68cd21740b8284e6 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 22 Jan 2016 22:55:19 -0500 Subject: [PATCH 214/288] BUGFIX Fix CUDA device management and free at destructor --- src/backend/cuda/kernel/random.hpp | 15 ++++++++++++--- src/backend/cuda/platform.cpp | 5 ++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/backend/cuda/kernel/random.hpp b/src/backend/cuda/kernel/random.hpp index 4d960ae46b..96cf098c03 100644 --- a/src/backend/cuda/kernel/random.hpp +++ b/src/backend/cuda/kernel/random.hpp @@ -49,8 +49,18 @@ namespace kernel ~curandStateManager() { - //if(_state != NULL) memFree((char*)_state); - if(_state != NULL) CUDA_CHECK(cudaFree(_state)); + try { + if (_state != NULL) { + cudaError_t err = cudaFree(_state); + if (err != cudaErrorCudartUnloading) { + CUDA_CHECK(err); + } + } + } catch (AfError err) { + if (err.getError() != AF_ERR_DRIVER) { // Can happen from cudaErrorDevicesUnavailable + throw err; + } + } } unsigned long long getSeed() const @@ -69,7 
+79,6 @@ namespace kernel if(_state) return _state; - //_state = (curandState_t*)memAlloc(BLOCKS * THREADS * sizeof(curandState_t)); CUDA_CHECK(cudaMalloc((void **)&_state, BLOCKS * THREADS * sizeof(curandState_t))); this->resetSeed(); return _state; diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp index 46b730314f..744bf7eb2c 100644 --- a/src/backend/cuda/platform.cpp +++ b/src/backend/cuda/platform.cpp @@ -425,7 +425,7 @@ int DeviceManager::setActiveDevice(int device, int nId) // Comes only when first is true. Set it to false first = false; - while(device < numDevices) { + while(true) { // Check for errors other than DevicesUnavailable // If success, return. Else throw error // If DevicesUnavailable, try other devices (while loop below) @@ -435,12 +435,15 @@ int DeviceManager::setActiveDevice(int device, int nId) return old; } cudaGetLastError(); // Reset error stack +#ifndef NDEBUG printf("Warning: Device %d is unavailable. Incrementing to next device \n", device); +#endif // Comes here is the device is in exclusive mode or // otherwise fails streamCreate with this error. // All other errors will error out device++; + if (device >= numDevices) break; // Can't call getNativeId here as it will cause an infinite loop with the constructor nId = cuDevices[device].nativeId; From 805dc5b60937cc6d3f9ddd912b540bf967f5dfdc Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 22 Jan 2016 22:56:59 -0500 Subject: [PATCH 215/288] Fix Tests: ORB, Meanshift, basic_c, solve * Fix vector in fast_pyramid - use resize instead of reserve * Fix meanshift test. 
Use proper types and arrays * Fix memory leak in basic_c * Enable solve tests that were disabled for windows opencl --- src/backend/cuda/kernel/fast_pyramid.hpp | 6 +++++- test/basic_c.c | 1 + test/meanshift.cpp | 15 +++++++++------ test/solve_dense.cpp | 5 +---- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/backend/cuda/kernel/fast_pyramid.hpp b/src/backend/cuda/kernel/fast_pyramid.hpp index 61a9c7ac32..d2e5903788 100644 --- a/src/backend/cuda/kernel/fast_pyramid.hpp +++ b/src/backend/cuda/kernel/fast_pyramid.hpp @@ -65,7 +65,11 @@ void fast_pyramid(std::vector& feat_pyr, lvl_best[max_levels-1] = max_feat - feat_sum; // Hold multi-scale image pyramids - img_pyr.reserve(max_levels); + static const dim4 dims0; + static const CParam emptyCParam(NULL, dims0.get(), dims0.get()); + // Need to do this as CParam does not have a default constructor + // And resize needs a default constructor or default value prior to C++11 + img_pyr.resize(max_levels, emptyCParam); // Create multi-scale image pyramid for (unsigned i = 0; i < max_levels; i++) { diff --git a/test/basic_c.c b/test/basic_c.c index f6c731092a..0caca290ec 100644 --- a/test/basic_c.c +++ b/test/basic_c.c @@ -13,5 +13,6 @@ int main() { af_array out = 0; dim_t s[] = {10, 10, 1, 1}; af_err e = af_randu(&out, 4, s, f32); + if(out != 0) af_release_array(out); return (AF_SUCCESS != e); } diff --git a/test/meanshift.cpp b/test/meanshift.cpp index 34b622be1a..a35ca288d9 100644 --- a/test/meanshift.cpp +++ b/test/meanshift.cpp @@ -65,11 +65,12 @@ void meanshiftTest(string pTestFile) for (size_t testId=0; testId(&inArray, inArray_f32)); - ASSERT_EQ(AF_SUCCESS, af_load_image(&goldArray, outFiles[testId].c_str(), isColor)); + ASSERT_EQ(AF_SUCCESS, af_load_image(&goldArray_f32, outFiles[testId].c_str(), isColor)); + ASSERT_EQ(AF_SUCCESS, conv_image(&goldArray, goldArray_f32)); // af_load_image always returns float array ASSERT_EQ(AF_SUCCESS, af_get_elements(&nElems, goldArray)); ASSERT_EQ(AF_SUCCESS, 
af_mean_shift(&outArray, inArray, 2.25f, 25.56f, 5, isColor)); @@ -94,6 +96,7 @@ void meanshiftTest(string pTestFile) ASSERT_EQ(AF_SUCCESS, af_release_array(inArray_f32)); ASSERT_EQ(AF_SUCCESS, af_release_array(outArray)); ASSERT_EQ(AF_SUCCESS, af_release_array(goldArray)); + ASSERT_EQ(AF_SUCCESS, af_release_array(goldArray_f32)); } } diff --git a/test/solve_dense.cpp b/test/solve_dense.cpp index 09addc7c48..183afdbcc8 100644 --- a/test/solve_dense.cpp +++ b/test/solve_dense.cpp @@ -186,15 +186,12 @@ SOLVE_TESTS(cdouble, 1E-5) #define SOLVE_TESTS(T, eps) \ TEST(SOLVE, T##RectOver) \ { \ - solveTester(800, 600, 50, eps); \ + solveTester(800, 600, 64, eps); \ } SOLVE_TESTS(float, 0.01) SOLVE_TESTS(double, 1E-5) -// Fails on Windows on some devices -#if !(defined(OS_WIN) && defined(AF_OPENCL)) SOLVE_TESTS(cfloat, 0.01) SOLVE_TESTS(cdouble, 1E-5) -#endif #undef SOLVE_TESTS From 7eb905f1f05bd9e3906551dcc801c05254aa202d Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 25 Jan 2016 17:47:21 -0500 Subject: [PATCH 216/288] Add documentation for deviceInfo --- docs/details/device.dox | 16 ++++++++++++++++ include/af/device.h | 16 +++++----------- src/backend/cuda/platform.cpp | 4 ++-- src/backend/opencl/platform.cpp | 3 +-- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/docs/details/device.dox b/docs/details/device.dox index c89d2a17f0..1aa43e7465 100644 --- a/docs/details/device.dox +++ b/docs/details/device.dox @@ -2,6 +2,22 @@ \addtogroup arrayfire_func @{ +\defgroup device_func_prop deviceInfo +\ingroup device_mat + +\brief Gets the information about device and platform as strings + +\param d_name pointer to a user-allocated char array. Recommended minimum size is 64. +The name of the device is stored in this array. +\param d_platform pointer to a user-allocated char array. Recommended minimum size is 10. +The platform information is stored in this array. +\param d_toolkit pointer to a user-allocated char array. Recommended minimum size is 64. 
+The toolkit information is stored in this array. +\param d_compute pointer to a user-allocated char array. Recommended minimum size is 10. +The compute version of the device is stored in this array. + +=============================================================================== + \defgroup device_func_count getDeviceCount \ingroup device_mat diff --git a/include/af/device.h b/include/af/device.h index 28830675f8..c0d787ea80 100644 --- a/include/af/device.h +++ b/include/af/device.h @@ -50,19 +50,11 @@ namespace af */ /** - \defgroup device_func_prop deviceInfo + \copydoc device_func_prop - Get device information - - @{ - - \ingroup arrayfire_func - \ingroup device_mat + \ingroup device_func_prop */ AFAPI void deviceInfo(char* d_name, char* d_platform, char *d_toolkit, char* d_compute); - /** - @} - */ /// \brief Gets the number of devices /// @@ -267,7 +259,9 @@ extern "C" { AFAPI af_err af_info_string(char** str, const bool verbose); /** - \ingroup device_func_prop + \copydoc device_func_prop + + \ingroup device_func_prop */ AFAPI af_err af_device_info(char* d_name, char* d_platform, char *d_toolkit, char* d_compute); diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp index 744bf7eb2c..5e53fc0034 100644 --- a/src/backend/cuda/platform.cpp +++ b/src/backend/cuda/platform.cpp @@ -214,7 +214,7 @@ void devprop(char* d_name, char* d_platform, char *d_toolkit, char* d_compute) cudaDeviceProp dev = getDeviceProp(getActiveDeviceId()); // Name - snprintf(d_name, 32, "%s", dev.name); + snprintf(d_name, 64, "%s", dev.name); //Platform std::string cudaRuntime = getCUDARuntimeVersion(); @@ -225,7 +225,7 @@ void devprop(char* d_name, char* d_platform, char *d_toolkit, char* d_compute) snprintf(d_compute, 10, "%d.%d", dev.major, dev.minor); // Sanitize input - for (int i = 0; i < 31; i++) { + for (int i = 0; i < 63; i++) { if (d_name[i] == ' ') { if (d_name[i + 1] == 0 || d_name[i + 1] == ' ') d_name[i] = 0; else d_name[i] = '_'; diff --git 
a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 1abb03279f..12bb71db51 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -418,8 +418,7 @@ std::string getInfo() info << " -- Device driver " << driVersion; info << " -- FP64 Support: " << (device->getInfo()>0 ? "True" : "False") - << ""; - info << "Unified Memory(" + info << " -- Unified Memory (" << (isHostUnifiedMemory(*device) ? "True" : "False") << ")"; #endif From 0039cdba798a675b3004217a37c004aaefe7a85f Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Wed, 27 Jan 2016 15:55:04 -0500 Subject: [PATCH 217/288] Proper exception handling for memory manager --- src/backend/MemoryManager.cpp | 9 +++++---- src/backend/cpu/memory.cpp | 4 +++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp index 03cefe710d..814262829e 100644 --- a/src/backend/MemoryManager.cpp +++ b/src/backend/MemoryManager.cpp @@ -145,12 +145,13 @@ void *MemoryManager::alloc(const size_t bytes) } // Perform garbage collection if memory can not be allocated - ptr = this->nativeAlloc(alloc_bytes); - - if (!ptr) { + try { + ptr = this->nativeAlloc(alloc_bytes); + } catch (AfError &ex) { + // If out of memory, run garbage collect and try again + if (ex.getError() != AF_ERR_NO_MEM) throw; this->garbageCollect(); ptr = this->nativeAlloc(alloc_bytes); - if (!ptr) AF_ERROR("Can not allocate memory", AF_ERR_NO_MEM); } buffer_info info = {true, false, alloc_bytes}; diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 09a0e83c80..cf7e1ba48b 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -61,7 +61,9 @@ MemoryManager::MemoryManager() : void *MemoryManager::nativeAlloc(const size_t bytes) { - return malloc(bytes); + void *ptr = malloc(bytes); + if (!ptr) AF_ERROR("Unable to allocate memory", AF_ERR_NO_MEM); + return ptr; } void MemoryManager::nativeFree(void *ptr) From 
91bed334073ea413ce2633d976f0801d97a73677 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Wed, 27 Jan 2016 15:58:13 -0500 Subject: [PATCH 218/288] Removing unneeded cudaDeviceSynchronize() --- src/backend/cuda/copy.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/cuda/copy.cu b/src/backend/cuda/copy.cu index 71893b8c16..df435d245c 100644 --- a/src/backend/cuda/copy.cu +++ b/src/backend/cuda/copy.cu @@ -71,7 +71,6 @@ namespace cuda ARG_ASSERT(1, (in.ndims() == dims.ndims())); Array ret = createEmptyArray(dims); kernel::copy(ret, in, in.ndims(), default_value, factor); - CUDA_CHECK(cudaDeviceSynchronize()); return ret; } From 519d3bb3f5a7e243223fea2da6709a6550b32816 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 28 Jan 2016 15:37:57 -0500 Subject: [PATCH 219/288] Adding compute 37 to list of accepted CUDA computes --- src/backend/cuda/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index 4c74070492..81d6ba243c 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -18,6 +18,7 @@ IF( CUDA_COMPUTE_20 OR CUDA_COMPUTE_30 OR CUDA_COMPUTE_32 OR CUDA_COMPUTE_35 + OR CUDA_COMPUTE_37 OR CUDA_COMPUTE_50 OR CUDA_COMPUTE_52 OR CUDA_COMPUTE_53 @@ -49,7 +50,7 @@ MACRO(SET_COMPUTE VERSION) ENDMACRO(SET_COMPUTE) # Iterate over compute versions. 
Create variables and enable computes if needed -FOREACH(VER 20 30 32 35 50 52 53) +FOREACH(VER 20 30 32 35 37 50 52 53) OPTION(CUDA_COMPUTE_${VER} "CUDA Compute Capability ${VER}" OFF) MARK_AS_ADVANCED(CUDA_COMPUTE_${VER}) IF(${CUDA_COMPUTE_${VER}}) From 96041b5f2103e0025cf378d11a184bba63cf1681 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 28 Jan 2016 17:09:20 -0500 Subject: [PATCH 220/288] BUGFIX: incorrect index for 3rd dimension in select / replace Affects both CUDA and OpenCL backends --- src/backend/cuda/kernel/select.hpp | 4 ++-- src/backend/opencl/kernel/select.cl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/cuda/kernel/select.hpp b/src/backend/cuda/kernel/select.hpp index ab5bf2da7b..ea242e45dd 100644 --- a/src/backend/cuda/kernel/select.hpp +++ b/src/backend/cuda/kernel/select.hpp @@ -41,7 +41,7 @@ namespace cuda const int idw = blockIdx.y / blk_y; const int blockIdx_x = blockIdx.x - idz * blk_x; - const int blockIdx_y = blockIdx.y - idz * blk_y; + const int blockIdx_y = blockIdx.y - idw * blk_y; const int idx = blockIdx_x * blockDim.x + threadIdx.x; const int idy = blockIdx_y * blockDim.y + threadIdx.y; @@ -110,7 +110,7 @@ namespace cuda const int idw = blockIdx.y / blk_y; const int blockIdx_x = blockIdx.x - idz * blk_x; - const int blockIdx_y = blockIdx.y - idz * blk_y; + const int blockIdx_y = blockIdx.y - idw * blk_y; const int idx = blockIdx_x * blockDim.x + threadIdx.x; const int idy = blockIdx_y * blockDim.y + threadIdx.y; diff --git a/src/backend/opencl/kernel/select.cl b/src/backend/opencl/kernel/select.cl index 94a36031c3..03248be1b9 100644 --- a/src/backend/opencl/kernel/select.cl +++ b/src/backend/opencl/kernel/select.cl @@ -41,7 +41,7 @@ void select_kernel(__global T *optr, KParam oinfo, const int idw = get_group_id(1) / groups_1; const int group_id_0 = get_group_id(0) - idz * groups_0; - const int group_id_1 = get_group_id(1) - idz * groups_1; + const int group_id_1 = get_group_id(1) - idw
* groups_1; const int idx = group_id_0 * get_local_size(0) + get_local_id(0); const int idy = group_id_1 * get_local_size(1) + get_local_id(1); @@ -80,7 +80,7 @@ void select_scalar_kernel(__global T *optr, KParam oinfo, const int idw = get_group_id(1) / groups_1; const int group_id_0 = get_group_id(0) - idz * groups_0; - const int group_id_1 = get_group_id(1) - idz * groups_1; + const int group_id_1 = get_group_id(1) - idw * groups_1; const int idx = group_id_0 * get_local_size(0) + get_local_id(0); const int idy = group_id_1 * get_local_size(1) + get_local_id(1); From 32426184a3f3b9ab5ab960a367edc89a5de34356 Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 29 Jan 2016 10:32:48 +0530 Subject: [PATCH 221/288] Documentation fix in matchTemplate function --- docs/details/vision.dox | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/details/vision.dox b/docs/details/vision.dox index 1d9d6b99ac..99582c3729 100644 --- a/docs/details/vision.dox +++ b/docs/details/vision.dox @@ -166,9 +166,12 @@ from the other and returns the result. \brief Template Matching -Template matching is an image processing technique to find small patches of an image which -match a given template image. A more in depth discussion on the topic can be found -[here](http://en.wikipedia.org/wiki/Template_matching). +Template matching is an image processing technique to find small patches of an image which match a given template image. This function does not currently support the following three metrics: +- \ref AF_NCC +- \ref AF_ZNCC +- \ref AF_SHD + +A more in-depth discussion about template matching can be found [here](http://en.wikipedia.org/wiki/Template_matching).
======================================================================= From 209643ba71796031f9d1dc5fbe273fe5a9ba3227 Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 29 Jan 2016 18:28:24 +0530 Subject: [PATCH 222/288] syntax+typo fix in opencl backend --- src/backend/opencl/platform.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 12bb71db51..9dbb3ca38c 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -417,7 +417,7 @@ std::string getInfo() info << devVersion; info << " -- Device driver " << driVersion; info << " -- FP64 Support: " - << (device->getInfo()>0 ? "True" : "False") + << (device->getInfo()>0 ? "True" : "False"); info << " -- Unified Memory (" << (isHostUnifiedMemory(*device) ? "True" : "False") << ")"; From 9bf14556a34895d3171793db5d6f54c0d7a0b555 Mon Sep 17 00:00:00 2001 From: pradeep Date: Fri, 29 Jan 2016 19:55:26 +0530 Subject: [PATCH 223/288] Updated test data for meanshift, bilateral & morph Replaced lena image from test data --- test/data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data b/test/data index 414f02d905..cec85080f1 160000 --- a/test/data +++ b/test/data @@ -1 +1 @@ -Subproject commit 414f02d90588ec2cde177202bd340c57be6e7d9a +Subproject commit cec85080f12c25486d025d1fb1cf69e1beb03e58 From f228de3243492817f4909991ec8c96457d42c6aa Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 28 Jan 2016 17:18:16 -0500 Subject: [PATCH 224/288] TEST: Adding tests for 3D and 4D select and replace --- test/replace.cpp | 43 +++++++++++++++++++++++++++++++++++++++++++ test/select.cpp | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/test/replace.cpp b/test/replace.cpp index 9e99eaee8f..faa5636eb8 100644 --- a/test/replace.cpp +++ b/test/replace.cpp @@ -130,3 +130,46 @@ TEST(Replace, NaN) ASSERT_EQ(hc[i], std::isnan(ha[i]) ? 
b : ha[i]); } } + +TEST(Replace, ISSUE_1249) +{ + dim4 dims(2, 3, 4); + array cond = af::randu(dims) > 0.5; + array a = af::randu(dims); + array b = a.copy(); + replace(b, !cond, a - a * 0.9); + array c = a - a * cond * 0.9; + + int num = (int)dims.elements(); + std::vector hb(num); + std::vector hc(num); + + b.host(&hb[0]); + c.host(&hc[0]); + + for (int i = 0; i < num; i++) { + ASSERT_EQ(hc[i], hb[i]) << "at " << i; + } +} + + +TEST(Replace, 4D) +{ + dim4 dims(2, 3, 4, 2); + array cond = af::randu(dims) > 0.5; + array a = af::randu(dims); + array b = a.copy(); + replace(b, !cond, a - a * 0.9); + array c = a - a * cond * 0.9; + + int num = (int)dims.elements(); + std::vector hb(num); + std::vector hc(num); + + b.host(&hb[0]); + c.host(&hc[0]); + + for (int i = 0; i < num; i++) { + ASSERT_EQ(hc[i], hb[i]) << "at " << i; + } +} diff --git a/test/select.cpp b/test/select.cpp index 1c39282b15..6e772ac7c4 100644 --- a/test/select.cpp +++ b/test/select.cpp @@ -136,3 +136,43 @@ TEST(Select, NaN) ASSERT_EQ(hc[i], std::isnan(ha[i]) ? 
b : ha[i]); } } + +TEST(Select, ISSUE_1249) +{ + dim4 dims(2, 3, 4); + array cond = af::randu(dims) > 0.5; + array a = af::randu(dims); + array b = select(cond, a - a * 0.9, a); + array c = a - a * cond * 0.9; + + int num = (int)dims.elements(); + std::vector hb(num); + std::vector hc(num); + + b.host(&hb[0]); + c.host(&hc[0]); + + for (int i = 0; i < num; i++) { + ASSERT_EQ(hc[i], hb[i]) << "at " << i; + } +} + +TEST(Select, 4D) +{ + dim4 dims(2, 3, 4, 2); + array cond = af::randu(dims) > 0.5; + array a = af::randu(dims); + array b = select(cond, a - a * 0.9, a); + array c = a - a * cond * 0.9; + + int num = (int)dims.elements(); + std::vector hb(num); + std::vector hc(num); + + b.host(&hb[0]); + c.host(&hc[0]); + + for (int i = 0; i < num; i++) { + ASSERT_EQ(hc[i], hb[i]) << "at " << i; + } +} From 41bad15c0cea1bbbea2ba540c212a63102500137 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 28 Jan 2016 15:38:17 -0500 Subject: [PATCH 225/288] Changes required to build tests in a single file --- test/CMakeLists.txt | 48 ++++++++++++++++++++++++++++++++---------- test/basic_c.c | 3 ++- test/fast.cpp | 4 ++-- test/gloh_nonfree.cpp | 16 ++++++++------ test/harris.cpp | 4 ++-- test/main.cpp | 6 ++++++ test/orb.cpp | 12 +++++------ test/reduce.cpp | 10 --------- test/rotate_linear.cpp | 14 ++++++------ test/scan.cpp | 10 --------- test/sift_nonfree.cpp | 17 ++++++++------- test/sort_by_key.cpp | 13 ++++++------ test/sort_index.cpp | 12 +++++------ test/susan.cpp | 4 ++-- test/svd_dense.cpp | 4 ++-- test/testHelpers.hpp | 30 ++++++++++++++------------ test/where.cpp | 11 ---------- test/wrap.cpp | 10 ++++----- 18 files changed, 118 insertions(+), 110 deletions(-) create mode 100644 test/main.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1bcdde95af..bea93d554e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -6,6 +6,8 @@ SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules") FIND_PACKAGE(CUDA QUIET) FIND_PACKAGE(OpenCL 
QUIET) +OPTION(BUILD_SINGLE_TEST_FILE "Build tests in a single file" OFF) + # If the tests are not being built at the same time as ArrayFire, # we need to first find the ArrayFire library IF(TARGET afcpu OR TARGET afcuda OR TARGET afopencl OR TARGET af) @@ -58,14 +60,36 @@ MACRO(CREATE_TESTS BACKEND AFLIBNAME GTEST_LIBS OTHER_LIBS) SET(TEST_FILES ${FILES}) ENDIF(${BACKEND} STREQUAL "unified") - FOREACH(FILE ${TEST_FILES}) + IF (${BUILD_SINGLE_TEST_FILE}) + SET(TEST_NAME test_${BACKEND}) + SET(TEST_NAME_BASIC test_basic_${BACKEND}) + ADD_EXECUTABLE(${TEST_NAME} ${CPP_FILES}) + ADD_EXECUTABLE(${TEST_NAME_BASIC} basic_c.c) + + TARGET_LINK_LIBRARIES(${TEST_NAME} PRIVATE ${AFLIBNAME} + PRIVATE ${THREAD_LIB_FLAG} + PRIVATE ${GTEST_LIBS} + PRIVATE ${OTHER_LIBS}) + + TARGET_LINK_LIBRARIES(${TEST_NAME_BASIC} PRIVATE ${AFLIBNAME} + PRIVATE ${THREAD_LIB_FLAG} + PRIVATE ${GTEST_LIBS} + PRIVATE ${OTHER_LIBS}) + + SET_TARGET_PROPERTIES(${TEST_NAME_BASIC} + PROPERTIES + COMPILE_FLAGS -DAF_${DEF_NAME} + FOLDER "Tests/${BACKEND}") + + ELSE() + FOREACH(FILE ${TEST_FILES}) GET_FILENAME_COMPONENT(FNAME ${FILE} NAME_WE) SET(TEST_NAME ${FNAME}_${BACKEND}) IF(NOT ${BUILD_NONFREE} AND "${FILE}" MATCHES ".nonfree.") - MESSAGE(STATUS "Removing ${FILE} from ctest") + MESSAGE(STATUS "Removing ${FILE} from ctest") ELSEIF("${FILE}" MATCHES ".manual.") - MESSAGE(STATUS "Removing ${FILE} from ctest") + MESSAGE(STATUS "Removing ${FILE} from ctest") ELSE() ADD_TEST(Test_${TEST_NAME} ${TEST_NAME}) ENDIF() @@ -73,15 +97,16 @@ MACRO(CREATE_TESTS BACKEND AFLIBNAME GTEST_LIBS OTHER_LIBS) FILE(GLOB TEST_FILE "${FNAME}.cpp" "${FNAME}.c") ADD_EXECUTABLE(${TEST_NAME} ${TEST_FILE}) TARGET_LINK_LIBRARIES(${TEST_NAME} PRIVATE ${AFLIBNAME} - PRIVATE ${THREAD_LIB_FLAG} - PRIVATE ${GTEST_LIBS} - PRIVATE ${OTHER_LIBS}) + PRIVATE ${THREAD_LIB_FLAG} + PRIVATE ${GTEST_LIBS} + PRIVATE ${OTHER_LIBS}) SET_TARGET_PROPERTIES(${TEST_NAME} - PROPERTIES - COMPILE_FLAGS -DAF_${DEF_NAME} - FOLDER "Tests/${BACKEND}") - 
ENDFOREACH() + PROPERTIES + COMPILE_FLAGS -DAF_${DEF_NAME} + FOLDER "Tests/${BACKEND}") + ENDFOREACH() + ENDIF() ENDMACRO(CREATE_TESTS) @@ -136,10 +161,11 @@ INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIRS}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) FILE(GLOB FILES "*.cpp" "*.c") +FILE(GLOB CPP_FILES "*.cpp") LIST(SORT FILES) # Tests execute in alphabetical order # We only build backend.cpp for Unified backend -SET(UNIFIED_FILES "backend.cpp") +SET(UNIFIED_FILES "backend.cpp;main.cpp") LIST(SORT UNIFIED_FILES) # Tests execute in alphabetical order # Next we build each example using every backend. diff --git a/test/basic_c.c b/test/basic_c.c index 0caca290ec..aac34e142d 100644 --- a/test/basic_c.c +++ b/test/basic_c.c @@ -9,7 +9,8 @@ #include -int main() { +int main() +{ af_array out = 0; dim_t s[] = {10, 10, 1, 1}; af_err e = af_randu(&out, 4, s, f32); diff --git a/test/fast.cpp b/test/fast.cpp index e7df638b80..8cb90574a6 100644 --- a/test/fast.cpp +++ b/test/fast.cpp @@ -28,7 +28,7 @@ typedef struct float f[5]; } feat_t; -bool feat_cmp(feat_t i, feat_t j) +static bool feat_cmp(feat_t i, feat_t j) { for (int k = 0; k < 5; k++) if (i.f[k] != j.f[k]) @@ -37,7 +37,7 @@ bool feat_cmp(feat_t i, feat_t j) return false; } -void array_to_feat(vector& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat) +static void array_to_feat(vector& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat) { feat.resize(nfeat); for (unsigned i = 0; i < feat.size(); i++) { diff --git a/test/gloh_nonfree.cpp b/test/gloh_nonfree.cpp index f50e4031aa..5794051152 100644 --- a/test/gloh_nonfree.cpp +++ b/test/gloh_nonfree.cpp @@ -39,7 +39,8 @@ typedef struct float d[272]; } desc_t; -bool feat_cmp(feat_desc_t i, feat_desc_t j) +#ifdef AF_BUILD_NONFREE_SIFT +static bool feat_cmp(feat_desc_t i, feat_desc_t j) { for (int k = 0; k < 5; k++) if (round(i.f[k]*1e1f) != round(j.f[k]*1e1f)) @@ -48,7 +49,7 @@ bool feat_cmp(feat_desc_t 
i, feat_desc_t j) return true; } -void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat) +static void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat) { feat.resize(nfeat); for (size_t i = 0; i < feat.size(); i++) { @@ -62,7 +63,7 @@ void array_to_feat_desc(vector& feat, float* x, float* y, float* sc } } -void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, vector >& desc, unsigned nfeat) +static void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, vector >& desc, unsigned nfeat) { feat.resize(nfeat); for (size_t i = 0; i < feat.size(); i++) { @@ -76,7 +77,7 @@ void array_to_feat_desc(vector& feat, float* x, float* y, float* sc } } -void array_to_feat(vector& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat) +static void array_to_feat(vector& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat) { feat.resize(nfeat); for (unsigned i = 0; i < feat.size(); i++) { @@ -88,7 +89,7 @@ void array_to_feat(vector& feat, float *x, float *y, float *score, float } } -void split_feat_desc(vector& fd, vector& f, vector& d) +static void split_feat_desc(vector& fd, vector& f, vector& d) { f.resize(fd.size()); d.resize(fd.size()); @@ -103,7 +104,7 @@ void split_feat_desc(vector& fd, vector& f, vector& } } -unsigned popcount(unsigned x) +static unsigned popcount(unsigned x) { x = x - ((x >> 1) & 0x55555555); x = (x & 0x33333333) + ((x >> 2) & 0x33333333); @@ -113,7 +114,7 @@ unsigned popcount(unsigned x) return x & 0x0000003F; } -bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f) +static bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f) { bool ret = true; float sum = 
0.0f; @@ -143,6 +144,7 @@ bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float return ret; } +#endif template class GLOH : public ::testing::Test diff --git a/test/harris.cpp b/test/harris.cpp index 604e73d41c..0adde6f95d 100644 --- a/test/harris.cpp +++ b/test/harris.cpp @@ -28,7 +28,7 @@ typedef struct float f[5]; } feat_t; -bool feat_cmp(feat_t i, feat_t j) +static bool feat_cmp(feat_t i, feat_t j) { for (int k = 0; k < 5; k++) if (i.f[k] != j.f[k]) @@ -37,7 +37,7 @@ bool feat_cmp(feat_t i, feat_t j) return false; } -void array_to_feat(vector& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat) +static void array_to_feat(vector& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat) { feat.resize(nfeat); for (unsigned i = 0; i < feat.size(); i++) { diff --git a/test/main.cpp b/test/main.cpp new file mode 100644 index 0000000000..76f841f1b1 --- /dev/null +++ b/test/main.cpp @@ -0,0 +1,6 @@ +#include + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/orb.cpp b/test/orb.cpp index b499fb3824..1266f20eb6 100644 --- a/test/orb.cpp +++ b/test/orb.cpp @@ -39,7 +39,7 @@ typedef struct unsigned d[8]; } desc_t; -bool feat_cmp(feat_desc_t i, feat_desc_t j) +static bool feat_cmp(feat_desc_t i, feat_desc_t j) { for (int k = 0; k < 5; k++) if (i.f[k] != j.f[k]) @@ -48,7 +48,7 @@ bool feat_cmp(feat_desc_t i, feat_desc_t j) return true; } -void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, unsigned* desc, unsigned nfeat) +static void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, unsigned* desc, unsigned nfeat) { feat.resize(nfeat); for (size_t i = 0; i < feat.size(); i++) { @@ -62,7 +62,7 @@ void array_to_feat_desc(vector& feat, float* x, float* y, float* sc } } -void array_to_feat_desc(vector& feat, float* x, float* 
y, float* score, float* ori, float* size, vector >& desc, unsigned nfeat) +static void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, vector >& desc, unsigned nfeat) { feat.resize(nfeat); for (size_t i = 0; i < feat.size(); i++) { @@ -76,7 +76,7 @@ void array_to_feat_desc(vector& feat, float* x, float* y, float* sc } } -void array_to_feat(vector& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat) +static void array_to_feat(vector& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat) { feat.resize(nfeat); for (unsigned i = 0; i < feat.size(); i++) { @@ -88,7 +88,7 @@ void array_to_feat(vector& feat, float *x, float *y, float *score, float } } -void split_feat_desc(vector& fd, vector& f, vector& d) +static void split_feat_desc(vector& fd, vector& f, vector& d) { f.resize(fd.size()); d.resize(fd.size()); @@ -103,7 +103,7 @@ void split_feat_desc(vector& fd, vector& f, vector& } } -unsigned popcount(unsigned x) +static unsigned popcount(unsigned x) { x = x - ((x >> 1) & 0x55555555); x = (x & 0x33333333) + ((x >> 2) & 0x33333333); diff --git a/test/reduce.cpp b/test/reduce.cpp index f71dc76b80..675ed8fc4a 100644 --- a/test/reduce.cpp +++ b/test/reduce.cpp @@ -109,16 +109,6 @@ void reduceTest(string pTestFile, int off = 0, bool isSubRef=false, const vector ASSERT_EQ(AF_SUCCESS, af_release_array(inArray)); } -vector init_subs() -{ - vector subs; - subs.push_back(af_make_seq(2, 6, 1)); - subs.push_back(af_make_seq(1, 5, 1)); - subs.push_back(af_make_seq(1, 3, 1)); - subs.push_back(af_make_seq(1, 2, 1)); - return subs; -} - template struct promote_type { typedef T type; diff --git a/test/rotate_linear.cpp b/test/rotate_linear.cpp index ce7a921260..15734a3cc2 100644 --- a/test/rotate_linear.cpp +++ b/test/rotate_linear.cpp @@ -25,7 +25,7 @@ using af::cfloat; using af::cdouble; template -class Rotate : public ::testing::Test +class RotateLinear : public ::testing::Test { 
public: virtual void SetUp() { @@ -40,7 +40,7 @@ class Rotate : public ::testing::Test typedef ::testing::Types TestTypes; // register the type list -TYPED_TEST_CASE(Rotate, TestTypes); +TYPED_TEST_CASE(RotateLinear, TestTypes); #define PI 3.1415926535897931f @@ -108,10 +108,10 @@ void rotateTest(string pTestFile, const unsigned resultIdx, const float angle, c if(tempArray != 0) af_release_array(tempArray); } -#define ROTATE_INIT(desc, file, resultIdx, angle, crop, recenter) \ - TYPED_TEST(Rotate, desc) \ - { \ - rotateTest(string(TEST_DIR"/rotate/"#file".test"), resultIdx, angle, crop, recenter);\ +#define ROTATE_INIT(desc, file, resultIdx, angle, crop, recenter) \ + TYPED_TEST(RotateLinear, desc) \ + { \ + rotateTest(string(TEST_DIR"/rotate/"#file".test"), resultIdx, angle, crop, recenter); \ } ROTATE_INIT(Square180NoCropRecenter , rotatelinear1, 0, 180, false, true); @@ -166,7 +166,7 @@ void rotateTest(string pTestFile, const unsigned resultIdx, const float angle, c ////////////////////////////////// CPP ////////////////////////////////////// -TEST(Rotate, CPP) +TEST(RotateLinear, CPP) { if (noDoubleTests()) return; diff --git a/test/scan.cpp b/test/scan.cpp index 386568d402..34a077f122 100644 --- a/test/scan.cpp +++ b/test/scan.cpp @@ -82,16 +82,6 @@ void scanTest(string pTestFile, int off = 0, bool isSubRef=false, const vector init_subs() -{ - vector subs; - subs.push_back(af_make_seq(2, 6, 1)); - subs.push_back(af_make_seq(1, 5, 1)); - subs.push_back(af_make_seq(1, 3, 1)); - subs.push_back(af_make_seq(1, 2, 1)); - return subs; -} - #define SCAN_TESTS(FN, TAG, Ti, To) \ TEST(Scan,Test_##FN##_##TAG) \ { \ diff --git a/test/sift_nonfree.cpp b/test/sift_nonfree.cpp index 2e069fd3d3..6776c18a86 100644 --- a/test/sift_nonfree.cpp +++ b/test/sift_nonfree.cpp @@ -38,8 +38,8 @@ typedef struct { float d[128]; } desc_t; - -bool feat_cmp(feat_desc_t i, feat_desc_t j) +#ifdef AF_BUILD_NONFREE_SIFT +static bool feat_cmp(feat_desc_t i, feat_desc_t j) { for (int k = 0; k < 
5; k++) if (round(i.f[k]*1e1f) != round(j.f[k]*1e1f)) @@ -48,7 +48,7 @@ bool feat_cmp(feat_desc_t i, feat_desc_t j) return true; } -void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat) +static void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, float* desc, unsigned nfeat) { feat.resize(nfeat); for (size_t i = 0; i < feat.size(); i++) { @@ -62,7 +62,7 @@ void array_to_feat_desc(vector& feat, float* x, float* y, float* sc } } -void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, vector >& desc, unsigned nfeat) +static void array_to_feat_desc(vector& feat, float* x, float* y, float* score, float* ori, float* size, vector >& desc, unsigned nfeat) { feat.resize(nfeat); for (size_t i = 0; i < feat.size(); i++) { @@ -76,7 +76,7 @@ void array_to_feat_desc(vector& feat, float* x, float* y, float* sc } } -void array_to_feat(vector& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat) +static void array_to_feat(vector& feat, float *x, float *y, float *score, float *ori, float *size, unsigned nfeat) { feat.resize(nfeat); for (unsigned i = 0; i < feat.size(); i++) { @@ -88,7 +88,7 @@ void array_to_feat(vector& feat, float *x, float *y, float *score, float } } -void split_feat_desc(vector& fd, vector& f, vector& d) +static void split_feat_desc(vector& fd, vector& f, vector& d) { f.resize(fd.size()); d.resize(fd.size()); @@ -103,7 +103,7 @@ void split_feat_desc(vector& fd, vector& f, vector& } } -unsigned popcount(unsigned x) +static unsigned popcount(unsigned x) { x = x - ((x >> 1) & 0x55555555); x = (x & 0x33333333) + ((x >> 2) & 0x33333333); @@ -113,7 +113,7 @@ unsigned popcount(unsigned x) return x & 0x0000003F; } -bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f) +static bool compareEuclidean(dim_t desc_len, dim_t ndesc, 
float *cpu, float *gpu, float unit_thr = 1.f, float euc_thr = 1.f) { bool ret = true; float sum = 0.0f; @@ -143,6 +143,7 @@ bool compareEuclidean(dim_t desc_len, dim_t ndesc, float *cpu, float *gpu, float return ret; } +#endif template class SIFT : public ::testing::Test diff --git a/test/sort_by_key.cpp b/test/sort_by_key.cpp index 289e407ad9..ed827c9da5 100644 --- a/test/sort_by_key.cpp +++ b/test/sort_by_key.cpp @@ -26,7 +26,7 @@ using af::cfloat; using af::cdouble; template -class Sort : public ::testing::Test +class SortByKey : public ::testing::Test { public: virtual void SetUp() { @@ -41,7 +41,7 @@ class Sort : public ::testing::Test typedef ::testing::Types TestTypes; // register the type list -TYPED_TEST_CASE(Sort, TestTypes); +TYPED_TEST_CASE(SortByKey, TestTypes); template void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const unsigned resultIdx1, bool isSubRef = false, const vector * seqv = NULL) @@ -104,10 +104,10 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const if(tempArray != 0) af_release_array(tempArray); } -#define SORT_INIT(desc, file, dir, resultIdx0, resultIdx1) \ - TYPED_TEST(Sort, desc) \ - { \ - sortTest(string(TEST_DIR"/sort/"#file".test"), dir, resultIdx0, resultIdx1); \ +#define SORT_INIT(desc, file, dir, resultIdx0, resultIdx1) \ + TYPED_TEST(SortByKey, desc) \ + { \ + sortTest(string(TEST_DIR"/sort/"#file".test"), dir, resultIdx0, resultIdx1); \ } SORT_INIT(Sort0True, sort_by_key_tiny, true, 0, 1); @@ -168,4 +168,3 @@ TEST(SortByKey, CPP) delete[] keyData; delete[] valData; } - diff --git a/test/sort_index.cpp b/test/sort_index.cpp index abe7910a58..6aa240d5a5 100644 --- a/test/sort_index.cpp +++ b/test/sort_index.cpp @@ -26,7 +26,7 @@ using af::cfloat; using af::cdouble; template -class Sort : public ::testing::Test +class SortIndex : public ::testing::Test { public: virtual void SetUp() { @@ -41,7 +41,7 @@ class Sort : public ::testing::Test typedef ::testing::Types 
TestTypes; // register the type list -TYPED_TEST_CASE(Sort, TestTypes); +TYPED_TEST_CASE(SortIndex, TestTypes); template void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const unsigned resultIdx1, bool isSubRef = false, const vector * seqv = NULL) @@ -102,10 +102,10 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const if(tempArray != 0) af_release_array(tempArray); } -#define SORT_INIT(desc, file, dir, resultIdx0, resultIdx1) \ - TYPED_TEST(Sort, desc) \ - { \ - sortTest(string(TEST_DIR"/sort/"#file".test"), dir, resultIdx0, resultIdx1); \ +#define SORT_INIT(desc, file, dir, resultIdx0, resultIdx1) \ + TYPED_TEST(SortIndex, desc) \ + { \ + sortTest(string(TEST_DIR"/sort/"#file".test"), dir, resultIdx0, resultIdx1); \ } SORT_INIT(Sort0True, sort, true, 0, 1); diff --git a/test/susan.cpp b/test/susan.cpp index 591c2f01e5..259a319ce7 100644 --- a/test/susan.cpp +++ b/test/susan.cpp @@ -28,7 +28,7 @@ typedef struct float f[5]; } feat_t; -bool feat_cmp(feat_t i, feat_t j) +static bool feat_cmp(feat_t i, feat_t j) { for (int k = 0; k < 5; k++) if (i.f[k] != j.f[k]) @@ -37,7 +37,7 @@ bool feat_cmp(feat_t i, feat_t j) return false; } -void array_to_feat(vector& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat) +static void array_to_feat(vector& feat, float *x, float *y, float *score, float *orientation, float *size, unsigned nfeat) { feat.resize(nfeat); for (unsigned i = 0; i < feat.size(); i++) { diff --git a/test/svd_dense.cpp b/test/svd_dense.cpp index 9d4060bd7f..7ce31e2ee5 100644 --- a/test/svd_dense.cpp +++ b/test/svd_dense.cpp @@ -35,12 +35,12 @@ typedef ::testing::Types TestTypes; TYPED_TEST_CASE(svd, TestTypes); template -double get_val(T val) +inline double get_val(T val) { return val; } -template<> double get_val(cfloat val) +template<> inline double get_val(cfloat val) { return abs(val); } diff --git a/test/testHelpers.hpp b/test/testHelpers.hpp index 
2744a8d67e..83f2552e08 100644 --- a/test/testHelpers.hpp +++ b/test/testHelpers.hpp @@ -6,6 +6,8 @@ * The complete license agreement can be obtained at: * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-function" #include #include @@ -127,11 +129,11 @@ void readTestsFromFile(const std::string &FileName, std::vector &input } } -void readImageTests(const std::string &pFileName, - std::vector &pInputDims, - std::vector &pTestInputs, - std::vector &pTestOutSizes, - std::vector &pTestOutputs) +inline void readImageTests(const std::string &pFileName, + std::vector &pInputDims, + std::vector &pTestInputs, + std::vector &pTestOutSizes, + std::vector &pTestOutputs) { using std::vector; @@ -364,18 +366,18 @@ struct cond_type { }; template -double real(T val) { return (double)val; } +inline double real(T val) { return (double)val; } template<> -double real(af::cdouble val) { return real(val); } +inline double real(af::cdouble val) { return real(val); } template<> -double real (af::cfloat val) { return real(val); } +inline double real (af::cfloat val) { return real(val); } template -double imag(T val) { return (double)val; } +inline double imag(T val) { return (double)val; } template<> -double imag(af::cdouble val) { return imag(val); } +inline double imag(af::cdouble val) { return imag(val); } template<> -double imag (af::cfloat val) { return imag(val); } +inline double imag (af::cfloat val) { return imag(val); } template bool noDoubleTests() @@ -388,14 +390,14 @@ bool noDoubleTests() return ((isTypeDouble && !isDoubleSupported) ? true : false); } -bool noImageIOTests() +inline bool noImageIOTests() { bool ret = !af::isImageIOAvailable(); if(ret) printf("Image IO Not Configured. Test will exit\n"); return ret; } -bool noLAPACKTests() +inline bool noLAPACKTests() { bool ret = !af::isLAPACKAvailable(); if(ret) printf("LAPACK Not Configured. 
Test will exit\n"); @@ -450,3 +452,5 @@ af::array cpu_randu(const af::dim4 dims) return af::array(dims, (T *)&out[0]); } + +#pragma GCC diagnostic pop diff --git a/test/where.cpp b/test/where.cpp index eb21e0d6dc..37208f2ee2 100644 --- a/test/where.cpp +++ b/test/where.cpp @@ -78,17 +78,6 @@ void whereTest(string pTestFile, bool isSubRef=false, const vector seqv= if(tempArray != 0) af_release_array(tempArray); } -vector init_subs() -{ - vector subs; - subs.push_back(af_make_seq(2, 6, 1)); - subs.push_back(af_make_seq(1, 5, 1)); - subs.push_back(af_make_seq(1, 3, 1)); - subs.push_back(af_make_seq(1, 2, 1)); - return subs; -} - - #define WHERE_TESTS(T) \ TEST(Where,Test_##T) \ { \ diff --git a/test/wrap.cpp b/test/wrap.cpp index 7552400db9..091c5341c1 100644 --- a/test/wrap.cpp +++ b/test/wrap.cpp @@ -42,27 +42,27 @@ typedef ::testing::Types -double get_val(T val) +inline double get_val(T val) { return val; } -template<> double get_val(cfloat val) +template<> inline double get_val(cfloat val) { return abs(val); } -template<> double get_val(cdouble val) +template<> inline double get_val(cdouble val) { return abs(val); } -template<> double get_val(unsigned char val) +template<> inline double get_val(unsigned char val) { return ((int)(val)) % 256; } -template<> double get_val(char val) +template<> inline double get_val(char val) { return (val != 0); } From 3fc6939afe3c77c1a656367b14494cd41de8abad Mon Sep 17 00:00:00 2001 From: Mani Chandra Date: Sat, 30 Jan 2016 00:12:47 -0800 Subject: [PATCH 226/288] Fixes issues when compiling with icc --- src/api/c/assign.cpp | 1 + src/api/c/moddims.cpp | 13 +++++++++++++ src/backend/cuda/Array.cpp | 1 + src/backend/opencl/Array.cpp | 1 + 4 files changed, 16 insertions(+) diff --git a/src/api/c/assign.cpp b/src/api/c/assign.cpp index bf2c185a10..8ff37630e8 100644 --- a/src/api/c/assign.cpp +++ b/src/api/c/assign.cpp @@ -29,6 +29,7 @@ using std::swap; template Array modDims(const Array& in, const af::dim4 &newDims); + template static 
void assign(Array &out, const unsigned &ndims, const af_seq *index, const Array &in_) diff --git a/src/api/c/moddims.cpp b/src/api/c/moddims.cpp index 4b7a179a95..132086a6ef 100644 --- a/src/api/c/moddims.cpp +++ b/src/api/c/moddims.cpp @@ -36,6 +36,19 @@ Array modDims(const Array& in, const af::dim4 &newDims) return Out; } +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); +template Array modDims(const Array &in, const af::dim4 &newDims); + af_err af_moddims(af_array *out, const af_array in, const unsigned ndims, const dim_t * const dims) { diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp index 1ca6012211..6e95dd1102 100644 --- a/src/backend/cuda/Array.cpp +++ b/src/backend/cuda/Array.cpp @@ -277,6 +277,7 @@ namespace cuda template Array::Array(af::dim4 dims, const T * const in_data, \ bool is_device, bool copy_device); \ template Array::~Array (); \ + template Node_ptr Array::getNode() const; \ template void Array::eval(); \ template void Array::eval() const; \ template void writeHostDataArray (Array &arr, const T * const data, const size_t bytes); \ diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp index 7b6a26eb4d..c470a351f3 100644 --- a/src/backend/opencl/Array.cpp +++ b/src/backend/opencl/Array.cpp @@ -320,6 +320,7 @@ namespace opencl template 
Array createNodeArray (const dim4 &size, JIT::Node_ptr node); \ template Array::Array(af::dim4 dims, cl_mem mem, size_t src_offset, bool copy); \ template Array::~Array (); \ + template Node_ptr Array::getNode() const; \ template void Array::eval(); \ template void Array::eval() const; \ template void writeHostDataArray (Array &arr, const T * const data, const size_t bytes); \ From ee7fa33d84b5f25bf53ce65644380f6d5b01b2c4 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 1 Feb 2016 16:44:41 -0500 Subject: [PATCH 227/288] Removing unnecessary option "BUILD_GTEST" --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2cfeb18fed..8bdf93cd52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,6 @@ INCLUDE(AFInstallDirs) OPTION(BUILD_TEST "Build Tests" ON) OPTION(BUILD_EXAMPLES "Build Examples" ON) -OPTION(BUILD_GTEST "Download gtest and check for updates. Necessary if you change compilers" ON) OPTION(BUILD_CPU "Build ArrayFire with a CPU backend" ON) From 653416db6c8d6406a72a0f5e698232382cee4b3b Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 1 Feb 2016 18:31:23 -0500 Subject: [PATCH 228/288] Updating release notes for 3.3 pre-release --- docs/pages/release_notes.md | 70 +++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/docs/pages/release_notes.md b/docs/pages/release_notes.md index 4f13cc7434..1063b054e3 100644 --- a/docs/pages/release_notes.md +++ b/docs/pages/release_notes.md @@ -1,6 +1,76 @@ Release Notes {#releasenotes} ============== +v3.3.0 +============== + +Major Updates +------------- + +* CPU backend supports asynchronous execution. +* Performance improvements to OpenCL BLAS and FFT functions. +* Improved performance of memory manager. +* Improvements to visualization functions. +* Improved sorted order for OpenCL devices. +* Integration with external OpenCL projects. 
+ +Features +---------- + +* \ref af::getActiveBackend(): Returns the current backend being used. +* [Scatter plot](https://github.com/arrayfire/arrayfire/pull/1116) added to graphics. +* \ref af::transform() now supports perspective transformation matrices. +* \ref af::infoString(): Returns `af::info()` as a string. +* \ref af::allocHost(): Allocates memory on host. +* \ref af::freeHost(): Frees host side memory allocated by ArrayFire. +* Functions specific to OpenCL backend. + * \ref afcl::addDevice(): Adds an external device and context to ArrayFire's device manager. + * \ref afcl::deleteDevice(): Removes an external device and context from ArrayFire's device manager. + * \ref afcl::setDevice(): Sets an external device and context from ArrayFire's device manager. + * \ref afcl::getDeviceType(): Gets the device type of the current device. + * \ref afcl::getPlatform(): Gets the platform of the current device. + +Bug Fixes +-------------- + +* Fixed [errors when using 3D / 4D arrays](https://github.com/arrayfire/arrayfire/pull/1251) in select and replace. +* Fixed [JIT errors on AMD devices](https://github.com/arrayfire/arrayfire/pull/1238) for OpenCL backend. +* Fixed [imageio bugs](https://github.com/arrayfire/arrayfire/pull/1229) for 16 bit images. +* Fixed [bugs when loading and storing images](https://github.com/arrayfire/arrayfire/pull/1228) natively. +* Fixed [bug in FFT for NVIDIA GPUs](https://github.com/arrayfire/arrayfire/issues/615) when using OpenCL backend. +Improvements +-------------- + +* Optionally [offload BLAS and LAPACK](https://github.com/arrayfire/arrayfire/pull/1221) functions to CPU implementations to improve performance. +* Performance improvements to the memory manager. +* Error messages are now more detailed. +* Improved sorted order for OpenCL devices. + +Examples +---------- + +* New visualization [example simulating gravity](\ref graphics/gravity_sim.cpp). 
+ +Build +---------- + +* Support for Intel `icc` compiler +* Support to compile with Intel MKL as a BLAS and LAPACK provider + +Deprecations +----------- + +* `af_lock_device_ptr` is now deprecated and will be removed in v4.0.0. Use \ref af_lock_array() instead. +* `af_unlock_device_ptr` is now deprecated and will be removed in v4.0.0. Use \ref af_unlock_array() instead. + +Documentation +-------------- + +* Fixes to documentation for matchTemplate. +* Improved documentation for deviceInfo. + + v3.2.2 ============== From fc7553df8377ee6c4b7f25878aaf6bf0727bcf6b Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 2 Feb 2016 14:31:34 -0500 Subject: [PATCH 229/288] BUGFIX: max_bytes were being set incorrectly in MemoryManager --- src/backend/MemoryManager.cpp | 21 +++++++++++++++------ src/backend/MemoryManager.hpp | 2 ++ src/backend/cpu/memory.cpp | 4 +++- src/backend/cuda/memory.cpp | 8 ++++++-- src/backend/opencl/memory.cpp | 8 ++++++-- 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp index 814262829e..d82436e177 100644 --- a/src/backend/MemoryManager.cpp +++ b/src/backend/MemoryManager.cpp @@ -19,6 +19,7 @@ namespace common { +const size_t ONE_GB = 1 << 30; MemoryManager::MemoryManager(int num_devices, unsigned MAX_BUFFERS, bool debug): mem_step_size(1024), max_buffers(MAX_BUFFERS), @@ -32,17 +33,25 @@ MemoryManager::MemoryManager(int num_devices, unsigned MAX_BUFFERS, bool debug): } if (this->debug_mode) mem_step_size = 1; - static const size_t oneGB = 1 << 30; for (int n = 0; n < num_devices; n++) { - size_t memsize = getMaxMemorySize(n); + // Calling getMaxMemorySize() here calls the virtual function that returns 0 + // Call it from outside the constructor. 
+ memory[n].max_bytes = ONE_GB; + memory[n].total_bytes = 0; + memory[n].lock_bytes = 0; + memory[n].lock_buffers = 0; + } +} + +void MemoryManager::setMaxMemorySize() +{ + for (unsigned n = 0; n < memory.size(); n++) { // Calls garbage collection when: // total_bytes > memsize * 0.75 when memsize < 4GB // total_bytes > memsize - 1 GB when memsize >= 4GB // If memsize returned 0, then use 1GB - memory[n].max_bytes = memsize == 0 ? oneGB : std::max(memsize * 0.75, (double)(memsize - oneGB)); - memory[n].total_bytes = 0; - memory[n].lock_bytes = 0; - memory[n].lock_buffers = 0; + size_t memsize = this->getMaxMemorySize(n); + memory[n].max_bytes = memsize == 0 ? ONE_GB : std::max(memsize * 0.75, (double)(memsize - ONE_GB)); } } diff --git a/src/backend/MemoryManager.hpp b/src/backend/MemoryManager.hpp index 8bb9941b87..faae7fa609 100644 --- a/src/backend/MemoryManager.hpp +++ b/src/backend/MemoryManager.hpp @@ -63,6 +63,8 @@ class MemoryManager public: MemoryManager(int num_devices, unsigned MAX_BUFFERS, bool debug); + void setMaxMemorySize(); + void *alloc(const size_t bytes); void unlock(void *ptr, bool user_unlock); diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index cf7e1ba48b..8a89cb19dc 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -56,7 +56,9 @@ size_t MemoryManager::getMaxMemorySize(int id) MemoryManager::MemoryManager() : common::MemoryManager(getDeviceCount(), MAX_BUFFERS, AF_MEM_DEBUG || AF_CPU_MEM_DEBUG) -{} +{ + this->setMaxMemorySize(); +} void *MemoryManager::nativeAlloc(const size_t bytes) diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index f5dc6ca048..6a947c634c 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -91,7 +91,9 @@ size_t MemoryManager::getMaxMemorySize(int id) MemoryManager::MemoryManager() : common::MemoryManager(getDeviceCount(), MAX_BUFFERS, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) -{} +{ + this->setMaxMemorySize(); +} void 
*MemoryManager::nativeAlloc(const size_t bytes) { @@ -126,7 +128,9 @@ size_t MemoryManagerPinned::getMaxMemorySize(int id) MemoryManagerPinned::MemoryManagerPinned() : common::MemoryManager(1, MAX_BUFFERS, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) -{} +{ + this->setMaxMemorySize(); +} void *MemoryManagerPinned::nativeAlloc(const size_t bytes) { diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index 7054e96479..01c93bb318 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -89,7 +89,9 @@ size_t MemoryManager::getMaxMemorySize(int id) MemoryManager::MemoryManager() : common::MemoryManager(getDeviceCount(), MAX_BUFFERS, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG) -{} +{ + this->setMaxMemorySize(); +} void *MemoryManager::nativeAlloc(const size_t bytes) { @@ -128,7 +130,9 @@ size_t MemoryManagerPinned::getMaxMemorySize(int id) MemoryManagerPinned::MemoryManagerPinned() : common::MemoryManager(getDeviceCount(), MAX_BUFFERS, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG), pinned_maps(getDeviceCount()) -{} +{ + this->setMaxMemorySize(); +} void *MemoryManagerPinned::nativeAlloc(const size_t bytes) { From 5183a357535d10fef63862f74608bc5a5f82eadb Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 2 Feb 2016 19:11:02 -0500 Subject: [PATCH 230/288] Cleaning up internal API for memory functions - Also Split up device.cpp and memory.cpp --- include/af/device.h | 3 + src/api/c/device.cpp | 243 ------------------------------- src/api/c/memory.cpp | 263 ++++++++++++++++++++++++++++++++++ src/backend/MemoryManager.cpp | 9 +- src/backend/MemoryManager.hpp | 7 +- src/backend/cpu/memory.cpp | 30 ++-- src/backend/cpu/memory.hpp | 10 +- src/backend/cuda/memory.cpp | 38 +++-- src/backend/cuda/memory.hpp | 11 +- src/backend/opencl/memory.cpp | 42 +++--- src/backend/opencl/memory.hpp | 13 +- 11 files changed, 353 insertions(+), 316 deletions(-) create mode 100644 src/api/c/memory.cpp diff --git a/include/af/device.h b/include/af/device.h index 
c0d787ea80..b08bd519b3 100644 --- a/include/af/device.h +++ b/include/af/device.h @@ -200,6 +200,8 @@ namespace af // manager /// \param[out] lock_bytes The number of bytes in use /// \param[out] lock_buffers The number of buffers in use + /// + /// \note This function performs a synchronization operation AFAPI void deviceMemInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); @@ -213,6 +215,7 @@ namespace af // /// \ingroup device_func_mem /// + /// \note This function performs a synchronization operation AFAPI void printMemInfo(const char *msg = NULL, const int device_id = -1); #endif diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index e3ec476b93..304d0c753b 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -15,7 +15,6 @@ #include #include #include -#include #include "err_common.hpp" #include @@ -156,245 +155,3 @@ af_err af_sync(const int device) } CATCHALL; return AF_SUCCESS; } - -af_err af_device_array(af_array *arr, const void *data, - const unsigned ndims, - const dim_t * const dims, - const af_dtype type) -{ - try { - AF_CHECK(af_init()); - - af_array res; - - DIM_ASSERT(1, ndims >= 1); - dim4 d(1, 1, 1, 1); - for(unsigned i = 0; i < ndims; i++) { - d[i] = dims[i]; - DIM_ASSERT(3, dims[i] >= 1); - } - - switch (type) { - case f32: res = getHandle(createDeviceDataArray(d, data)); break; - case f64: res = getHandle(createDeviceDataArray(d, data)); break; - case c32: res = getHandle(createDeviceDataArray(d, data)); break; - case c64: res = getHandle(createDeviceDataArray(d, data)); break; - case s32: res = getHandle(createDeviceDataArray(d, data)); break; - case u32: res = getHandle(createDeviceDataArray(d, data)); break; - case s64: res = getHandle(createDeviceDataArray(d, data)); break; - case u64: res = getHandle(createDeviceDataArray(d, data)); break; - case s16: res = getHandle(createDeviceDataArray(d, data)); break; - case u16: res = getHandle(createDeviceDataArray(d, data)); break; - case u8 
: res = getHandle(createDeviceDataArray(d, data)); break; - case b8 : res = getHandle(createDeviceDataArray(d, data)); break; - default: TYPE_ERROR(4, type); - } - - std::swap(*arr, res); - } CATCHALL; - - return AF_SUCCESS; -} - -af_err af_get_device_ptr(void **data, const af_array arr) -{ - try { - af_dtype type = getInfo(arr).getType(); - - switch (type) { - //FIXME: Perform copy if memory not continuous - case f32: *data = getDevicePtr(getArray(arr)); break; - case f64: *data = getDevicePtr(getArray(arr)); break; - case c32: *data = getDevicePtr(getArray(arr)); break; - case c64: *data = getDevicePtr(getArray(arr)); break; - case s32: *data = getDevicePtr(getArray(arr)); break; - case u32: *data = getDevicePtr(getArray(arr)); break; - case s64: *data = getDevicePtr(getArray(arr)); break; - case u64: *data = getDevicePtr(getArray(arr)); break; - case s16: *data = getDevicePtr(getArray(arr)); break; - case u16: *data = getDevicePtr(getArray(arr)); break; - case u8 : *data = getDevicePtr(getArray(arr)); break; - case b8 : *data = getDevicePtr(getArray(arr)); break; - - default: TYPE_ERROR(4, type); - } - - } CATCHALL; - - return AF_SUCCESS; -} - -template -inline void lockArray(const af_array arr) -{ - memLock((const T *)getArray(arr).get()); -} - -af_err af_lock_device_ptr(const af_array arr) -{ - return af_lock_array(arr); -} - -af_err af_lock_array(const af_array arr) -{ - try { - af_dtype type = getInfo(arr).getType(); - - switch (type) { - case f32: lockArray(arr); break; - case f64: lockArray(arr); break; - case c32: lockArray(arr); break; - case c64: lockArray(arr); break; - case s32: lockArray(arr); break; - case u32: lockArray(arr); break; - case s64: lockArray(arr); break; - case u64: lockArray(arr); break; - case s16: lockArray(arr); break; - case u16: lockArray(arr); break; - case u8 : lockArray(arr); break; - case b8 : lockArray(arr); break; - default: TYPE_ERROR(4, type); - } - - } CATCHALL; - - return AF_SUCCESS; -} - -template -inline void 
unlockArray(const af_array arr) -{ - memUnlock((const T *)getArray(arr).get()); -} - -af_err af_unlock_device_ptr(const af_array arr) -{ - return af_unlock_array(arr); -} - -af_err af_unlock_array(const af_array arr) -{ - try { - af_dtype type = getInfo(arr).getType(); - - switch (type) { - case f32: unlockArray(arr); break; - case f64: unlockArray(arr); break; - case c32: unlockArray(arr); break; - case c64: unlockArray(arr); break; - case s32: unlockArray(arr); break; - case u32: unlockArray(arr); break; - case s64: unlockArray(arr); break; - case u64: unlockArray(arr); break; - case s16: unlockArray(arr); break; - case u16: unlockArray(arr); break; - case u8 : unlockArray(arr); break; - case b8 : unlockArray(arr); break; - default: TYPE_ERROR(4, type); - } - - } CATCHALL; - - return AF_SUCCESS; -} - - -af_err af_alloc_device(void **ptr, const dim_t bytes) -{ - try { - AF_CHECK(af_init()); - *ptr = (void *)memAlloc(bytes); - memLock((const char *)*ptr); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_alloc_pinned(void **ptr, const dim_t bytes) -{ - try { - AF_CHECK(af_init()); - *ptr = (void *)pinnedAlloc(bytes); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_free_device(void *ptr) -{ - try { - memFreeLocked((char *)ptr, true); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_free_pinned(void *ptr) -{ - try { - pinnedFree((char *)ptr); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_alloc_host(void **ptr, const dim_t bytes) -{ - try { - *ptr = malloc(bytes); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_free_host(void *ptr) -{ - try { - free(ptr); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_print_mem_info(const char *msg, const int device_id) -{ - try { - int device = device_id; - if(device == -1) { - device = getActiveDeviceId(); - } - - if(msg != NULL) ARG_ASSERT(0, strlen(msg) < 256); // 256 character limit on msg - ARG_ASSERT(1, device >= 0 && device < getDeviceCount()); - - printMemInfo(msg ? 
msg : "", device); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_device_gc() -{ - try { - garbageCollect(); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers, - size_t *lock_bytes, size_t *lock_buffers) -{ - try { - deviceMemoryInfo(alloc_bytes, alloc_buffers, lock_bytes, lock_buffers); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_set_mem_step_size(const size_t step_bytes) -{ - try{ - detail::setMemStepSize(step_bytes); - } CATCHALL; - return AF_SUCCESS; -} - -af_err af_get_mem_step_size(size_t *step_bytes) -{ - try { - *step_bytes = detail::getMemStepSize(); - } CATCHALL; - return AF_SUCCESS; -} diff --git a/src/api/c/memory.cpp b/src/api/c/memory.cpp new file mode 100644 index 0000000000..098665ba03 --- /dev/null +++ b/src/api/c/memory.cpp @@ -0,0 +1,263 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "err_common.hpp" +#include + +using namespace detail; + +af_err af_device_array(af_array *arr, const void *data, + const unsigned ndims, + const dim_t * const dims, + const af_dtype type) +{ + try { + AF_CHECK(af_init()); + + af_array res; + + DIM_ASSERT(1, ndims >= 1); + dim4 d(1, 1, 1, 1); + for(unsigned i = 0; i < ndims; i++) { + d[i] = dims[i]; + DIM_ASSERT(3, dims[i] >= 1); + } + + switch (type) { + case f32: res = getHandle(createDeviceDataArray(d, data)); break; + case f64: res = getHandle(createDeviceDataArray(d, data)); break; + case c32: res = getHandle(createDeviceDataArray(d, data)); break; + case c64: res = getHandle(createDeviceDataArray(d, data)); break; + case s32: res = getHandle(createDeviceDataArray(d, data)); break; + case u32: res = getHandle(createDeviceDataArray(d, data)); break; + case s64: res = getHandle(createDeviceDataArray(d, data)); break; + case u64: res = getHandle(createDeviceDataArray(d, data)); break; + case s16: res = getHandle(createDeviceDataArray(d, data)); break; + case u16: res = getHandle(createDeviceDataArray(d, data)); break; + case u8 : res = getHandle(createDeviceDataArray(d, data)); break; + case b8 : res = getHandle(createDeviceDataArray(d, data)); break; + default: TYPE_ERROR(4, type); + } + + std::swap(*arr, res); + } CATCHALL; + + return AF_SUCCESS; +} + +af_err af_get_device_ptr(void **data, const af_array arr) +{ + try { + af_dtype type = getInfo(arr).getType(); + + switch (type) { + //FIXME: Perform copy if memory not continuous + case f32: *data = getDevicePtr(getArray(arr)); break; + case f64: *data = getDevicePtr(getArray(arr)); break; + case c32: *data = getDevicePtr(getArray(arr)); break; + case c64: *data = getDevicePtr(getArray(arr)); break; + 
case s32: *data = getDevicePtr(getArray(arr)); break; + case u32: *data = getDevicePtr(getArray(arr)); break; + case s64: *data = getDevicePtr(getArray(arr)); break; + case u64: *data = getDevicePtr(getArray(arr)); break; + case s16: *data = getDevicePtr(getArray(arr)); break; + case u16: *data = getDevicePtr(getArray(arr)); break; + case u8 : *data = getDevicePtr(getArray(arr)); break; + case b8 : *data = getDevicePtr(getArray(arr)); break; + + default: TYPE_ERROR(4, type); + } + + } CATCHALL; + + return AF_SUCCESS; +} + +template +inline void lockArray(const af_array arr) +{ + memLock((void *)getArray(arr).get()); +} + +af_err af_lock_device_ptr(const af_array arr) +{ + return af_lock_array(arr); +} + +af_err af_lock_array(const af_array arr) +{ + try { + af_dtype type = getInfo(arr).getType(); + + switch (type) { + case f32: lockArray(arr); break; + case f64: lockArray(arr); break; + case c32: lockArray(arr); break; + case c64: lockArray(arr); break; + case s32: lockArray(arr); break; + case u32: lockArray(arr); break; + case s64: lockArray(arr); break; + case u64: lockArray(arr); break; + case s16: lockArray(arr); break; + case u16: lockArray(arr); break; + case u8 : lockArray(arr); break; + case b8 : lockArray(arr); break; + default: TYPE_ERROR(4, type); + } + + } CATCHALL; + + return AF_SUCCESS; +} + +template +inline void unlockArray(const af_array arr) +{ + memUnlock((void *)getArray(arr).get()); +} + +af_err af_unlock_device_ptr(const af_array arr) +{ + return af_unlock_array(arr); +} + +af_err af_unlock_array(const af_array arr) +{ + try { + af_dtype type = getInfo(arr).getType(); + + switch (type) { + case f32: unlockArray(arr); break; + case f64: unlockArray(arr); break; + case c32: unlockArray(arr); break; + case c64: unlockArray(arr); break; + case s32: unlockArray(arr); break; + case u32: unlockArray(arr); break; + case s64: unlockArray(arr); break; + case u64: unlockArray(arr); break; + case s16: unlockArray(arr); break; + case u16: 
unlockArray(arr); break; + case u8 : unlockArray(arr); break; + case b8 : unlockArray(arr); break; + default: TYPE_ERROR(4, type); + } + + } CATCHALL; + + return AF_SUCCESS; +} + + +af_err af_alloc_device(void **ptr, const dim_t bytes) +{ + try { + AF_CHECK(af_init()); + *ptr = memAllocUser(bytes); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_alloc_pinned(void **ptr, const dim_t bytes) +{ + try { + AF_CHECK(af_init()); + *ptr = (void *)pinnedAlloc(bytes); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_free_device(void *ptr) +{ + try { + memFreeUser(ptr); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_free_pinned(void *ptr) +{ + try { + pinnedFree((char *)ptr); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_alloc_host(void **ptr, const dim_t bytes) +{ + try { + *ptr = malloc(bytes); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_free_host(void *ptr) +{ + try { + free(ptr); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_print_mem_info(const char *msg, const int device_id) +{ + try { + int device = device_id; + if(device == -1) { + device = getActiveDeviceId(); + } + + if(msg != NULL) ARG_ASSERT(0, strlen(msg) < 256); // 256 character limit on msg + ARG_ASSERT(1, device >= 0 && device < getDeviceCount()); + + printMemInfo(msg ? 
msg : "", device); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_device_gc() +{ + try { + garbageCollect(); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_device_mem_info(size_t *alloc_bytes, size_t *alloc_buffers, + size_t *lock_bytes, size_t *lock_buffers) +{ + try { + deviceMemoryInfo(alloc_bytes, alloc_buffers, lock_bytes, lock_buffers); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_set_mem_step_size(const size_t step_bytes) +{ + try{ + detail::setMemStepSize(step_bytes); + } CATCHALL; + return AF_SUCCESS; +} + +af_err af_get_mem_step_size(size_t *step_bytes) +{ + try { + *step_bytes = detail::getMemStepSize(); + } CATCHALL; + return AF_SUCCESS; +} diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp index d82436e177..5773c19de7 100644 --- a/src/backend/MemoryManager.cpp +++ b/src/backend/MemoryManager.cpp @@ -19,7 +19,6 @@ namespace common { -const size_t ONE_GB = 1 << 30; MemoryManager::MemoryManager(int num_devices, unsigned MAX_BUFFERS, bool debug): mem_step_size(1024), max_buffers(MAX_BUFFERS), @@ -115,7 +114,7 @@ void MemoryManager::unlock(void *ptr, bool user_unlock) } } -void *MemoryManager::alloc(const size_t bytes) +void *MemoryManager::alloc(const size_t bytes, bool user_lock) { lock_guard_t lock(this->memory_mutex); @@ -277,4 +276,10 @@ void MemoryManager::bufferInfo(size_t *alloc_bytes, size_t *alloc_buffers, if (lock_bytes ) *lock_bytes = current.lock_bytes; if (lock_buffers ) *lock_buffers = current.lock_buffers; } + +unsigned MemoryManager::getMaxBuffers() +{ + return this->max_buffers; +} + } diff --git a/src/backend/MemoryManager.hpp b/src/backend/MemoryManager.hpp index faae7fa609..a010f30064 100644 --- a/src/backend/MemoryManager.hpp +++ b/src/backend/MemoryManager.hpp @@ -19,6 +19,9 @@ namespace common typedef std::recursive_mutex mutex_t; typedef std::lock_guard lock_guard_t; +const unsigned MAX_BUFFERS = 1000; +const size_t ONE_GB = 1 << 30; + class MemoryManager { typedef struct @@ -65,7 +68,7 @@ class 
MemoryManager void setMaxMemorySize(); - void *alloc(const size_t bytes); + void *alloc(const size_t bytes, bool user_lock); void unlock(void *ptr, bool user_unlock); @@ -84,6 +87,8 @@ class MemoryManager size_t getMaxBytes(); + unsigned getMaxBuffers(); + void setMemStepSize(size_t new_step_size); virtual void *nativeAlloc(const size_t bytes) diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 8a89cb19dc..016428a6d9 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -55,7 +55,7 @@ size_t MemoryManager::getMaxMemorySize(int id) } MemoryManager::MemoryManager() : - common::MemoryManager(getDeviceCount(), MAX_BUFFERS, AF_MEM_DEBUG || AF_CPU_MEM_DEBUG) + common::MemoryManager(getDeviceCount(), common::MAX_BUFFERS, AF_MEM_DEBUG || AF_CPU_MEM_DEBUG) { this->setMaxMemorySize(); } @@ -94,6 +94,11 @@ size_t getMaxBytes() return getMemoryManager().getMaxBytes(); } +unsigned getMaxBuffers() +{ + return getMemoryManager().getMaxBuffers(); +} + void garbageCollect() { getMemoryManager().garbageCollect(); @@ -107,34 +112,34 @@ void printMemInfo(const char *msg, const int device) template T* memAlloc(const size_t &elements) { - return (T *)getMemoryManager().alloc(elements * sizeof(T)); + return (T *)getMemoryManager().alloc(elements * sizeof(T), false); } +void* memAllocUser(const size_t &bytes) +{ + return getMemoryManager().alloc(bytes, true); +} template void memFree(T *ptr) { return getMemoryManager().unlock((void *)ptr, false); } -template -void memFreeLocked(T *ptr, bool user_unlock) +void memFreeUser(void *ptr) { - getMemoryManager().unlock((void *)ptr, user_unlock); + getMemoryManager().unlock((void *)ptr, true); } -template -void memLock(const T *ptr) +void memLock(const void *ptr) { getMemoryManager().userLock((void *)ptr); } -template -void memUnlock(const T *ptr) +void memUnlock(const void *ptr) { getMemoryManager().userUnlock((void *)ptr); } - void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t 
*lock_bytes, size_t *lock_buffers) { @@ -146,7 +151,7 @@ void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, template T* pinnedAlloc(const size_t &elements) { - return (T *)getMemoryManager().alloc(elements * sizeof(T)); + return (T *)getMemoryManager().alloc(elements * sizeof(T), false); } template @@ -158,9 +163,6 @@ void pinnedFree(T* ptr) #define INSTANTIATE(T) \ template T* memAlloc(const size_t &elements); \ template void memFree(T* ptr); \ - template void memFreeLocked(T* ptr, bool user_unlock); \ - template void memLock(const T* ptr); \ - template void memUnlock(const T* ptr); \ template T* pinnedAlloc(const size_t &elements); \ template void pinnedFree(T* ptr); \ diff --git a/src/backend/cpu/memory.hpp b/src/backend/cpu/memory.hpp index 8f61f11f7b..80ee86ddc8 100644 --- a/src/backend/cpu/memory.hpp +++ b/src/backend/cpu/memory.hpp @@ -13,22 +13,22 @@ namespace cpu { template T* memAlloc(const size_t &elements); + void *memAllocUser(const size_t &bytes); // Need these as 2 separate function and not a default argument // This is because it is used as the deleter in shared pointer // which cannot support default arguments template void memFree(T* ptr); - template void memFreeLocked(T* ptr, bool user_unlock); + void memFreeUser(void* ptr); - template void memLock(const T *ptr); - template void memUnlock(const T *ptr); + void memLock(const void *ptr); + void memUnlock(const void *ptr); template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); - static const unsigned MAX_BUFFERS = 1000; - size_t getMaxBytes(); + unsigned getMaxBuffers(); void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index 6a947c634c..ff62661601 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -50,11 +50,7 @@ class MemoryManager : public common::MemoryManager cuda::setDevice(n); 
this->garbageCollect(); } catch(AfError err) { - if(err.getError() == AF_ERR_DRIVER) { // Can happen from cudaErrorDevicesUnavailable - continue; - } else { - throw err; - } + continue; // Do not throw any errors while shutting down } } } @@ -90,7 +86,7 @@ size_t MemoryManager::getMaxMemorySize(int id) } MemoryManager::MemoryManager() : - common::MemoryManager(getDeviceCount(), MAX_BUFFERS, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) + common::MemoryManager(getDeviceCount(), common::MAX_BUFFERS, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) { this->setMaxMemorySize(); } @@ -127,7 +123,7 @@ size_t MemoryManagerPinned::getMaxMemorySize(int id) } MemoryManagerPinned::MemoryManagerPinned() : - common::MemoryManager(1, MAX_BUFFERS, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) + common::MemoryManager(1, common::MAX_BUFFERS, AF_MEM_DEBUG || AF_CUDA_MEM_DEBUG) { this->setMaxMemorySize(); } @@ -168,6 +164,11 @@ size_t getMaxBytes() return getMemoryManager().getMaxBytes(); } +unsigned getMaxBuffers() +{ + return getMemoryManager().getMaxBuffers(); +} + void garbageCollect() { getMemoryManager().garbageCollect(); @@ -181,34 +182,34 @@ void printMemInfo(const char *msg, const int device) template T* memAlloc(const size_t &elements) { - return (T *)getMemoryManager().alloc(elements * sizeof(T)); + return (T *)getMemoryManager().alloc(elements * sizeof(T), false); } +void* memAllocUser(const size_t &bytes) +{ + return getMemoryManager().alloc(bytes, true); +} template void memFree(T *ptr) { return getMemoryManager().unlock((void *)ptr, false); } -template -void memFreeLocked(T *ptr, bool user_unlock) +void memFreeUser(void *ptr) { - getMemoryManager().unlock((void *)ptr, user_unlock); + getMemoryManager().unlock((void *)ptr, true); } -template -void memLock(const T *ptr) +void memLock(const void *ptr) { getMemoryManager().userLock((void *)ptr); } -template -void memUnlock(const T *ptr) +void memUnlock(const void *ptr) { getMemoryManager().userUnlock((void *)ptr); } - void deviceMemoryInfo(size_t 
*alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers) { @@ -219,7 +220,7 @@ void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, template T* pinnedAlloc(const size_t &elements) { - return (T *)getMemoryManagerPinned().alloc(elements * sizeof(T)); + return (T *)getMemoryManagerPinned().alloc(elements * sizeof(T), false); } template @@ -231,9 +232,6 @@ void pinnedFree(T* ptr) #define INSTANTIATE(T) \ template T* memAlloc(const size_t &elements); \ template void memFree(T* ptr); \ - template void memFreeLocked(T* ptr, bool user_unlock); \ - template void memLock(const T* ptr); \ - template void memUnlock(const T* ptr); \ template T* pinnedAlloc(const size_t &elements); \ template void pinnedFree(T* ptr); \ diff --git a/src/backend/cuda/memory.hpp b/src/backend/cuda/memory.hpp index 590ba3b880..9bf69df9d4 100644 --- a/src/backend/cuda/memory.hpp +++ b/src/backend/cuda/memory.hpp @@ -13,21 +13,22 @@ namespace cuda { template T* memAlloc(const size_t &elements); + void *memAllocUser(const size_t &bytes); // Need these as 2 separate function and not a default argument // This is because it is used as the deleter in shared pointer // which cannot support default arguments template void memFree(T* ptr); - template void memFreeLocked(T* ptr, bool user_unlock); - template void memLock(const T *ptr); - template void memUnlock(const T *ptr); + void memFreeUser(void* ptr); + + void memLock(const void *ptr); + void memUnlock(const void *ptr); template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); - static const unsigned MAX_BUFFERS = 1000; - size_t getMaxBytes(); + unsigned getMaxBuffers(); void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); diff --git a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index 01c93bb318..756d18749e 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -88,7 +88,7 @@ size_t 
MemoryManager::getMaxMemorySize(int id) } MemoryManager::MemoryManager() : - common::MemoryManager(getDeviceCount(), MAX_BUFFERS, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG) + common::MemoryManager(getDeviceCount(), common::MAX_BUFFERS, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG) { this->setMaxMemorySize(); } @@ -128,7 +128,7 @@ size_t MemoryManagerPinned::getMaxMemorySize(int id) } MemoryManagerPinned::MemoryManagerPinned() : - common::MemoryManager(getDeviceCount(), MAX_BUFFERS, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG), + common::MemoryManager(getDeviceCount(), common::MAX_BUFFERS, AF_MEM_DEBUG || AF_OPENCL_MEM_DEBUG), pinned_maps(getDeviceCount()) { this->setMaxMemorySize(); @@ -184,6 +184,11 @@ size_t getMaxBytes() return getMemoryManager().getMaxBytes(); } +unsigned getMaxBuffers() +{ + return getMemoryManager().getMaxBuffers(); +} + void garbageCollect() { getMemoryManager().garbageCollect(); @@ -197,44 +202,44 @@ void printMemInfo(const char *msg, const int device) template T* memAlloc(const size_t &elements) { - return (T *)getMemoryManager().alloc(elements * sizeof(T)); + return (T *)getMemoryManager().alloc(elements * sizeof(T), false); } -cl::Buffer *bufferAlloc(const size_t &bytes) +void* memAllocUser(const size_t &bytes) { - return (cl::Buffer *)getMemoryManager().alloc(bytes); + return getMemoryManager().alloc(bytes, true); } - template void memFree(T *ptr) { return getMemoryManager().unlock((void *)ptr, false); } -void bufferFree(cl::Buffer *buf) +void memFreeUser(void *ptr) { - return getMemoryManager().unlock((void *)buf, false); + getMemoryManager().unlock((void *)ptr, true); } -template -void memFreeLocked(T *ptr, bool user_unlock) +cl::Buffer *bufferAlloc(const size_t &bytes) { - getMemoryManager().unlock((void *)ptr, user_unlock); + return (cl::Buffer *)getMemoryManager().alloc(bytes, false); } -template -void memLock(const T *ptr) +void bufferFree(cl::Buffer *buf) +{ + return getMemoryManager().unlock((void *)buf, false); +} + +void memLock(const void *ptr) { 
getMemoryManager().userLock((void *)ptr); } -template -void memUnlock(const T *ptr) +void memUnlock(const void *ptr) { getMemoryManager().userUnlock((void *)ptr); } - void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers) { @@ -245,7 +250,7 @@ void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, template T* pinnedAlloc(const size_t &elements) { - return (T *)getMemoryManagerPinned().alloc(elements * sizeof(T)); + return (T *)getMemoryManagerPinned().alloc(elements * sizeof(T), false); } template @@ -257,9 +262,6 @@ void pinnedFree(T* ptr) #define INSTANTIATE(T) \ template T* memAlloc(const size_t &elements); \ template void memFree(T* ptr); \ - template void memFreeLocked(T* ptr, bool user_unlock); \ - template void memLock(const T* ptr); \ - template void memUnlock(const T* ptr); \ template T* pinnedAlloc(const size_t &elements); \ template void pinnedFree(T* ptr); \ diff --git a/src/backend/opencl/memory.hpp b/src/backend/opencl/memory.hpp index ea40b4b96f..f4d06a3324 100644 --- a/src/backend/opencl/memory.hpp +++ b/src/backend/opencl/memory.hpp @@ -17,22 +17,23 @@ namespace opencl cl::Buffer *bufferAlloc(const size_t &bytes); void bufferFree(cl::Buffer *buf); - template T *memAlloc(const size_t &elements); + template T* memAlloc(const size_t &elements); + void *memAllocUser(const size_t &bytes); // Need these as 2 separate function and not a default argument // This is because it is used as the deleter in shared pointer // which cannot support default arguments template void memFree(T* ptr); - template void memFreeLocked(T* ptr, bool user_unlock); - template void memLock(const T *ptr); - template void memUnlock(const T *ptr); + void memFreeUser(void* ptr); + + void memLock(const void *ptr); + void memUnlock(const void *ptr); template T* pinnedAlloc(const size_t &elements); template void pinnedFree(T* ptr); - static const unsigned MAX_BUFFERS = 1000; - size_t getMaxBytes(); + unsigned 
getMaxBuffers(); void deviceMemoryInfo(size_t *alloc_bytes, size_t *alloc_buffers, size_t *lock_bytes, size_t *lock_buffers); From 82e655825f74b9faf371550a20c839fee1701564 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 2 Feb 2016 19:12:29 -0500 Subject: [PATCH 231/288] JIT evaluation can now be tweaked by environment variables --- src/backend/cpu/Array.cpp | 5 +++-- src/backend/cpu/platform.cpp | 16 ++++++++++++++++ src/backend/cpu/platform.hpp | 2 ++ src/backend/cuda/Array.cpp | 5 +++-- src/backend/cuda/platform.cpp | 17 +++++++++++++++++ src/backend/cuda/platform.hpp | 2 ++ src/backend/opencl/Array.cpp | 16 +++------------- src/backend/opencl/platform.cpp | 22 ++++++++++++++++++++++ src/backend/opencl/platform.hpp | 2 ++ 9 files changed, 70 insertions(+), 17 deletions(-) diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 891604cd27..6d51f63ba0 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -19,6 +19,7 @@ #include #include #include +#include namespace cpu { @@ -159,8 +160,8 @@ createNodeArray(const dim4 &dims, Node_ptr node) n->getInfo(length, buf_count, bytes); n->reset(); - if (length > MAX_TNJ_LEN || - buf_count >= MAX_BUFFERS || + if (length > getMaxJitSize() || + buf_count >= getMaxBuffers() || bytes >= getMaxBytes()) { out.eval(); } diff --git a/src/backend/cpu/platform.cpp b/src/backend/cpu/platform.cpp index 65a5ab1faf..7e6bc81e43 100644 --- a/src/backend/cpu/platform.cpp +++ b/src/backend/cpu/platform.cpp @@ -180,6 +180,22 @@ CPUInfo::CPUInfo() namespace cpu { +unsigned getMaxJitSize() +{ + const int MAX_JIT_LEN = 20; + + static int length = 0; + if (length == 0) { + std::string env_var = getEnvVar("AF_CPU_MAX_JIT_LEN"); + if (!env_var.empty()) { + length = std::stoi(env_var); + } else { + length = MAX_JIT_LEN; + } + } + return length; +} + int getBackend() { return AF_BACKEND_CPU; diff --git a/src/backend/cpu/platform.hpp b/src/backend/cpu/platform.hpp index 9118ade8bd..82ed42c8f9 100644 --- 
a/src/backend/cpu/platform.hpp +++ b/src/backend/cpu/platform.hpp @@ -35,4 +35,6 @@ namespace cpu { void sync(int device); queue& getQueue(int idx = 0); + + unsigned getMaxJitSize(); } diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp index 6e95dd1102..48bee655d1 100644 --- a/src/backend/cuda/Array.cpp +++ b/src/backend/cuda/Array.cpp @@ -16,6 +16,7 @@ #include #include #include +#include using af::dim4; @@ -148,8 +149,8 @@ namespace cuda n->getInfo(length, buf_count, bytes); n->resetFlags(); - if (length > MAX_JIT_LEN || - buf_count >= MAX_BUFFERS || + if (length > getMaxJitSize() || + buf_count >= getMaxBuffers() || bytes >= getMaxBytes()) { out.eval(); } diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp index 5e53fc0034..67f3f08428 100644 --- a/src/backend/cuda/platform.cpp +++ b/src/backend/cuda/platform.cpp @@ -261,6 +261,23 @@ string getCUDARuntimeVersion() } +unsigned getMaxJitSize() +{ + const int MAX_JIT_LEN = 20; + + static int length = 0; + if (length == 0) { + std::string env_var = getEnvVar("AF_CUDA_MAX_JIT_LEN"); + if (!env_var.empty()) { + length = std::stoi(env_var); + } else { + length = MAX_JIT_LEN; + } + } + + return length; +} + int getDeviceCount() { return DeviceManager::getInstance().nDevices; diff --git a/src/backend/cuda/platform.hpp b/src/backend/cuda/platform.hpp index 9302f4160e..6b4186b2c2 100644 --- a/src/backend/cuda/platform.hpp +++ b/src/backend/cuda/platform.hpp @@ -38,6 +38,8 @@ bool isDoubleSupported(int device); void devprop(char* d_name, char* d_platform, char *d_toolkit, char* d_compute); +unsigned getMaxJitSize(); + int getDeviceCount(); int getActiveDeviceId(); diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp index c470a351f3..178be5be32 100644 --- a/src/backend/opencl/Array.cpp +++ b/src/backend/opencl/Array.cpp @@ -18,14 +18,12 @@ #include #include #include +#include using af::dim4; namespace opencl { - - const int MAX_JIT_LEN = 20; - const int 
MAX_JIT_LEN_AMD = 16; //FIXME: Change this when bug is fixed using JIT::BufferNode; using JIT::Node; using JIT::Node_ptr; @@ -156,14 +154,6 @@ namespace opencl using af::dim4; - inline bool is_max_jit_len(const unsigned &len) - { - if (getActivePlatform() == AFCL_PLATFORM_AMD) { - return len >= MAX_JIT_LEN_AMD; - } - return len >= MAX_JIT_LEN; - } - template Array createNodeArray(const dim4 &dims, Node_ptr node) { @@ -177,8 +167,8 @@ namespace opencl n->getInfo(length, buf_count, bytes); n->resetFlags(); - if (is_max_jit_len(length) || - buf_count >= MAX_BUFFERS || + if (length > getMaxJitSize() || + buf_count >= getMaxBuffers() || bytes >= getMaxBytes()) { out.eval(); } diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 9dbb3ca38c..6855e79f66 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -776,6 +776,28 @@ bool synchronize_calls() { return sync; } + +unsigned getMaxJitSize() +{ + const int MAX_JIT_LEN = 20; + const int MAX_JIT_LEN_AMD = 16; //FIXME: Change this when bug is fixed + + static int length = 0; + if (length == 0) { + std::string env_var = getEnvVar("AF_OPENCL_MAX_JIT_LEN"); + if (!env_var.empty()) { + length = std::stoi(env_var); + } else { + length = MAX_JIT_LEN; + } + } + + if (getActivePlatform() == AFCL_PLATFORM_AMD) { + return std::min(length, MAX_JIT_LEN_AMD); + } + return length; +} + } using namespace opencl; diff --git a/src/backend/opencl/platform.hpp b/src/backend/opencl/platform.hpp index 9b5377dc3c..4c745e0c91 100644 --- a/src/backend/opencl/platform.hpp +++ b/src/backend/opencl/platform.hpp @@ -98,6 +98,8 @@ int getDeviceCount(); int getActiveDeviceId(); +unsigned getMaxJitSize(); + const cl::Context& getContext(); cl::CommandQueue& getQueue(); From f674cdacf2bc1bc27f3f4649da2f80f35ec632d4 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 2 Feb 2016 18:42:55 -0500 Subject: [PATCH 232/288] BUGFIX: Fixing error in where for OpenCL backend - Was erroring out 
when no elemnts were found - Adding necessary test --- src/backend/opencl/kernel/where.hpp | 4 +++- test/where.cpp | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/backend/opencl/kernel/where.hpp b/src/backend/opencl/kernel/where.hpp index 2cbf8c1019..2b1308fcec 100644 --- a/src/backend/opencl/kernel/where.hpp +++ b/src/backend/opencl/kernel/where.hpp @@ -159,7 +159,9 @@ namespace kernel out.info.strides[k] = total; } - get_out_idx(out.data, otmp, rtmp, in, threads_x, groups_x, groups_y); + if (total > 0) { + get_out_idx(out.data, otmp, rtmp, in, threads_x, groups_x, groups_y); + } bufferFree(rtmp.data); bufferFree(otmp.data); diff --git a/test/where.cpp b/test/where.cpp index 37208f2ee2..08ed878aea 100644 --- a/test/where.cpp +++ b/test/where.cpp @@ -121,3 +121,10 @@ TYPED_TEST(Where, CPP) << std::endl; } } + +TEST(Where, ISSUE_1259) +{ + af::array a = af::randu(10, 10, 10); + af::array indices = af::where(a > 2); + ASSERT_EQ(indices.elements(), 0); +} From 01d819af995f6de6d18036b18aaa75f7e734384a Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 4 Feb 2016 10:28:41 +0530 Subject: [PATCH 233/288] Prevent copy assignment & construction of af::Window object Fixes #1244 --- include/af/graphics.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/af/graphics.h b/include/af/graphics.h index defdbc165d..7485686479 100644 --- a/include/af/graphics.h +++ b/include/af/graphics.h @@ -30,6 +30,8 @@ namespace af \brief Window object to render af::arrays + Windows are not CopyConstructible or CopyAssignable. 
+ \ingroup graphics_func */ class AFAPI Window { @@ -43,6 +45,9 @@ class AFAPI Window { void initWindow(const int width, const int height, const char* const title); + Window(const Window&); // Prevent copy-construction + Window& operator=(const Window&); // Prevent assignment + public: /** Creates a window object with default width @@ -84,6 +89,7 @@ class AFAPI Window { \ingroup gfx_func_window */ Window(const af_window wnd); + /** Destroys the window handle From a1b7f8c55032d12350f9cef07a25d1b94a9e3042 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 2 Feb 2016 19:15:20 -0500 Subject: [PATCH 234/288] Changes to internal memory manager - Manager now contains list of locked and free buffers separately - Should improve performance when allocationg new buffers - Added proper documentation --- .../configuring_arrayfire_environment.md | 25 ++ src/backend/MemoryManager.cpp | 227 ++++++++++-------- src/backend/MemoryManager.hpp | 14 +- 3 files changed, 160 insertions(+), 106 deletions(-) diff --git a/docs/pages/configuring_arrayfire_environment.md b/docs/pages/configuring_arrayfire_environment.md index a9ec486d10..d554046f1e 100644 --- a/docs/pages/configuring_arrayfire_environment.md +++ b/docs/pages/configuring_arrayfire_environment.md @@ -142,3 +142,28 @@ When the environment variable is not set, it is treated to be non zero. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ AF_MEM_DEBUG=1 ./myprogram ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +AF_MAX_BUFFERS {#af_max_buffers} +------------------------------------------------------------------------- + +When AF_MAX_BUFFERS is set, this environment variable specifies the maximum number of buffers allocated before garbage collection kicks in. + +Please note that the total number of buffers that can exist simultaneously can be higher than this number. 
This variable tells the garbage collector that it should free any available buffers immediately if the threshold is reached. + +When not set, the default value is 1000. + +AF_OPENCL_MAX_JIT_LEN {#af_opencl_max_jit_len} +------------------------------------------------------------------------------- + +When set, this environment variable specifies the maximum length of the OpenCL JIT tree after which evaluation is forced. The default value for this is 16 for AMD devices and 20 otherwise. + +AF_CUDA_MAX_JIT_LEN {#af_cuda_max_jit_len} +------------------------------------------------------------------------------- + +When set, this environment variable specifies the maximum length of the CUDA JIT tree after which evaluation is forced. The default value for this is 20. + +AF_CPU_MAX_JIT_LEN {#af_cpu_max_jit_len} +------------------------------------------------------------------------------- + +When set, this environment variable specifies the maximum length of the CPU JIT tree after which evaluation is forced. The default value for this is 20. diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp index 5773c19de7..b66dfc33e7 100644 --- a/src/backend/MemoryManager.cpp +++ b/src/backend/MemoryManager.cpp @@ -26,19 +26,32 @@ MemoryManager::MemoryManager(int num_devices, unsigned MAX_BUFFERS, bool debug): debug_mode(debug) { lock_guard_t lock(this->memory_mutex); - std::string env_var = getEnvVar("AF_MEM_DEBUG"); + + for (int n = 0; n < num_devices; n++) { + // Calling getMaxMemorySize() here calls the virtual function that returns 0 + // Call it from outside the constructor.
+ memory[n].max_bytes = ONE_GB; + memory[n].total_bytes = 0; + memory[n].total_buffers = 0; + memory[n].lock_bytes = 0; + memory[n].lock_buffers = 0; + } + + // Check for environment variables + + std::string env_var; + + // Debug mode + env_var = getEnvVar("AF_MEM_DEBUG"); if (!env_var.empty()) { this->debug_mode = env_var[0] != '0'; } if (this->debug_mode) mem_step_size = 1; - for (int n = 0; n < num_devices; n++) { - // Calling getMaxMemorySize() here calls the virtual function that returns 0 - // Call it from outside the constructor. - memory[n].max_bytes = ONE_GB; - memory[n].total_bytes = 0; - memory[n].lock_bytes = 0; - memory[n].lock_buffers = 0; + // Max Buffer count + env_var = getEnvVar("AF_MAX_BUFFERS"); + if (!env_var.empty()) { + this->max_buffers = std::max(1, std::stoi(env_var)); } } @@ -61,28 +74,17 @@ void MemoryManager::garbageCollect() lock_guard_t lock(this->memory_mutex); memory_info& current = this->getCurrentMemoryInfo(); - for(buffer_iter iter = current.map.begin(); - iter != current.map.end(); ++iter) { - - if (!(iter->second).manager_lock) { - - if (!(iter->second).user_lock) { - if ((iter->second).bytes > 0) { - this->nativeFree(iter->first); - } - current.total_bytes -= iter->second.bytes; - } - } - } - - buffer_iter memory_curr = current.map.begin(); - buffer_iter memory_end = current.map.end(); - - while(memory_curr != memory_end) { - if (memory_curr->second.manager_lock || memory_curr->second.user_lock) { - ++memory_curr; - } else { - current.map.erase(memory_curr++); + // Return if all buffers are locked + if (current.total_buffers == current.lock_buffers) return; + + for (auto &kv : current.free_map) { + size_t num_ptrs = kv.second.size(); + //Free memory by popping the last element + for (int n = num_ptrs-1; n >= 0; n--) { + this->nativeFree(kv.second[n]); + current.total_bytes -= kv.first; + current.total_buffers--; + kv.second.pop_back(); } } } @@ -92,25 +94,47 @@ void MemoryManager::unlock(void *ptr, bool user_unlock) 
lock_guard_t lock(this->memory_mutex); memory_info& current = this->getCurrentMemoryInfo(); - buffer_iter iter = current.map.find((void *)ptr); + locked_iter iter = current.locked_map.find((void *)ptr); + + // Pointer not found in locked map + if (iter == current.locked_map.end()) { + // Probably came from user, just free it + this->nativeFree(ptr); + return; + } - if (iter != current.map.end()) { + if (user_unlock) { + (iter->second).user_lock = false; + } else { + (iter->second).manager_lock = false; + } - iter->second.manager_lock = false; - if ((iter->second).user_lock && !user_unlock) return; + // Return early if either one is locked + if ((iter->second).user_lock || (iter->second).manager_lock) return; - iter->second.user_lock = false; - current.lock_bytes -= iter->second.bytes; - current.lock_buffers--; + size_t bytes = iter->second.bytes; + current.lock_bytes -= iter->second.bytes; + current.lock_buffers--; - if (this->debug_mode) { - if ((iter->second).bytes > 0) { - this->nativeFree(iter->first); - } - } + current.locked_map.erase(iter); + if (this->debug_mode) { + // Just free memory in debug mode + if ((iter->second).bytes > 0) { + this->nativeFree(iter->first); + } } else { - this->nativeFree(ptr); // Free it because we are not sure what the size is + // In regular mode, move buffer to free map + free_iter fiter = current.free_map.find(bytes); + if (fiter != current.free_map.end()) { + // If found, push back + fiter->second.push_back(ptr); + } else { + // If not found, create new vector for this size + std::vector ptrs; + ptrs.push_back(ptr); + current.free_map[bytes] = ptrs; + } } } @@ -129,45 +153,41 @@ void *MemoryManager::alloc(const size_t bytes, bool user_lock) // FIXME: Add better checks for garbage collection // Perhaps look at total memory available as a metric - if (current.map.size() > this->max_buffers || - current.lock_bytes >= current.max_bytes) { - + if (current.lock_bytes >= current.max_bytes || + current.total_buffers >= 
this->max_buffers) { this->garbageCollect(); } - for(buffer_iter iter = current.map.begin(); - iter != current.map.end(); ++iter) { - - buffer_info info = iter->second; - - if (!info.manager_lock && - !info.user_lock && - info.bytes == alloc_bytes) { + free_iter iter = current.free_map.find(alloc_bytes); - iter->second.manager_lock = true; - current.lock_bytes += alloc_bytes; - current.lock_buffers++; - return iter->first; - } + if (iter != current.free_map.end() && !iter->second.empty()) { + ptr = iter->second.back(); + iter->second.pop_back(); } + } - // Perform garbage collection if memory can not be allocated - try { - ptr = this->nativeAlloc(alloc_bytes); - } catch (AfError &ex) { - // If out of memory, run garbage collect and try again - if (ex.getError() != AF_ERR_NO_MEM) throw; - this->garbageCollect(); - ptr = this->nativeAlloc(alloc_bytes); + // Only comes here if buffer size not found or in debug mode + if (ptr == NULL) { + // Perform garbage collection if memory can not be allocated + try { + ptr = this->nativeAlloc(alloc_bytes); + } catch (AfError &ex) { + // If out of memory, run garbage collect and try again + if (ex.getError() != AF_ERR_NO_MEM) throw; + this->garbageCollect(); + ptr = this->nativeAlloc(alloc_bytes); + } + // Increment these two only when it succeeds to come here. 
+ current.total_bytes += alloc_bytes; + current.total_buffers += 1; } - buffer_info info = {true, false, alloc_bytes}; - current.map[ptr] = info; + locked_info info = {true, user_lock, alloc_bytes}; + current.locked_map[ptr] = info; current.lock_bytes += alloc_bytes; current.lock_buffers++; - current.total_bytes += alloc_bytes; } return ptr; } @@ -178,34 +198,22 @@ void MemoryManager::userLock(const void *ptr) lock_guard_t lock(this->memory_mutex); - buffer_iter iter = current.map.find(const_cast(ptr)); + locked_iter iter = current.locked_map.find(const_cast(ptr)); - if (iter != current.map.end()) { + if (iter != current.locked_map.end()) { iter->second.user_lock = true; } else { - buffer_info info = { true, - true, - 100 }; //This number is not relevant + locked_info info = {false, + true, + 100}; //This number is not relevant - current.map[(void *)ptr] = info; + current.locked_map[(void *)ptr] = info; } } void MemoryManager::userUnlock(const void *ptr) { - memory_info& current = this->getCurrentMemoryInfo(); - - lock_guard_t lock(this->memory_mutex); - - buffer_iter iter = current.map.find((void *)ptr); - if (iter != current.map.end()) { - iter->second.user_lock = false; - if (this->debug_mode) { - if ((iter->second).bytes > 0) { - this->nativeFree(iter->first); - } - } - } + this->unlock(const_cast(ptr), true); } size_t MemoryManager::getMemStepSize() @@ -237,32 +245,47 @@ void MemoryManager::printInfo(const char *msg, const int device) static const std::string line(head.size(), '-'); std::cout << line << std::endl << head << std::endl << line << std::endl; - for(buffer_iter iter = current.map.begin(); - iter != current.map.end(); ++iter) { - - std::string status_mngr("Unknown"); + for(auto& kv : current.locked_map) { + std::string status_mngr("Yes"); std::string status_user("Unknown"); - - if(iter->second.manager_lock) status_mngr = "Yes"; - else status_mngr = " No"; - - if(iter->second.user_lock) status_user = "Yes"; - else status_user = " No"; + 
if(kv.second.user_lock) status_user = "Yes"; + else status_user = " No"; std::string unit = "KB"; - double size = (double)(iter->second.bytes) / 1024; + double size = (double)(kv.second.bytes) / 1024; if(size >= 1024) { size = size / 1024; unit = "MB"; } - std::cout << "| " << std::right << std::setw(14) << iter->first << " " + std::cout << " | " << std::right << std::setw(14) << kv.first << " " << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit << " | " << std::setw(9) << status_mngr << " | " << std::setw(9) << status_user << " |" << std::endl; } + for(auto &kv : current.free_map) { + + std::string status_mngr("No"); + std::string status_user("No"); + + std::string unit = "KB"; + double size = (double)(kv.first) / 1024; + if(size >= 1024) { + size = size / 1024; + unit = "MB"; + } + + for (auto &ptr : kv.second) { + std::cout << " | " << std::right << std::setw(14) << ptr << " " + << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit + << " | " << std::setw(9) << status_mngr + << " | " << std::setw(9) << status_user + << " |" << std::endl; + } + } + std::cout << line << std::endl; } @@ -272,7 +295,7 @@ void MemoryManager::bufferInfo(size_t *alloc_bytes, size_t *alloc_buffers, lock_guard_t lock(this->memory_mutex); memory_info current = this->getCurrentMemoryInfo(); if (alloc_bytes ) *alloc_bytes = current.total_bytes; - if (alloc_buffers ) *alloc_buffers = current.map.size(); + if (alloc_buffers ) *alloc_buffers = current.total_buffers; if (lock_bytes ) *lock_bytes = current.lock_bytes; if (lock_buffers ) *lock_buffers = current.lock_buffers; } diff --git a/src/backend/MemoryManager.hpp b/src/backend/MemoryManager.hpp index a010f30064..015fa6db3d 100644 --- a/src/backend/MemoryManager.hpp +++ b/src/backend/MemoryManager.hpp @@ -29,17 +29,23 @@ class MemoryManager bool manager_lock; bool user_lock; size_t bytes; - } buffer_info; + } locked_info; - typedef std::map buffer_t; - typedef buffer_t::iterator buffer_iter; + typedef 
std::map locked_t; + typedef locked_t::iterator locked_iter; + + typedef std::map >free_t; + typedef free_t::iterator free_iter; typedef struct { - buffer_t map; + locked_t locked_map; + free_t free_map; + size_t lock_bytes; size_t lock_buffers; size_t total_bytes; + size_t total_buffers; size_t max_bytes; } memory_info; From a9385003330a999b125eaf2f8d193bf78954b424 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 4 Feb 2016 01:53:08 -0500 Subject: [PATCH 235/288] Fixes to random.hpp to work in multi-threaded environment --- src/backend/cpu/kernel/random.hpp | 119 ++++++++++++++++++++++-------- src/backend/cpu/random.cpp | 40 ++-------- test/random.cpp | 4 + 3 files changed, 97 insertions(+), 66 deletions(-) diff --git a/src/backend/cpu/kernel/random.hpp b/src/backend/cpu/kernel/random.hpp index f9cb3906f7..9c59a64db9 100644 --- a/src/backend/cpu/kernel/random.hpp +++ b/src/backend/cpu/kernel/random.hpp @@ -24,6 +24,12 @@ namespace kernel using namespace std; +#if defined(_WIN32) + #define __THREAD_LOCAL static __declspec(thread) +#else + #define __THREAD_LOCAL static __thread +#endif + template using is_arithmetic_t = typename enable_if< is_arithmetic::value, function>::type; template @@ -68,74 +74,125 @@ nrand(GenType &generator) return [func] () { return T(func(), func());}; } -static mt19937 generator; -static unsigned long long gen_seed = 0; -static bool is_first = true; -#define GLOBAL 1 +mt19937& getGenerator() +{ + // FIXME: This abomination of a work around is brought to you + // by incomplete standards from Xcode and Visual Studio + // Should ideally be using thread_local on object instead of pointer + __THREAD_LOCAL mt19937 *generator = NULL; + if (generator == NULL) generator = new mt19937(); + return *generator; +} + +unsigned long long& getSeed() +{ + __THREAD_LOCAL unsigned long long gen_seed = 0; + return gen_seed; +} + +void getSeedPtr(unsigned long long *seed) +{ + *seed = getSeed(); +} + +bool& isFirst() +{ + __THREAD_LOCAL bool 
is_first = true; + return is_first; +} + +void setSeed(const uintl seed) +{ + getGenerator().seed(seed); + getSeed() = seed; + isFirst() = false; +} + +//FIXME: See if we can use functors instead of function pointer directly +template +struct RandomDistribution +{ + std::function func; + RandomDistribution(std::function dist_func) : func(dist_func) + { + } +}; template void randn(Array out) { - static unsigned long long my_seed = 0; - if (is_first) { - setSeed(gen_seed); - my_seed = gen_seed; + __THREAD_LOCAL unsigned long long my_seed = 0; + if (isFirst()) { + my_seed = getSeed(); + setSeed(my_seed); } - static auto gen = nrand(generator); + // FIXME: This abomination of a work around is brought to you + // by incomplete standards from Xcode and Visual Studio + // Should ideally be using thread_local on object instead of pointer + __THREAD_LOCAL RandomDistribution *distPtr = NULL; - if (my_seed != gen_seed) { - gen = nrand(generator); - my_seed = gen_seed; + if (!distPtr || my_seed != getSeed()) { + if (distPtr) delete distPtr; + distPtr = new RandomDistribution(nrand(getGenerator())); + my_seed = getSeed(); } T *outPtr = out.get(); for (int i = 0; i < (int)out.elements(); i++) { - outPtr[i] = gen(); + outPtr[i] = distPtr->func(); } } template void randu(Array out) { - static unsigned long long my_seed = 0; - if (is_first) { - setSeed(gen_seed); - my_seed = gen_seed; + __THREAD_LOCAL unsigned long long my_seed = 0; + if (isFirst()) { + my_seed = getSeed(); + setSeed(my_seed); } - static auto gen = urand(generator); + // FIXME: This abomination of a work around is brought to you + // by incomplete standards from Xcode and Visual Studio + // Should ideally be using thread_local on object instead of pointer + __THREAD_LOCAL RandomDistribution *distPtr = NULL; - if (my_seed != gen_seed) { - gen = urand(generator); - my_seed = gen_seed; + if (!distPtr || my_seed != getSeed()) { + if (distPtr) delete distPtr; + distPtr = new RandomDistribution(urand(getGenerator())); + 
my_seed = getSeed(); } T *outPtr = out.get(); for (int i = 0; i < (int)out.elements(); i++) { - outPtr[i] = gen(); + outPtr[i] = distPtr->func(); } } template<> void randu(Array out) { - static unsigned long long my_seed = 0; - if (is_first) { - setSeed(gen_seed); - my_seed = gen_seed; + __THREAD_LOCAL unsigned long long my_seed = 0; + if (isFirst()) { + my_seed = getSeed(); + setSeed(my_seed); } - static auto gen = urand(generator); + // FIXME: This abomination of a work around is brought to you + // by incomplete standards from Xcode and Visual Studio + // Should ideally be using thread_local on object instead of pointer + __THREAD_LOCAL RandomDistribution *distPtr = NULL; - if (my_seed != gen_seed) { - gen = urand(generator); - my_seed = gen_seed; + if (!distPtr || my_seed != getSeed()) { + if (distPtr) delete distPtr; + distPtr = new RandomDistribution(nrand(getGenerator())); + my_seed = getSeed(); } char *outPtr = out.get(); for (int i = 0; i < (int)out.elements(); i++) { - outPtr[i] = gen() > 0.5; + outPtr[i] = distPtr->func() > 0.5; } } diff --git a/src/backend/cpu/random.cpp b/src/backend/cpu/random.cpp index 89d86c3848..06cbca34d7 100644 --- a/src/backend/cpu/random.cpp +++ b/src/backend/cpu/random.cpp @@ -39,6 +39,7 @@ INSTANTIATE_UNIFORM(uint) INSTANTIATE_UNIFORM(intl) INSTANTIATE_UNIFORM(uintl) INSTANTIATE_UNIFORM(uchar) +INSTANTIATE_UNIFORM(char) INSTANTIATE_UNIFORM(short) INSTANTIATE_UNIFORM(ushort) @@ -58,48 +59,17 @@ INSTANTIATE_NORMAL(double) INSTANTIATE_NORMAL(cfloat) INSTANTIATE_NORMAL(cdouble) -template<> -Array randu(const af::dim4 &dims) -{ - static unsigned long long my_seed = 0; - if (kernel::is_first) { - setSeed(kernel::gen_seed); - my_seed = kernel::gen_seed; - } - - static auto gen = kernel::urand(kernel::generator); - - if (my_seed != kernel::gen_seed) { - gen = kernel::urand(kernel::generator); - my_seed = kernel::gen_seed; - } - - Array outArray = createEmptyArray(dims); - auto func = [=](Array outArray) { - char *outPtr = 
outArray.get(); - for (int i = 0; i < (int)outArray.elements(); i++) { - outPtr[i] = gen() > 0.5; - } - }; - getQueue().enqueue(func, outArray); - - return outArray; -} - void setSeed(const uintl seed) { - auto f = [=](const uintl seed){ - kernel::generator.seed(seed); - kernel::is_first = false; - kernel::gen_seed = seed; - }; - getQueue().enqueue(f, seed); + getQueue().enqueue(kernel::setSeed, seed); } uintl getSeed() { + uintl seed = 0; + getQueue().enqueue(kernel::getSeedPtr, &seed); getQueue().sync(); - return kernel::gen_seed; + return seed; } } diff --git a/test/random.cpp b/test/random.cpp index 29f157a776..74f7e6541b 100644 --- a/test/random.cpp +++ b/test/random.cpp @@ -59,6 +59,7 @@ void randuTest(af::dim4 & dims) af_array outArray = 0; ASSERT_EQ(AF_SUCCESS, af_randu(&outArray, dims.ndims(), dims.get(), (af_dtype) af::dtype_traits::af_type)); + ASSERT_EQ(af_sync(-1), AF_SUCCESS); if(outArray != 0) af_release_array(outArray); } @@ -69,6 +70,7 @@ void randnTest(af::dim4 &dims) af_array outArray = 0; ASSERT_EQ(AF_SUCCESS, af_randn(&outArray, dims.ndims(), dims.get(), (af_dtype) af::dtype_traits::af_type)); + ASSERT_EQ(af_sync(-1), AF_SUCCESS); if(outArray != 0) af_release_array(outArray); } @@ -124,6 +126,7 @@ void randuArgsTest() dim_t dims[] = {1, 2, 3, 0}; af_array outArray = 0; ASSERT_EQ(AF_ERR_SIZE, af_randu(&outArray, ndims, dims, (af_dtype) af::dtype_traits::af_type)); + ASSERT_EQ(af_sync(-1), AF_SUCCESS); if(outArray != 0) af_release_array(outArray); } @@ -143,6 +146,7 @@ TEST(Random, CPP) af::dim4 dims(1, 2, 3, 1); af::array out1 = af::randu(dims); af::array out2 = af::randn(dims); + af::sync(); } template From 95aaf729dfc362b08646870fa5f01d91bdebb600 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 4 Feb 2016 01:53:32 -0500 Subject: [PATCH 236/288] OpenCL JIT now launches more threads per work group for CPU devices --- src/backend/opencl/jit.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git 
a/src/backend/opencl/jit.cpp b/src/backend/opencl/jit.cpp index 66c7c1e9f7..d6ab240fd6 100644 --- a/src/backend/opencl/jit.cpp +++ b/src/backend/opencl/jit.cpp @@ -19,6 +19,7 @@ #include #include #include +#include namespace opencl { @@ -180,13 +181,16 @@ void evalNodes(Param &out, Node *node) uint groups_1 = 1; uint num_odims = 4; + // CPUs seem to perform better with work group size 1024 + const int work_group_size = (getActiveDeviceType() == AFCL_DEVICE_TYPE_CPU) ? 1024 : 256; + while (num_odims >= 1) { if (out.info.dims[num_odims - 1] == 1) num_odims--; else break; } if (is_linear) { - local_0 = 256; + local_0 = work_group_size; uint out_elements = out.info.dims[3] * out.info.strides[3]; uint groups = divup(out_elements, local_0); @@ -194,8 +198,8 @@ void evalNodes(Param &out, Node *node) global_0 = divup(groups, global_1) * local_0; } else { - local_0 = 64; local_1 = 4; + local_0 = work_group_size / local_1; groups_0 = divup(out.info.dims[0], local_0); groups_1 = divup(out.info.dims[1], local_1); From e0879cb37f9223d68004f65b984183eb83e8ddc7 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Fri, 5 Feb 2016 01:44:16 -0500 Subject: [PATCH 237/288] FEAT: Adding functions exposing Array internals - af_create_array_with_strides - af_get_strides - af_get_offset - af_get_raw_ptr - af_is_linear - af_is_owner --- include/af/internal.h | 62 ++++++++++++ src/api/c/internal.cpp | 181 +++++++++++++++++++++++++++++++++++ src/api/cpp/internal.cpp | 63 ++++++++++++ src/api/unified/internal.cpp | 54 +++++++++++ src/backend/cpu/Array.cpp | 18 ++++ src/backend/cpu/Array.hpp | 5 + src/backend/cuda/Array.cpp | 21 ++++ src/backend/cuda/Array.hpp | 4 + src/backend/opencl/Array.cpp | 21 ++++ src/backend/opencl/Array.hpp | 4 + 10 files changed, 433 insertions(+) create mode 100644 include/af/internal.h create mode 100644 src/api/c/internal.cpp create mode 100644 src/api/cpp/internal.cpp create mode 100644 src/api/unified/internal.cpp diff --git a/include/af/internal.h 
b/include/af/internal.h new file mode 100644 index 0000000000..fdd0158e2e --- /dev/null +++ b/include/af/internal.h @@ -0,0 +1,62 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +#ifdef __cplusplus +namespace af +{ + class array; + + AFAPI array createArray(const void *data, const dim_t offset, + const dim4 dims, const dim4 strides, + const af::dtype ty, + const af::source location); + + AFAPI dim4 getStrides(const array &in); + + AFAPI dim_t getOffset(const array &in); + + AFAPI void *getRawPtr(const array &in); + + AFAPI bool isLinear(const array &in); + + AFAPI bool isOwner(const array &in); +} +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + + AFAPI af_err af_create_array_with_strides(af_array *arr, + const void *data, + const dim_t offset, + const unsigned ndims, + const dim_t *const dims, + const dim_t *const strides, + const af_dtype ty, + const af_source location); + + AFAPI af_err af_get_strides(dim_t *s0, dim_t *s1, dim_t *s2, dim_t *s3, const af_array arr); + + AFAPI af_err af_get_offset(dim_t *offset, const af_array arr); + + AFAPI af_err af_get_raw_ptr(void **ptr, const af_array arr); + + AFAPI af_err af_is_linear(bool *result, const af_array arr); + + AFAPI af_err af_is_owner(bool *result, const af_array arr); + +#ifdef __cplusplus +} +#endif diff --git a/src/api/c/internal.cpp b/src/api/c/internal.cpp new file mode 100644 index 0000000000..d086d431f1 --- /dev/null +++ b/src/api/c/internal.cpp @@ -0,0 +1,181 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "err_common.hpp" +#include + +using namespace detail; + +af_err af_create_array_with_strides(af_array *arr, + const void *data, + const dim_t offset, + const unsigned ndims, + const dim_t *const dims_, + const dim_t *const strides_, + const af_dtype ty, + const af_source location) +{ + try { + + ARG_ASSERT(2, offset >= 0); + ARG_ASSERT(3, ndims >=1 && ndims <= 4); + ARG_ASSERT(4, dims_ != NULL); + ARG_ASSERT(5, strides_ != NULL); + ARG_ASSERT(5, strides_[0] == 1); + + dim4 dims(ndims, dims_); + dim4 strides(ndims, strides_); + + bool isdev = location == afDevice; + + af_array res; + AF_CHECK(af_init()); + + switch (ty) { + case f32: res = getHandle(Array(dims, strides, offset, (float *)data, isdev)); break; + case f64: res = getHandle(Array(dims, strides, offset, (double *)data, isdev)); break; + case c32: res = getHandle(Array(dims, strides, offset, (cfloat *)data, isdev)); break; + case c64: res = getHandle(Array(dims, strides, offset, (cdouble *)data, isdev)); break; + case u32: res = getHandle(Array(dims, strides, offset, (uint *)data, isdev)); break; + case s32: res = getHandle(Array(dims, strides, offset, (int *)data, isdev)); break; + case u64: res = getHandle(Array(dims, strides, offset, (uintl *)data, isdev)); break; + case s64: res = getHandle(Array(dims, strides, offset, (intl *)data, isdev)); break; + case u16: res = getHandle(Array(dims, strides, offset, (ushort *)data, isdev)); break; + case s16: res = getHandle(Array(dims, strides, offset, (short *)data, isdev)); break; + case b8 : res = getHandle(Array(dims, strides, offset, (char *)data, isdev)); break; + case u8 : res = getHandle(Array(dims, strides, offset, (uchar *)data, isdev)); break; + default: TYPE_ERROR(6, ty); + } + + std::swap(*arr, res); 
+ } + CATCHALL; + return AF_SUCCESS; +} + +af_err af_get_strides(dim_t *s0, dim_t *s1, dim_t *s2, dim_t *s3, const af_array in) +{ + try { + ArrayInfo info = getInfo(in); + *s0 = info.strides()[0]; + *s1 = info.strides()[1]; + *s2 = info.strides()[2]; + *s3 = info.strides()[3]; + } + CATCHALL + return AF_SUCCESS; +} + +af_err af_get_offset(dim_t *offset, const af_array arr) +{ + try { + + dim_t res = 0; + + af_dtype ty = getInfo(arr).getType(); + + switch (ty) { + case f32: res = getArray(arr).getOffset(); break; + case f64: res = getArray(arr).getOffset(); break; + case c32: res = getArray(arr).getOffset(); break; + case c64: res = getArray(arr).getOffset(); break; + case u32: res = getArray(arr).getOffset(); break; + case s32: res = getArray(arr).getOffset(); break; + case u64: res = getArray(arr).getOffset(); break; + case s64: res = getArray(arr).getOffset(); break; + case u16: res = getArray(arr).getOffset(); break; + case s16: res = getArray(arr).getOffset(); break; + case b8 : res = getArray(arr).getOffset(); break; + case u8 : res = getArray(arr).getOffset(); break; + default: TYPE_ERROR(6, ty); + } + + std::swap(*offset, res); + } + CATCHALL; + return AF_SUCCESS; + +} + +af_err af_get_raw_ptr(void **ptr, const af_array arr) +{ + try { + + void *res = NULL; + + af_dtype ty = getInfo(arr).getType(); + + switch (ty) { + case f32: res = (void *)getArray(arr).get(); break; + case f64: res = (void *)getArray(arr).get(); break; + case c32: res = (void *)getArray(arr).get(); break; + case c64: res = (void *)getArray(arr).get(); break; + case u32: res = (void *)getArray(arr).get(); break; + case s32: res = (void *)getArray(arr).get(); break; + case u64: res = (void *)getArray(arr).get(); break; + case s64: res = (void *)getArray(arr).get(); break; + case u16: res = (void *)getArray(arr).get(); break; + case s16: res = (void *)getArray(arr).get(); break; + case b8 : res = (void *)getArray(arr).get(); break; + case u8 : res = (void *)getArray(arr).get(); break; + 
default: TYPE_ERROR(6, ty); + } + + std::swap(*ptr, res); + } + CATCHALL; + return AF_SUCCESS; +} + +af_err af_is_linear(bool *result, const af_array arr) +{ + try { + *result = getInfo(arr).isLinear(); + } + CATCHALL + return AF_SUCCESS; +} + +af_err af_is_owner(bool *result, const af_array arr) +{ + try { + + bool res = false; + + af_dtype ty = getInfo(arr).getType(); + + switch (ty) { + case f32: res = (void *)getArray(arr).isOwner(); break; + case f64: res = (void *)getArray(arr).isOwner(); break; + case c32: res = (void *)getArray(arr).isOwner(); break; + case c64: res = (void *)getArray(arr).isOwner(); break; + case u32: res = (void *)getArray(arr).isOwner(); break; + case s32: res = (void *)getArray(arr).isOwner(); break; + case u64: res = (void *)getArray(arr).isOwner(); break; + case s64: res = (void *)getArray(arr).isOwner(); break; + case u16: res = (void *)getArray(arr).isOwner(); break; + case s16: res = (void *)getArray(arr).isOwner(); break; + case b8 : res = (void *)getArray(arr).isOwner(); break; + case u8 : res = (void *)getArray(arr).isOwner(); break; + default: TYPE_ERROR(6, ty); + } + + std::swap(*result, res); + } + CATCHALL; + return AF_SUCCESS; +} diff --git a/src/api/cpp/internal.cpp b/src/api/cpp/internal.cpp new file mode 100644 index 0000000000..f26f9f8bb4 --- /dev/null +++ b/src/api/cpp/internal.cpp @@ -0,0 +1,63 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include "error.hpp" + +namespace af +{ + array createArray(const void *data, const dim_t offset, + const dim4 dims, const dim4 strides, + const af::dtype ty, + const af::source location) + { + af_array res; + AF_THROW(af_create_array_with_strides(&res, data, offset, + dims.ndims(), dims.get(), strides.get(), + ty, location)); + return array(res); + } + + dim4 getStrides(const array &in) + { + dim_t s0, s1, s2, s3; + AF_THROW(af_get_strides(&s0, &s1, &s2, &s3, in.get())); + return dim4(s0, s1, s2, s3); + } + + dim_t getOffset(const array &in) + { + dim_t offset; + AF_THROW(af_get_offset(&offset, in.get())); + return offset; + } + + void *getRawPtr(const array &in) + { + void *ptr = NULL; + AF_THROW(af_get_raw_ptr(&ptr, in.get())); + return ptr; + } + + bool isLinear(const array &in) + { + bool is_linear = false; + AF_THROW(af_is_linear(&is_linear, in.get())); + return is_linear; + } + + bool isOwner(const array &in) + { + bool is_owner = false; + AF_THROW(af_is_owner(&is_owner, in.get())); + return is_owner; + } + +} diff --git a/src/api/unified/internal.cpp b/src/api/unified/internal.cpp new file mode 100644 index 0000000000..7c223e741c --- /dev/null +++ b/src/api/unified/internal.cpp @@ -0,0 +1,54 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include "symbol_manager.hpp" + + +af_err af_create_array_with_strides(af_array *arr, + const void *data, + const dim_t offset, + const unsigned ndims, + const dim_t *const dims_, + const dim_t *const strides_, + const af_dtype ty, + const af_source location) +{ + return CALL(arr, data, offset, ndims, dims_, strides_, ty, location); +} + +af_err af_get_strides(dim_t *s0, dim_t *s1, dim_t *s2, dim_t *s3, const af_array in) +{ + CHECK_ARRAYS(in); + return CALL(s0, s1, s2, s3, in); +} + +af_err af_get_offset(dim_t *offset, const af_array arr) +{ + CHECK_ARRAYS(arr); + return CALL(offset, arr); +} + +af_err af_get_raw_ptr(void **ptr, const af_array arr) +{ + CHECK_ARRAYS(arr); + return CALL(ptr, arr); +} + +af_err af_is_linear(bool *result, const af_array arr) +{ + CHECK_ARRAYS(arr); + return CALL(result, arr); +} + +af_err af_is_owner(bool *result, const af_array arr) +{ + CHECK_ARRAYS(arr); + return CALL(result, arr); +} diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 6d51f63ba0..fbb8e10b34 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -68,6 +68,21 @@ Array::Array(const Array& parent, const dim4 &dims, const dim4 &offsets, c ready(true), owner(false) { } +template +Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset_, + const T * const in_data, bool is_device) : + info(getActiveDeviceId(), dims, af::dim4(offset_), strides, (af_dtype)dtype_traits::af_type), + data(is_device ? 
(T*)in_data : memAlloc(info.elements()), memFree), + data_dims(dims), + node(), + offset(offset_), + ready(true), + owner(true) +{ + if (!is_device) { + std::copy(in_data, in_data + dims.elements(), data.get()); + } +} template void Array::eval() @@ -240,6 +255,9 @@ writeDeviceDataArray(Array &arr, const void * const data, const size_t bytes) template void Array::eval() const; \ template Array::Array(af::dim4 dims, const T * const in_data, \ bool is_device, bool copy_device); \ + template Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset, \ + const T * const in_data, \ + bool is_device); \ template TNJ::Node_ptr Array::getNode() const; \ template void writeHostDataArray (Array &arr, const T * const data, const size_t bytes); \ template void writeDeviceDataArray (Array &arr, const void * const data, const size_t bytes); \ diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp index 9cd154ec50..891d867d7d 100644 --- a/src/backend/cpu/Array.hpp +++ b/src/backend/cpu/Array.hpp @@ -106,12 +106,17 @@ namespace cpu Array() = default; Array(dim4 dims); + explicit Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device=false); Array(const Array& parnt, const dim4 &dims, const dim4 &offset, const dim4 &stride); explicit Array(af::dim4 dims, TNJ::Node_ptr n); public: + + Array(af::dim4 dims, af::dim4 strides, dim_t offset, + const T * const in_data, bool is_device = false); + void resetInfo(const af::dim4& dims) { info.resetInfo(dims); } void resetDims(const af::dim4& dims) { info.resetDims(dims); } void modDims(const af::dim4 &newDims) { info.modDims(newDims); } diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp index 48bee655d1..366d8e2b52 100644 --- a/src/backend/cuda/Array.cpp +++ b/src/backend/cuda/Array.cpp @@ -86,6 +86,24 @@ namespace cuda { } + template + Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset_, + const T * const in_data, bool is_device) : + info(getActiveDeviceId(), dims, af::dim4(offset_), 
strides, (af_dtype)dtype_traits::af_type), + data(is_device ? (T*)in_data : memAlloc(info.elements()), memFree), + data_dims(dims), + node(), + offset(offset_), + ready(true), + owner(true) + { + if (!is_device) { + cudaStream_t stream = getStream(getActiveDeviceId()); + CUDA_CHECK(cudaMemcpyAsync(data.get(), in_data, info.elements() * sizeof(T), + cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + } template void Array::eval() @@ -275,6 +293,9 @@ namespace cuda bool copy); \ template void destroyArray (Array *A); \ template Array createNodeArray (const dim4 &size, JIT::Node_ptr node); \ + template Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset, \ + const T * const in_data, \ + bool is_device); \ template Array::Array(af::dim4 dims, const T * const in_data, \ bool is_device, bool copy_device); \ template Array::~Array (); \ diff --git a/src/backend/cuda/Array.hpp b/src/backend/cuda/Array.hpp index ad4396b48c..b8832db1c6 100644 --- a/src/backend/cuda/Array.hpp +++ b/src/backend/cuda/Array.hpp @@ -103,12 +103,16 @@ namespace cuda bool owner; Array(af::dim4 dims); + explicit Array(af::dim4 dims, const T * const in_data, bool is_device = false, bool copy_device = false); Array(const Array& parnt, const dim4 &dims, const dim4 &offset, const dim4 &stride); Array(Param &tmp); Array(af::dim4 dims, JIT::Node_ptr n); public: + Array(af::dim4 dims, af::dim4 strides, dim_t offset, + const T * const in_data, bool is_device = false); + void resetInfo(const af::dim4& dims) { info.resetInfo(dims); } void resetDims(const af::dim4& dims) { info.resetDims(dims); } void modDims(const af::dim4 &newDims) { info.modDims(newDims); } diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp index 178be5be32..f41b2c795d 100644 --- a/src/backend/opencl/Array.cpp +++ b/src/backend/opencl/Array.cpp @@ -99,6 +99,24 @@ namespace opencl { } + template + Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset_, + const T * const 
in_data, bool is_device) : + info(getActiveDeviceId(), dims, af::dim4(offset_), strides, (af_dtype)dtype_traits::af_type), + data(is_device ? + (new cl::Buffer((cl_mem)in_data)) : + (bufferAlloc(info.elements() * sizeof(T))), bufferFree), + data_dims(dims), + node(), + offset(offset_), + ready(true), + owner(true) + { + if (!is_device) { + getQueue().enqueueWriteBuffer(*data.get(), CL_TRUE, 0, sizeof(T) * info.elements(), in_data); + } + } + template void Array::eval() @@ -308,6 +326,9 @@ namespace opencl bool copy); \ template void destroyArray (Array *A); \ template Array createNodeArray (const dim4 &size, JIT::Node_ptr node); \ + template Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset, \ + const T * const in_data, \ + bool is_device); \ template Array::Array(af::dim4 dims, cl_mem mem, size_t src_offset, bool copy); \ template Array::~Array (); \ template Node_ptr Array::getNode() const; \ diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp index 4c8c05a231..d1a4d973c2 100644 --- a/src/backend/opencl/Array.hpp +++ b/src/backend/opencl/Array.hpp @@ -95,6 +95,7 @@ namespace opencl bool owner; Array(af::dim4 dims); + Array(const Array& parnt, const dim4 &dims, const dim4 &offset, const dim4 &stride); Array(Param &tmp); explicit Array(af::dim4 dims, JIT::Node_ptr n); @@ -103,6 +104,9 @@ namespace opencl public: + Array(af::dim4 dims, af::dim4 strides, dim_t offset, + const T * const in_data, bool is_device = false); + void resetInfo(const af::dim4& dims) { info.resetInfo(dims); } void resetDims(const af::dim4& dims) { info.resetDims(dims); } void modDims(const af::dim4 &newDims) { info.modDims(newDims); } From b6ccdefa24fb5aaebc1c49b4f2b54f07c6d36640 Mon Sep 17 00:00:00 2001 From: pradeep Date: Tue, 9 Feb 2016 12:17:46 +0530 Subject: [PATCH 238/288] Memory leak fix in af_median_all --- src/api/c/median.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/api/c/median.cpp b/src/api/c/median.cpp index 
716df78028..50bcad25ee 100644 --- a/src/api/c/median.cpp +++ b/src/api/c/median.cpp @@ -37,12 +37,18 @@ static double median(const af_array& in) Array sortedArr = sort(input, 0); + af_array sarrHandle = getHandle(sortedArr); + double result; T resPtr[2]; af_array res = 0; - AF_CHECK(af_index(&res, getHandle(sortedArr), 1, mdSpan)); + AF_CHECK(af_index(&res, sarrHandle, 1, mdSpan)); AF_CHECK(af_get_data_ptr((void*)&resPtr, res)); + AF_CHECK(af_release_array(res)); + AF_CHECK(af_release_array(sarrHandle)); + AF_CHECK(af_release_array(temp)); + if (nElems % 2 == 1) { result = resPtr[0]; } else { @@ -53,9 +59,6 @@ static double median(const af_array& in) } } - AF_CHECK(af_release_array(res)); - AF_CHECK(af_release_array(temp)); - return result; } From d0f401e6a21028e70230753c08c47217d1880d37 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 9 Feb 2016 15:16:59 -0500 Subject: [PATCH 239/288] Compile fix for armel architecture --- src/backend/cpu/kernel/sort.hpp | 1 + src/backend/cpu/kernel/sort_by_key.hpp | 1 + src/backend/cpu/kernel/sort_index.hpp | 1 + src/backend/cpu/queue.hpp | 37 ++++++++++++++++++++++++-- src/backend/cpu/tile.cpp | 1 - src/backend/cpu/transform.cpp | 1 - src/backend/cpu/transpose.cpp | 1 - src/backend/cpu/triangle.cpp | 1 - src/backend/cpu/unwrap.cpp | 1 - src/backend/cpu/where.cpp | 1 - src/backend/cpu/wrap.cpp | 1 - 11 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/backend/cpu/kernel/sort.hpp b/src/backend/cpu/kernel/sort.hpp index cba07fabdf..292c6383dc 100644 --- a/src/backend/cpu/kernel/sort.hpp +++ b/src/backend/cpu/kernel/sort.hpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace cpu { diff --git a/src/backend/cpu/kernel/sort_by_key.hpp b/src/backend/cpu/kernel/sort_by_key.hpp index 77713a7240..f9d391dc46 100644 --- a/src/backend/cpu/kernel/sort_by_key.hpp +++ b/src/backend/cpu/kernel/sort_by_key.hpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace cpu { diff --git 
a/src/backend/cpu/kernel/sort_index.hpp b/src/backend/cpu/kernel/sort_index.hpp index d2de05a559..b71cc47071 100644 --- a/src/backend/cpu/kernel/sort_index.hpp +++ b/src/backend/cpu/kernel/sort_index.hpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace cpu { diff --git a/src/backend/cpu/queue.hpp b/src/backend/cpu/queue.hpp index 942ae259b1..c321644444 100644 --- a/src/backend/cpu/queue.hpp +++ b/src/backend/cpu/queue.hpp @@ -8,7 +8,40 @@ ********************************************************/ #include + +//FIXME: Is there a better way to check for std::future not being supported ? +#if defined(__GNUC__) && (__GCC_ATOMIC_INT_LOCK_FREE < 2 || __GCC_ATOMIC_POINTER_LOCK_FREE < 2) + +#include +using std::function; +#include +#define __SYNCHRONOUS_ARCH 1 +class queue_impl +{ +public: + template + void enqueue(const F func, Args... args) const { + AF_ERROR("Incorrectly configured", AF_ERR_INTERNAL); + } + + void sync() const { + AF_ERROR("Incorrectly configured", AF_ERR_INTERNAL); + } + + bool is_worker() const { + AF_ERROR("Incorrectly configured", AF_ERR_INTERNAL); + return false; + } + +}; + +#else + #include +#define __SYNCHRONOUS_ARCH 0 +typedef async_queue queue_impl; + +#endif #pragma once @@ -18,7 +51,7 @@ namespace cpu { class queue { public: queue() - : sync_calls( getEnvVar("AF_SYNCHRONOUS_CALLS") == "1") {} + : sync_calls( __SYNCHRONOUS_ARCH == 1 || getEnvVar("AF_SYNCHRONOUS_CALLS") == "1") {} template void enqueue(const F func, Args... 
args) { @@ -40,7 +73,7 @@ class queue { private: const bool sync_calls; - async_queue aQueue; + queue_impl aQueue; }; } diff --git a/src/backend/cpu/tile.cpp b/src/backend/cpu/tile.cpp index 6526917d3a..0fe52c6398 100644 --- a/src/backend/cpu/tile.cpp +++ b/src/backend/cpu/tile.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include namespace cpu diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp index b2ab8dba79..3a76fb2f24 100644 --- a/src/backend/cpu/transform.cpp +++ b/src/backend/cpu/transform.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include "transform_interp.hpp" #include diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp index 32663e1f94..a6d410757b 100644 --- a/src/backend/cpu/transpose.cpp +++ b/src/backend/cpu/transpose.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp index 2a9553c83a..57f61b1331 100644 --- a/src/backend/cpu/triangle.cpp +++ b/src/backend/cpu/triangle.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include namespace cpu diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp index 1aa37a4762..d19286f496 100644 --- a/src/backend/cpu/unwrap.cpp +++ b/src/backend/cpu/unwrap.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include namespace cpu diff --git a/src/backend/cpu/where.cpp b/src/backend/cpu/where.cpp index 018cbdfc36..249327163d 100644 --- a/src/backend/cpu/where.cpp +++ b/src/backend/cpu/where.cpp @@ -17,7 +17,6 @@ #include #include #include -#include using af::dim4; diff --git a/src/backend/cpu/wrap.cpp b/src/backend/cpu/wrap.cpp index 07487e0d68..8e0f6fe2f7 100644 --- a/src/backend/cpu/wrap.cpp +++ b/src/backend/cpu/wrap.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include namespace cpu From 4d7b37a57a01369fdf1ac1b3efc71e746c42d140 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 9 
Feb 2016 15:33:00 -0500 Subject: [PATCH 240/288] Adding cmake option to disable async queues --- src/backend/cpu/CMakeLists.txt | 6 ++++++ src/backend/cpu/queue.hpp | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt index 5dee6de3f5..8ada1d6935 100644 --- a/src/backend/cpu/CMakeLists.txt +++ b/src/backend/cpu/CMakeLists.txt @@ -3,6 +3,12 @@ ADD_DEFINITIONS(-DAF_CPU) FIND_PACKAGE(CBLAS REQUIRED) +OPTION(BUILD_CPU_ASYNC "Build CPU backend with ASYNC support" ON) + +IF (NOT ${BUILD_CPU_ASYNC}) + ADD_DEFINITIONS(-DAF_DISABLE_CPU_ASYNC) +ENDIF() + IF(USE_CPU_F77_BLAS) MESSAGE("Using F77 BLAS") ADD_DEFINITIONS(-DUSE_F77_BLAS) diff --git a/src/backend/cpu/queue.hpp b/src/backend/cpu/queue.hpp index c321644444..6d32b85a65 100644 --- a/src/backend/cpu/queue.hpp +++ b/src/backend/cpu/queue.hpp @@ -10,7 +10,7 @@ #include //FIXME: Is there a better way to check for std::future not being supported ? -#if defined(__GNUC__) && (__GCC_ATOMIC_INT_LOCK_FREE < 2 || __GCC_ATOMIC_POINTER_LOCK_FREE < 2) +#if defined(AF_DISABLE_CPU_ASYNC) || (defined(__GNUC__) && (__GCC_ATOMIC_INT_LOCK_FREE < 2 || __GCC_ATOMIC_POINTER_LOCK_FREE < 2)) #include using std::function; From 16fd976597dffa12ae6793f87a192965c8c113db Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 9 Feb 2016 15:42:51 -0500 Subject: [PATCH 241/288] Changes to remove unneeded font --- assets | 2 +- docs/arrayfire.css | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/assets b/assets index 8030a5c626..f16f8bf74f 160000 --- a/assets +++ b/assets @@ -1 +1 @@ -Subproject commit 8030a5c626777a5b3f46b319dd4d1723eca4b0f9 +Subproject commit f16f8bf74fe4a255db05884cfff8f5cb0e6e8e09 diff --git a/docs/arrayfire.css b/docs/arrayfire.css index 75dba64e3a..e4fe2860be 100644 --- a/docs/arrayfire.css +++ b/docs/arrayfire.css @@ -52,12 +52,6 @@ a.codeRef, a.codeRef:visited, a.lineRef, a.lineRef:visited color : #4665A2; } 
-@font-face -{ - font-family : prototype; - src : url('Prototype.ttf'); -} - /*image and image groups*/ div.image_group { @@ -96,7 +90,6 @@ div.support * #under_logo { - font-family : prototype; font-size : 2em; max-width : 25px; color : #000000; @@ -104,7 +97,6 @@ div.support * #projectbrief { - font-family : prototype; color : #555555 } @@ -121,7 +113,6 @@ div.support * #projectname { - font-family : prototype; font-size : 3em; max-width : 25px; color : #555555 From 2325ca2134563896221656c25273026e9486b245 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 9 Feb 2016 16:05:48 -0500 Subject: [PATCH 242/288] Fixing memory leak in plot3 --- src/api/c/plot3.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/api/c/plot3.cpp b/src/api/c/plot3.cpp index 4e8742458a..63920bb3d6 100644 --- a/src/api/c/plot3.cpp +++ b/src/api/c/plot3.cpp @@ -48,13 +48,13 @@ fg::Plot3* setup_plot3(const af_array P, fg::PlotType ptype, fg::MarkerType mtyp T max[3], min[3]; if(P_dims[0] == 3) { - af_get_data_ptr(max, getHandle(reduce(pIn, 1))); - af_get_data_ptr(min, getHandle(reduce(pIn, 1))); + copyData(max, reduce(pIn, 1)); + copyData(min, reduce(pIn, 1)); } if(P_dims[1] == 3) { - af_get_data_ptr(max, getHandle(reduce(pIn, 0))); - af_get_data_ptr(min, getHandle(reduce(pIn, 0))); + copyData(max, reduce(pIn, 0)); + copyData(min, reduce(pIn, 0)); } ForgeManager& fgMngr = ForgeManager::getInstance(); From 05296fba34458aaa9506e6e4d1339328c42094b3 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 9 Feb 2016 16:19:20 -0500 Subject: [PATCH 243/288] Cleaning up code in plot3 --- src/api/c/plot3.cpp | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/api/c/plot3.cpp b/src/api/c/plot3.cpp index 63920bb3d6..2e18251b45 100644 --- a/src/api/c/plot3.cpp +++ b/src/api/c/plot3.cpp @@ -46,16 +46,13 @@ fg::Plot3* setup_plot3(const af_array P, fg::PlotType ptype, fg::MarkerType mtyp P_dims = pIn.dims(); } - T max[3], 
min[3]; - if(P_dims[0] == 3) { - copyData(max, reduce(pIn, 1)); - copyData(min, reduce(pIn, 1)); + if(P_dims[1] == 3){ + pIn = transpose(pIn, false); } - if(P_dims[1] == 3) { - copyData(max, reduce(pIn, 0)); - copyData(min, reduce(pIn, 0)); - } + T max[3], min[3]; + copyData(max, reduce(pIn, 1)); + copyData(min, reduce(pIn, 1)); ForgeManager& fgMngr = ForgeManager::getInstance(); fg::Plot3* plot3 = fgMngr.getPlot3(P_dims.elements()/3, getGLType(), ptype, mtype); @@ -64,12 +61,7 @@ fg::Plot3* setup_plot3(const af_array P, fg::PlotType ptype, fg::MarkerType mtyp max[1], min[1], max[2], min[2]); plot3->setAxesTitles("X Axis", "Y Axis", "Z Axis"); - - if(P_dims[1] == 3){ - pIn = transpose(pIn, false); - } copy_plot3(pIn, plot3); - return plot3; } From ff275362b8c80b9352b5766c547bde0416a51ecb Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 9 Feb 2016 16:19:38 -0500 Subject: [PATCH 244/288] Bugfixes, code clean up of plot - Fixes issues where X and Y indices are non column vectors - Avoids reorder by using row vectors --- src/api/c/plot.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/api/c/plot.cpp b/src/api/c/plot.cpp index 273d922f12..a812947228 100644 --- a/src/api/c/plot.cpp +++ b/src/api/c/plot.cpp @@ -39,14 +39,19 @@ fg::Plot* setup_plot(const af_array X, const af_array Y, fg::PlotType type, fg:: dim4 rdims(1, 0, 2, 3); - Array Z = join(1, xIn, yIn); - Array P = reorder(Z, rdims); + dim_t elements = xIn.elements(); + dim4 rowDims = dim4(1, elements, 1, 1); - ArrayInfo Xinfo = getInfo(X); - af::dim4 X_dims = Xinfo.dims(); + // Force the vectors to be row vectors + // This ensures we can use join(0,..) 
and skip reorder + xIn.modDims(rowDims); + yIn.modDims(rowDims); + + // join along first dimension, skip reorder + Array P = join(0, xIn, yIn); ForgeManager& fgMngr = ForgeManager::getInstance(); - fg::Plot* plot = fgMngr.getPlot(X_dims.elements(), getGLType(), type, marker); + fg::Plot* plot = fgMngr.getPlot(elements, getGLType(), type, marker); plot->setColor(1.0, 0.0, 0.0); plot->setAxesLimits(xmax, xmin, ymax, ymin); plot->setAxesTitles("X Axis", "Y Axis"); From 213c8e6c3711ed55420fcbb6267f7334cddaf43a Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 9 Feb 2016 16:39:20 -0500 Subject: [PATCH 245/288] Clean up of surface() - Avoids unnecessary reorders by transposing vectors (more efficient) --- src/api/c/surface.cpp | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/api/c/surface.cpp b/src/api/c/surface.cpp index 7db8441163..2394f5f96c 100644 --- a/src/api/c/surface.cpp +++ b/src/api/c/surface.cpp @@ -49,21 +49,29 @@ fg::Surface* setup_surface(const af_array xVals, const af_array yVals, const af_ af::dim4 Y_dims = Yinfo.dims(); af::dim4 Z_dims = Zinfo.dims(); - dim4 rdims(1, 0, 2, 3); - dim4 x_tdims(1, Y_dims[0], 1, 1); - dim4 y_tdims(1, X_dims[0], 1, 1); if(Xinfo.isVector()){ + // Convert xIn is a column vector + xIn.modDims(xIn.elements()); + // Now tile along second dimension + dim4 x_tdims(1, Y_dims[0], 1, 1); xIn = tile(xIn, x_tdims); + + // Convert yIn to a row vector + yIn.modDims(af::dim4(1, yIn.elements())); + // Now tile along first dimension + dim4 y_tdims(X_dims[0], 1, 1, 1); yIn = tile(yIn, y_tdims); - yIn = reorder(yIn, rdims); } - xIn.modDims(xIn.elements()); - yIn.modDims(yIn.elements()); - zIn.modDims(zIn.elements()); - Array Z = join(1, join(1, xIn, yIn), zIn); - Z = reorder(Z, rdims); - Z.modDims(Z.elements()); + // Flatten xIn, yIn and zIn into row vectors + dim4 rowDims = dim4(1, zIn.elements()); + xIn.modDims(rowDims); + yIn.modDims(rowDims); + zIn.modDims(rowDims); + + // Now 
join along first dimension, skip reorder + std::vector > inputs{xIn, yIn, zIn}; + Array Z = join(0, inputs); ForgeManager& fgMngr = ForgeManager::getInstance(); fg::Surface* surface = fgMngr.getSurface(Z_dims[0], Z_dims[1], getGLType()); From d7d79af742ba999f44c5c8b4275cae0dd5ae8c26 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Wed, 10 Feb 2016 23:30:55 -0500 Subject: [PATCH 246/288] BUGFIX: Fixing offsets when writing to Arrays for CPU and CUDA backends --- src/backend/cpu/Array.cpp | 4 ++-- src/backend/cuda/Array.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index fbb8e10b34..0db8a8203b 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -227,7 +227,7 @@ writeHostDataArray(Array &arr, const T * const data, const size_t bytes) if(!arr.isOwner()) { arr = createEmptyArray(arr.dims()); } - memcpy(arr.get() + arr.getOffset(), data, bytes); + memcpy(arr.get(), data, bytes); } template @@ -237,7 +237,7 @@ writeDeviceDataArray(Array &arr, const void * const data, const size_t bytes) if(!arr.isOwner()) { arr = createEmptyArray(arr.dims()); } - memcpy(arr.get() + arr.getOffset(), (const T * const)data, bytes); + memcpy(arr.get(), (const T * const)data, bytes); } #define INSTANTIATE(T) \ diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp index 366d8e2b52..c44db357e7 100644 --- a/src/backend/cuda/Array.cpp +++ b/src/backend/cuda/Array.cpp @@ -257,7 +257,7 @@ namespace cuda T *ptr = arr.get(); - CUDA_CHECK(cudaMemcpyAsync(ptr + arr.getOffset(), data, bytes, cudaMemcpyHostToDevice, + CUDA_CHECK(cudaMemcpyAsync(ptr, data, bytes, cudaMemcpyHostToDevice, cuda::getStream(cuda::getActiveDeviceId()))); CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId()))); @@ -274,7 +274,7 @@ namespace cuda T *ptr = arr.get(); - CUDA_CHECK(cudaMemcpyAsync(ptr + arr.getOffset(), data, + CUDA_CHECK(cudaMemcpyAsync(ptr, data, bytes, 
cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId()))); From 1b623a01721ba9ee4857fa7a3e88f6b0925fafe1 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 11 Feb 2016 00:09:47 -0500 Subject: [PATCH 247/288] Reorganizing offset to be inside ArrayInfo - Removed unnecessary dim_offset --- src/api/c/internal.cpp | 21 +---------- src/api/c/print.cpp | 3 +- src/backend/ArrayInfo.cpp | 9 ----- src/backend/ArrayInfo.hpp | 14 +++---- src/backend/cpu/Array.cpp | 43 +++++++++++---------- src/backend/cpu/Array.hpp | 8 ++-- src/backend/cpu/exampleFunction.cpp | 3 +- src/backend/cuda/Array.cpp | 52 ++++++++++++++------------ src/backend/cuda/Array.hpp | 8 ++-- src/backend/opencl/Array.cpp | 58 +++++++++++++++-------------- src/backend/opencl/Array.hpp | 6 +-- 11 files changed, 100 insertions(+), 125 deletions(-) diff --git a/src/api/c/internal.cpp b/src/api/c/internal.cpp index d086d431f1..d5f449e7ac 100644 --- a/src/api/c/internal.cpp +++ b/src/api/c/internal.cpp @@ -84,26 +84,7 @@ af_err af_get_offset(dim_t *offset, const af_array arr) { try { - dim_t res = 0; - - af_dtype ty = getInfo(arr).getType(); - - switch (ty) { - case f32: res = getArray(arr).getOffset(); break; - case f64: res = getArray(arr).getOffset(); break; - case c32: res = getArray(arr).getOffset(); break; - case c64: res = getArray(arr).getOffset(); break; - case u32: res = getArray(arr).getOffset(); break; - case s32: res = getArray(arr).getOffset(); break; - case u64: res = getArray(arr).getOffset(); break; - case s64: res = getArray(arr).getOffset(); break; - case u16: res = getArray(arr).getOffset(); break; - case s16: res = getArray(arr).getOffset(); break; - case b8 : res = getArray(arr).getOffset(); break; - case u8 : res = getArray(arr).getOffset(); break; - default: TYPE_ERROR(6, ty); - } - + dim_t res = getInfo(arr).getOffset(); std::swap(*offset, res); } CATCHALL; diff --git a/src/api/c/print.cpp b/src/api/c/print.cpp index 181dd3505f..b243491832 100644 --- 
a/src/api/c/print.cpp +++ b/src/api/c/print.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -69,7 +70,7 @@ static void print(const char *exp, af_array arr, const int precision, std::ostre os << "[" << info.dims() << "]\n"; #ifndef NDEBUG - os <<" Offsets: [" << info.offsets() << "]" << std::endl; + os <<" Offset: " << info.getOffset() << std::endl; os <<" Strides: [" << info.strides() << "]" << std::endl; #endif diff --git a/src/backend/ArrayInfo.cpp b/src/backend/ArrayInfo.cpp index 219bc1991c..43d2627a84 100644 --- a/src/backend/ArrayInfo.cpp +++ b/src/backend/ArrayInfo.cpp @@ -18,15 +18,6 @@ using af::dim4; -dim_t -calcOffset(const af::dim4 &strides, const af::dim4 &offsets) -{ - dim_t offset = 0; - for (int i = 0; i < 4; i++) offset += offsets[i] * strides[i]; - return offset; -} - - const ArrayInfo& getInfo(af_array arr) { diff --git a/src/backend/ArrayInfo.hpp b/src/backend/ArrayInfo.hpp index ca6fcd394c..38e5ea61ab 100644 --- a/src/backend/ArrayInfo.hpp +++ b/src/backend/ArrayInfo.hpp @@ -16,9 +16,6 @@ #include #include -dim_t -calcOffset(const af::dim4 &strides, const af::dim4 &offsets); - af::dim4 calcStrides(const af::dim4 &parentDim); @@ -48,14 +45,15 @@ class ArrayInfo int devId; af_dtype type; af::dim4 dim_size; - af::dim4 dim_offsets, dim_strides; + dim_t offset; + af::dim4 dim_strides; public: - ArrayInfo(int id, af::dim4 size, af::dim4 offset, af::dim4 stride, af_dtype af_type): + ArrayInfo(int id, af::dim4 size, dim_t offset_, af::dim4 stride, af_dtype af_type): devId(id), type(af_type), dim_size(size), - dim_offsets(offset), + offset(offset_), dim_strides(stride) { af_init(); @@ -77,7 +75,7 @@ class ArrayInfo const af_dtype& getType() const { return type; } - const af::dim4& offsets() const { return dim_offsets; } + dim_t getOffset() const { return offset; } const af::dim4& strides() const { return dim_strides; } @@ -97,7 +95,7 @@ class ArrayInfo { dim_size = dims; dim_strides = calcStrides(dims); - 
dim_offsets = af::dim4(0,0,0,0); + offset = 0; } void resetDims(const af::dim4& dims) diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 0db8a8203b..1b6098df41 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -33,16 +33,16 @@ using af::dim4; template Array::Array(dim4 dims): - info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(memAlloc(dims.elements()), memFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) { } template Array::Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device): - info(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data((is_device & !copy_device) ? (T*)in_data : memAlloc(dims.elements()), memFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) { static_assert(std::is_standard_layout>::value, "Array must be a standard layout type"); static_assert(offsetof(Array, info) == 0, "Array::info must be the first member variable of Array"); @@ -53,29 +53,27 @@ Array::Array(dim4 dims, const T * const in_data, bool is_device, bool copy_de template Array::Array(af::dim4 dims, TNJ::Node_ptr n) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(), data_dims(dims), - node(n), offset(0), ready(false), owner(true) + node(n), ready(false), owner(true) { } template -Array::Array(const Array& parent, const dim4 &dims, const dim4 &offsets, const dim4 &strides) : - info(parent.getDevId(), dims, offsets, strides, (af_dtype)dtype_traits::af_type), +Array::Array(const Array& 
parent, const dim4 &dims, const dim_t &offset_, const dim4 &strides) : + info(parent.getDevId(), dims, offset_, strides, (af_dtype)dtype_traits::af_type), data(parent.getData()), data_dims(parent.getDataDims()), node(), - offset(parent.getOffset() + calcOffset(parent.strides(), offsets)), ready(true), owner(false) { } template Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset_, const T * const in_data, bool is_device) : - info(getActiveDeviceId(), dims, af::dim4(offset_), strides, (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, offset_, strides, (af_dtype)dtype_traits::af_type), data(is_device ? (T*)in_data : memAlloc(info.elements()), memFree), data_dims(dims), node(), - offset(offset_), ready(true), owner(true) { @@ -119,7 +117,7 @@ Node_ptr Array::getNode() const BufferNode *buf_node = new BufferNode(data, bytes, - offset, + getOffset(), dims().get(), strides().get(), isLinear()); @@ -194,18 +192,23 @@ Array createSubArray(const Array& parent, dim4 dDims = parent.getDataDims(); dim4 pDims = parent.dims(); - dim4 dims = toDims (index, pDims); - dim4 offset = toOffset(index, dDims); - dim4 stride = toStride (index, dDims); + dim4 dims = toDims (index, pDims); + dim4 strides = toStride (index, dDims); - Array out = Array(parent, dims, offset, stride); + // Find total offsets after indexing + dim4 offsets = toOffset(index, pDims); + dim4 parent_strides = parent.strides(); + dim_t offset = parent.getOffset(); + for (int i = 0; i < 4; i++) offset += offsets[i] * parent_strides[i]; + + Array out = Array(parent, dims, offset, strides); if (!copy) return out; - if (stride[0] != 1 || - stride[1] < 0 || - stride[2] < 0 || - stride[3] < 0) { + if (strides[0] != 1 || + strides[1] < 0 || + strides[2] < 0 || + strides[3] < 0) { out = copyArray(out); } diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp index 891d867d7d..eb17852c27 100644 --- a/src/backend/cpu/Array.hpp +++ b/src/backend/cpu/Array.hpp @@ -100,7 +100,6 @@ namespace cpu 
af::dim4 data_dims; TNJ::Node_ptr node; - dim_t offset; bool ready; bool owner; @@ -108,7 +107,7 @@ namespace cpu Array(dim4 dims); explicit Array(dim4 dims, const T * const in_data, bool is_device, bool copy_device=false); - Array(const Array& parnt, const dim4 &dims, const dim4 &offset, const dim4 &stride); + Array(const Array& parnt, const dim4 &dims, const dim_t &offset, const dim4 &stride); explicit Array(af::dim4 dims, TNJ::Node_ptr n); public: @@ -127,7 +126,6 @@ namespace cpu RET_TYPE NAME() const { return info.NAME(); } INFO_FUNC(const af_dtype& ,getType) - INFO_FUNC(const af::dim4& ,offsets) INFO_FUNC(const af::dim4& ,strides) INFO_FUNC(size_t ,elements) INFO_FUNC(size_t ,ndims) @@ -165,7 +163,7 @@ namespace cpu void eval(); void eval() const; - dim_t getOffset() const { return offset; } + dim_t getOffset() const { return info.getOffset(); } shared_ptr getData() const {return data; } dim4 getDataDims() const @@ -197,7 +195,7 @@ namespace cpu const T* get(bool withOffset = true) const { if (!isReady()) eval(); - return data.get() + (withOffset ? offset : 0); + return data.get() + (withOffset ? 
getOffset() : 0); } int useCount() const diff --git a/src/backend/cpu/exampleFunction.cpp b/src/backend/cpu/exampleFunction.cpp index d45b8a28ec..0eb86462e1 100644 --- a/src/backend/cpu/exampleFunction.cpp +++ b/src/backend/cpu/exampleFunction.cpp @@ -44,7 +44,7 @@ Array exampleFunction(const Array &in, const af_someenum_t method) //dim4 in_dims = in.dims(); // you can retrieve dimensions - //dim4 in_offsets = in.offsets(); // you can retrieve offsets - used when given array + //dim_t in_offset = in.getOffset(); // you can retrieve the offset - used when given array // is an sub-array pointing to some other array and // doesn't have memory of its own @@ -77,4 +77,3 @@ INSTANTIATE(cfloat) INSTANTIATE(cdouble) } - diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp index c44db357e7..370e8eca31 100644 --- a/src/backend/cuda/Array.cpp +++ b/src/backend/cuda/Array.cpp @@ -30,17 +30,17 @@ namespace cuda template Array::Array(af::dim4 dims) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(memAlloc(dims.elements()), memFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) {} template Array::Array(af::dim4 dims, const T * const in_data, bool is_device, bool copy_device) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(((is_device & !copy_device) ? 
(T *)in_data : memAlloc(dims.elements())), memFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) { #if __cplusplus > 199711L static_assert(std::is_standard_layout>::value, "Array must be a standard layout type"); @@ -58,42 +58,41 @@ namespace cuda } template - Array::Array(const Array& parent, const dim4 &dims, const dim4 &offsets, const dim4 &strides) : - info(parent.getDevId(), dims, offsets, strides, (af_dtype)dtype_traits::af_type), + Array::Array(const Array& parent, const dim4 &dims, const dim_t &offset_, const dim4 &strides) : + info(parent.getDevId(), dims, offset_, strides, (af_dtype)dtype_traits::af_type), data(parent.getData()), data_dims(parent.getDataDims()), node(), - offset(parent.getOffset() + calcOffset(parent.strides(), offsets)), ready(true), owner(false) { } template Array::Array(Param &tmp) : - info(getActiveDeviceId(), af::dim4(tmp.dims[0], tmp.dims[1], tmp.dims[2], tmp.dims[3]), - af::dim4(0, 0, 0, 0), - af::dim4(tmp.strides[0], tmp.strides[1], tmp.strides[2], tmp.strides[3]), - (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), + af::dim4(tmp.dims[0], tmp.dims[1], tmp.dims[2], tmp.dims[3]), + 0, + af::dim4(tmp.strides[0], tmp.strides[1], tmp.strides[2], tmp.strides[3]), + (af_dtype)dtype_traits::af_type), data(tmp.ptr, memFree), data_dims(af::dim4(tmp.dims[0], tmp.dims[1], tmp.dims[2], tmp.dims[3])), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) { } template Array::Array(af::dim4 dims, JIT::Node_ptr n) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(), data_dims(dims), - node(n), offset(0), ready(false), owner(true) + node(n), ready(false), owner(true) { } template Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset_, const T * const in_data, bool is_device) : - 
info(getActiveDeviceId(), dims, af::dim4(offset_), strides, (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, offset_, strides, (af_dtype)dtype_traits::af_type), data(is_device ? (T*)in_data : memAlloc(info.elements()), memFree), data_dims(dims), node(), - offset(offset_), ready(true), owner(true) { @@ -216,18 +215,23 @@ namespace cuda dim4 dDims = parent.getDataDims(); dim4 pDims = parent.dims(); - dim4 dims = toDims (index, pDims); - dim4 offset = toOffset(index, dDims); - dim4 stride = toStride (index, dDims); + dim4 dims = toDims (index, pDims); + dim4 strides = toStride (index, dDims); - Array out = Array(parent, dims, offset, stride); + // Find total offsets after indexing + dim4 offsets = toOffset(index, pDims); + dim4 parent_strides = parent.strides(); + dim_t offset = parent.getOffset(); + for (int i = 0; i < 4; i++) offset += offsets[i] * parent_strides[i]; + + Array out = Array(parent, dims, offset, strides); if (!copy) return out; - if (stride[0] != 1 || - stride[1] < 0 || - stride[2] < 0 || - stride[3] < 0) { + if (strides[0] != 1 || + strides[1] < 0 || + strides[2] < 0 || + strides[3] < 0) { out = copyArray(out); } diff --git a/src/backend/cuda/Array.hpp b/src/backend/cuda/Array.hpp index b8832db1c6..c6cdd2121d 100644 --- a/src/backend/cuda/Array.hpp +++ b/src/backend/cuda/Array.hpp @@ -98,14 +98,13 @@ namespace cuda af::dim4 data_dims; JIT::Node_ptr node; - dim_t offset; bool ready; bool owner; Array(af::dim4 dims); explicit Array(af::dim4 dims, const T * const in_data, bool is_device = false, bool copy_device = false); - Array(const Array& parnt, const dim4 &dims, const dim4 &offset, const dim4 &stride); + Array(const Array& parnt, const dim4 &dims, const dim_t &offset, const dim4 &stride); Array(Param &tmp); Array(af::dim4 dims, JIT::Node_ptr n); public: @@ -123,7 +122,6 @@ namespace cuda RET_TYPE NAME() const { return info.NAME(); } INFO_FUNC(const af_dtype& ,getType) - INFO_FUNC(const af::dim4& ,offsets) INFO_FUNC(const 
af::dim4& ,strides) INFO_FUNC(size_t ,elements) INFO_FUNC(size_t ,ndims) @@ -160,7 +158,7 @@ namespace cuda void eval(); void eval() const; - dim_t getOffset() const { return offset; } + dim_t getOffset() const { return info.getOffset(); } shared_ptr getData() const { return data; } dim4 getDataDims() const @@ -193,7 +191,7 @@ namespace cuda const T* get(bool withOffset = true) const { if (!isReady()) eval(); - return data.get() + (withOffset ? offset : 0); + return data.get() + (withOffset ? getOffset() : 0); } int useCount() const diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp index f41b2c795d..fb3e63beaf 100644 --- a/src/backend/opencl/Array.cpp +++ b/src/backend/opencl/Array.cpp @@ -30,28 +30,28 @@ namespace opencl template Array::Array(af::dim4 dims) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(bufferAlloc(info.elements() * sizeof(T)), bufferFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) { } template Array::Array(af::dim4 dims, JIT::Node_ptr n) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(), data_dims(dims), - node(n), offset(0), ready(false), owner(true) + node(n), ready(false), owner(true) { } template Array::Array(af::dim4 dims, const T * const in_data) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(bufferAlloc(info.elements()*sizeof(T)), bufferFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) { static_assert(std::is_standard_layout>::value, "Array must be a 
standard layout type"); static_assert(offsetof(Array, info) == 0, "Array::info must be the first member variable of Array"); @@ -60,10 +60,10 @@ namespace opencl template Array::Array(af::dim4 dims, cl_mem mem, size_t src_offset, bool copy) : - info(getActiveDeviceId(), dims, af::dim4(0,0,0,0), calcStrides(dims), (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, 0, calcStrides(dims), (af_dtype)dtype_traits::af_type), data(copy ? bufferAlloc(info.elements() * sizeof(T)) : new cl::Buffer(mem), bufferFree), data_dims(dims), - node(), offset(0), ready(true), owner(true) + node(), ready(true), owner(true) { if (copy) { clRetainMemObject(mem); @@ -75,12 +75,11 @@ namespace opencl } template - Array::Array(const Array& parent, const dim4 &dims, const dim4 &offsets, const dim4 &stride) : - info(parent.getDevId(), dims, offsets, stride, (af_dtype)dtype_traits::af_type), + Array::Array(const Array& parent, const dim4 &dims, const dim_t &offset_, const dim4 &stride) : + info(parent.getDevId(), dims, offset_, stride, (af_dtype)dtype_traits::af_type), data(parent.getData()), data_dims(parent.getDataDims()), node(), - offset(parent.getOffset() + calcOffset(parent.strides(), offsets)), ready(true), owner(false) { } @@ -88,27 +87,27 @@ namespace opencl template Array::Array(Param &tmp) : - info(getActiveDeviceId(), af::dim4(tmp.info.dims[0], tmp.info.dims[1], tmp.info.dims[2], tmp.info.dims[3]), - af::dim4(0, 0, 0, 0), - af::dim4(tmp.info.strides[0], tmp.info.strides[1], - tmp.info.strides[2], tmp.info.strides[3]), - (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), + af::dim4(tmp.info.dims[0], tmp.info.dims[1], tmp.info.dims[2], tmp.info.dims[3]), + 0, + af::dim4(tmp.info.strides[0], tmp.info.strides[1], + tmp.info.strides[2], tmp.info.strides[3]), + (af_dtype)dtype_traits::af_type), data(tmp.data, bufferFree), data_dims(af::dim4(tmp.info.dims[0], tmp.info.dims[1], tmp.info.dims[2], tmp.info.dims[3])), - node(), offset(0), ready(true), owner(true) + 
node(), ready(true), owner(true) { } template Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset_, const T * const in_data, bool is_device) : - info(getActiveDeviceId(), dims, af::dim4(offset_), strides, (af_dtype)dtype_traits::af_type), + info(getActiveDeviceId(), dims, offset_, strides, (af_dtype)dtype_traits::af_type), data(is_device ? (new cl::Buffer((cl_mem)in_data)) : (bufferAlloc(info.elements() * sizeof(T))), bufferFree), data_dims(dims), node(), - offset(offset_), ready(true), owner(true) { @@ -204,18 +203,23 @@ namespace opencl dim4 dDims = parent.getDataDims(); dim4 pDims = parent.dims(); - dim4 dims = toDims (index, pDims); - dim4 offset = toOffset(index, dDims); - dim4 stride = toStride (index, dDims); + dim4 dims = toDims (index, pDims); + dim4 strides = toStride (index, dDims); - Array out = Array(parent, dims, offset, stride); + // Find total offsets after indexing + dim4 offsets = toOffset(index, pDims); + dim4 parent_strides = parent.strides(); + dim_t offset = parent.getOffset(); + for (int i = 0; i < 4; i++) offset += offsets[i] * parent_strides[i]; + + Array out = Array(parent, dims, offset, strides); if (!copy) return out; - if (stride[0] != 1 || - stride[1] < 0 || - stride[2] < 0 || - stride[3] < 0) { + if (strides[0] != 1 || + strides[1] < 0 || + strides[2] < 0 || + strides[3] < 0) { out = copyArray(out); } diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp index d1a4d973c2..207e303e52 100644 --- a/src/backend/opencl/Array.hpp +++ b/src/backend/opencl/Array.hpp @@ -90,13 +90,12 @@ namespace opencl af::dim4 data_dims; JIT::Node_ptr node; - dim_t offset; bool ready; bool owner; Array(af::dim4 dims); - Array(const Array& parnt, const dim4 &dims, const dim4 &offset, const dim4 &stride); + Array(const Array& parnt, const dim4 &dims, const dim_t &offset, const dim4 &stride); Array(Param &tmp); explicit Array(af::dim4 dims, JIT::Node_ptr n); explicit Array(af::dim4 dims, const T * const in_data); @@ -117,7 +116,6 @@ 
namespace opencl RET_TYPE NAME() const { return info.NAME(); } INFO_FUNC(const af_dtype& ,getType) - INFO_FUNC(const af::dim4& ,offsets) INFO_FUNC(const af::dim4& ,strides) INFO_FUNC(size_t ,elements) INFO_FUNC(size_t ,ndims) @@ -187,7 +185,7 @@ namespace opencl const dim_t getOffset() const { - return offset; + return info.getOffset(); } Buffer_ptr getData() const From 2d595034d8eff1d69df86912fa5f72356cbed834 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 11 Feb 2016 13:41:44 -0500 Subject: [PATCH 248/288] BUGFIX: Fixed issues with offsets in moddims after using indexing --- src/api/c/moddims.cpp | 1 + src/backend/cpu/Array.hpp | 5 +++++ src/backend/cuda/Array.hpp | 5 +++++ src/backend/opencl/Array.hpp | 5 +++++ 4 files changed, 16 insertions(+) diff --git a/src/api/c/moddims.cpp b/src/api/c/moddims.cpp index 132086a6ef..b8f1fafa6c 100644 --- a/src/api/c/moddims.cpp +++ b/src/api/c/moddims.cpp @@ -32,6 +32,7 @@ Array modDims(const Array& in, const af::dim4 &newDims) } Out.modDims(newDims); + Out.setDataDims(newDims); return Out; } diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp index eb17852c27..0c6e701981 100644 --- a/src/backend/cpu/Array.hpp +++ b/src/backend/cpu/Array.hpp @@ -173,6 +173,11 @@ namespace cpu return isOwner() ? info.dims() : data_dims; } + void setDataDims(const dim4 &new_dims) + { + data_dims = new_dims; + } + T* device() { getQueue().sync(); diff --git a/src/backend/cuda/Array.hpp b/src/backend/cuda/Array.hpp index c6cdd2121d..03bd8b3a29 100644 --- a/src/backend/cuda/Array.hpp +++ b/src/backend/cuda/Array.hpp @@ -168,6 +168,11 @@ namespace cuda return isOwner() ? 
dims() : data_dims; } + void setDataDims(const dim4 &new_dims) + { + data_dims = new_dims; + } + T* device() { if (!isOwner() || data.use_count() > 1) { diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp index 207e303e52..f2a217e001 100644 --- a/src/backend/opencl/Array.hpp +++ b/src/backend/opencl/Array.hpp @@ -200,6 +200,11 @@ namespace opencl return isOwner() ? dims() : data_dims; } + void setDataDims(const dim4 &new_dims) + { + data_dims = new_dims; + } + operator Param() const { KParam info = {{dims()[0], dims()[1], dims()[2], dims()[3]}, From b260cc8ffb2261b5a00cb36d6531fa4b43b747ea Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 11 Feb 2016 13:42:19 -0500 Subject: [PATCH 249/288] Fixes to internal functions - Was using incorrect number of elements for the total - Fixed copy because right now isOwner() does not mean isLinear() - Potentially improves performance when isLinear() is not isOwner() --- src/api/c/internal.cpp | 4 ++++ src/backend/ArrayInfo.hpp | 1 + src/backend/cpu/Array.cpp | 4 ++-- src/backend/cpu/copy.cpp | 2 +- src/backend/cuda/Array.cpp | 4 ++-- src/backend/cuda/copy.cu | 2 +- src/backend/opencl/Array.cpp | 4 ++-- src/backend/opencl/copy.cpp | 2 +- 8 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/api/c/internal.cpp b/src/api/c/internal.cpp index d5f449e7ac..cad4a466ce 100644 --- a/src/api/c/internal.cpp +++ b/src/api/c/internal.cpp @@ -40,6 +40,10 @@ af_err af_create_array_with_strides(af_array *arr, dim4 dims(ndims, dims_); dim4 strides(ndims, strides_); + for (int i = ndims; i < 4; i++) { + strides[i] = strides[i - 1] * dims[i - 1]; + } + bool isdev = location == afDevice; af_array res; diff --git a/src/backend/ArrayInfo.hpp b/src/backend/ArrayInfo.hpp index 38e5ea61ab..0983f06f28 100644 --- a/src/backend/ArrayInfo.hpp +++ b/src/backend/ArrayInfo.hpp @@ -82,6 +82,7 @@ class ArrayInfo size_t elements() const { return dim_size.elements(); } size_t ndims() const { return dim_size.ndims(); } 
const af::dim4& dims() const { return dim_size; } + size_t total() const { return offset + dim_strides[3] * dim_size[3]; } int getDevId() const; diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 1b6098df41..3edca877cd 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -71,14 +71,14 @@ template Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset_, const T * const in_data, bool is_device) : info(getActiveDeviceId(), dims, offset_, strides, (af_dtype)dtype_traits::af_type), - data(is_device ? (T*)in_data : memAlloc(info.elements()), memFree), + data(is_device ? (T*)in_data : memAlloc(info.total()), memFree), data_dims(dims), node(), ready(true), owner(true) { if (!is_device) { - std::copy(in_data, in_data + dims.elements(), data.get()); + std::copy(in_data, in_data + info.total(), data.get()); } } diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index f844d959a2..0da304b3ca 100644 --- a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp @@ -30,7 +30,7 @@ void copyData(T *to, const Array &from) { from.eval(); getQueue().sync(); - if(from.isOwner()) { + if(from.isLinear()) { // FIXME: Check for errors / exceptions memcpy(to, from.get(), from.elements()*sizeof(T)); } else { diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp index 370e8eca31..c1cf8102eb 100644 --- a/src/backend/cuda/Array.cpp +++ b/src/backend/cuda/Array.cpp @@ -90,7 +90,7 @@ namespace cuda Array::Array(af::dim4 dims, af::dim4 strides, dim_t offset_, const T * const in_data, bool is_device) : info(getActiveDeviceId(), dims, offset_, strides, (af_dtype)dtype_traits::af_type), - data(is_device ? (T*)in_data : memAlloc(info.elements()), memFree), + data(is_device ? 
(T*)in_data : memAlloc(info.total()), memFree), data_dims(dims), node(), ready(true), @@ -98,7 +98,7 @@ namespace cuda { if (!is_device) { cudaStream_t stream = getStream(getActiveDeviceId()); - CUDA_CHECK(cudaMemcpyAsync(data.get(), in_data, info.elements() * sizeof(T), + CUDA_CHECK(cudaMemcpyAsync(data.get(), in_data, info.total() * sizeof(T), cudaMemcpyHostToDevice, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); } diff --git a/src/backend/cuda/copy.cu b/src/backend/cuda/copy.cu index df435d245c..35e5c83178 100644 --- a/src/backend/cuda/copy.cu +++ b/src/backend/cuda/copy.cu @@ -28,7 +28,7 @@ namespace cuda Array out = A; const T *ptr = NULL; - if (A.isOwner() || // No offsets, No strides + if (A.isLinear() || // No offsets, No strides A.ndims() == 1 // Simple offset, no strides. ) { diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp index fb3e63beaf..bd576ca88a 100644 --- a/src/backend/opencl/Array.cpp +++ b/src/backend/opencl/Array.cpp @@ -105,14 +105,14 @@ namespace opencl info(getActiveDeviceId(), dims, offset_, strides, (af_dtype)dtype_traits::af_type), data(is_device ? (new cl::Buffer((cl_mem)in_data)) : - (bufferAlloc(info.elements() * sizeof(T))), bufferFree), + (bufferAlloc(info.total() * sizeof(T))), bufferFree), data_dims(dims), node(), ready(true), owner(true) { if (!is_device) { - getQueue().enqueueWriteBuffer(*data.get(), CL_TRUE, 0, sizeof(T) * info.elements(), in_data); + getQueue().enqueueWriteBuffer(*data.get(), CL_TRUE, 0, sizeof(T) * info.total(), in_data); } } diff --git a/src/backend/opencl/copy.cpp b/src/backend/opencl/copy.cpp index 39cbf4b59d..e1716f1632 100644 --- a/src/backend/opencl/copy.cpp +++ b/src/backend/opencl/copy.cpp @@ -29,7 +29,7 @@ namespace opencl cl::Buffer buf; Array out = A; - if (A.isOwner() || // No offsets, No strides + if (A.isLinear() || // No offsets, No strides A.ndims() == 1 // Simple offset, no strides. 
) { buf = *A.get(); From 6e9eacb152b119d2fd1e037bed4514b50abccb0a Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 11 Feb 2016 15:57:28 -0500 Subject: [PATCH 250/288] DOCS: Adding documentation for internal functions --- docs/details/internal.dox | 29 ++++++++++ include/af/internal.h | 119 ++++++++++++++++++++++++++++++++++++++ include/arrayfire.h | 9 +++ 3 files changed, 157 insertions(+) create mode 100644 docs/details/internal.dox diff --git a/docs/details/internal.dox b/docs/details/internal.dox new file mode 100644 index 0000000000..879f8196df --- /dev/null +++ b/docs/details/internal.dox @@ -0,0 +1,29 @@ +/** +\addtogroup internal_func +@{ + +\defgroup internal_func_create createArray + +Create an array with specified strides and offset. + + +\defgroup internal_func_strides getStrides + +Get strides of underlying data. + + +\defgroup internal_func_offset getOffset + +Get Offset of the underlying data. + + +\defgroup internal_func_linear isLinear + +Check if all elements in array are contiguous. + +\defgroup internal_func_owner isOwner + +Check if underlying data is owned by the current array. + +@} +*/ diff --git a/include/af/internal.h b/include/af/internal.h index fdd0158e2e..6ba42b3028 100644 --- a/include/af/internal.h +++ b/include/af/internal.h @@ -16,20 +16,78 @@ namespace af { class array; +#if AF_API_VERSION >= 33 + /** + \param[in] data is the raw data pointer. + \param[in] offset specifies the number of elements to skip. + \param[in] dims specifies the dimensions for the region of interest. + \param[in] strides specifies the distance between each element of a given dimension. + \param[in] ty specifies the data type of \p data. + \param[in] location specifies if the data is on host or the device. + + \note: If \p location is `afHost`, a memory copy is performed. + + \returns an af::array() with specified offset, dimensions and strides. 
+ + \ingroup internal_func_create + */ AFAPI array createArray(const void *data, const dim_t offset, const dim4 dims, const dim4 strides, const af::dtype ty, const af::source location); +#endif +#if AF_API_VERSION >= 33 + /** + \param[in] in An multi dimensional array. + \returns af::dim4() containing distance between consecutive elements in each dimension. + + \ingroup internal_func_strides + */ AFAPI dim4 getStrides(const array &in); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] in An multi dimensional array. + \returns offset from the starting location of data pointer specified in number of elements. + \ingroup internal_func_offset + */ AFAPI dim_t getOffset(const array &in); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] in An multi dimensional array. + \returns Returns the raw pointer location to the array. + \note This pointer may be shared with other arrays. Use this function with caution. + + \ingroup internal_func_rawptr + */ AFAPI void *getRawPtr(const array &in); +#endif +#if AF_API_VERSION >= 33 + /** + \param[in] in An multi dimensional array. + \returns a boolean specifying if all elements in the array are contiguous. + + \ingroup internal_func_linear + */ AFAPI bool isLinear(const array &in); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] in An multi dimensional array. + \returns a boolean specifying if the array owns the raw pointer. It is false if it is a sub array. + \ingroup internal_func_owner + */ AFAPI bool isOwner(const array &in); +#endif } #endif @@ -38,6 +96,21 @@ extern "C" { #endif +#if AF_API_VERSION >= 33 + /** + \param[out] arr an af_array with specified offset, dimensions and strides. + \param[in] data is the raw data pointer. + \param[in] offset specifies the number of elements to skip. + \param[in] ndims specifies the number of array dimensions. + \param[in] dims specifies the dimensions for the region of interest. + \param[in] strides specifies the distance between each element of a given dimension. 
+ \param[in] ty specifies the data type of \p data. + \param[in] location specifies if the data is on host or the device. + + \note If \p location is `afHost`, a memory copy is performed. + + \ingroup internal_func_create + */ AFAPI af_err af_create_array_with_strides(af_array *arr, const void *data, const dim_t offset, @@ -46,16 +119,62 @@ extern "C" const dim_t *const strides, const af_dtype ty, const af_source location); +#endif +#if AF_API_VERSION >= 33 + /** + \param[in] arr An multi dimensional array. + \param[out] s0 distance between each consecutive element along first dimension. + \param[out] s1 distance between each consecutive element along second dimension. + \param[out] s2 distance between each consecutive element along third dimension. + \param[out] s3 distance between each consecutive element along fourth dimension. + + \ingroup internal_func_strides + */ AFAPI af_err af_get_strides(dim_t *s0, dim_t *s1, dim_t *s2, dim_t *s3, const af_array arr); +#endif +#if AF_API_VERSION >= 33 + /** + \param[in] arr An multi dimensional array. + \param[out] offset: Offset from the starting location of data pointer specified in number of elements. distance between each consecutive element along first dimension. + + \ingroup internal_func_offset + */ AFAPI af_err af_get_offset(dim_t *offset, const af_array arr); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] arr An multi dimensional array. + \param[out] ptr the raw pointer location to the array. + \note This pointer may be shared with other arrays. Use this function with caution. + + \ingroup internal_func_rawptr + */ AFAPI af_err af_get_raw_ptr(void **ptr, const af_array arr); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] arr An multi dimensional array. + \param[out] result: a boolean specifying if all elements in the array are contiguous. 
+ \ingroup internal_func_linear + */ AFAPI af_err af_is_linear(bool *result, const af_array arr); +#endif + +#if AF_API_VERSION >= 33 + /** + \param[in] arr A multi-dimensional array. + \param[out] result a boolean specifying if the array owns the raw pointer. It is false if it is a sub array. + \ingroup internal_func_owner + */ AFAPI af_err af_is_owner(bool *result, const af_array arr); +#endif #ifdef __cplusplus } diff --git a/include/arrayfire.h b/include/arrayfire.h index 73b417b3ad..60df3176d1 100644 --- a/include/arrayfire.h +++ b/include/arrayfire.h @@ -209,6 +209,15 @@ @} + @defgroup internal_func Functions to work with internal array layout + @{ + + Functions to work with arrayfire's internal data structure. + + Note: The behavior of these functions is not guaranteed to be consistent across versions. + + @} + @defgroup external Interface Functions @{ From f6d02366edf0709b3af3ebbbaca6de43407975a0 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 11 Feb 2016 16:00:07 -0500 Subject: [PATCH 251/288] Renaming createArray to be createStridedArray --- docs/details/internal.dox | 2 +- include/af/internal.h | 24 ++++++++++++------------ src/api/c/internal.cpp | 16 ++++++++-------- src/api/cpp/internal.cpp | 14 +++++++------- src/api/unified/internal.cpp | 16 ++++++++-------- 5 files changed, 36 insertions(+), 36 deletions(-) diff --git a/docs/details/internal.dox b/docs/details/internal.dox index 879f8196df..5ac06422ca 100644 --- a/docs/details/internal.dox +++ b/docs/details/internal.dox @@ -2,7 +2,7 @@ \addtogroup internal_func @{ -\defgroup internal_func_create createArray +\defgroup internal_func_create createStridedArray Create an array with specified strides and offset.
diff --git a/include/af/internal.h b/include/af/internal.h index 6ba42b3028..53002929c3 100644 --- a/include/af/internal.h +++ b/include/af/internal.h @@ -31,10 +31,10 @@ namespace af \ingroup internal_func_create */ - AFAPI array createArray(const void *data, const dim_t offset, - const dim4 dims, const dim4 strides, - const af::dtype ty, - const af::source location); + AFAPI array createStridedArray(const void *data, const dim_t offset, + const dim4 dims, const dim4 strides, + const af::dtype ty, + const af::source location); #endif #if AF_API_VERSION >= 33 @@ -111,14 +111,14 @@ extern "C" \ingroup internal_func_create */ - AFAPI af_err af_create_array_with_strides(af_array *arr, - const void *data, - const dim_t offset, - const unsigned ndims, - const dim_t *const dims, - const dim_t *const strides, - const af_dtype ty, - const af_source location); + AFAPI af_err af_create_strided_array(af_array *arr, + const void *data, + const dim_t offset, + const unsigned ndims, + const dim_t *const dims, + const dim_t *const strides, + const af_dtype ty, + const af_source location); #endif #if AF_API_VERSION >= 33 diff --git a/src/api/c/internal.cpp b/src/api/c/internal.cpp index cad4a466ce..ac7a374f95 100644 --- a/src/api/c/internal.cpp +++ b/src/api/c/internal.cpp @@ -20,14 +20,14 @@ using namespace detail; -af_err af_create_array_with_strides(af_array *arr, - const void *data, - const dim_t offset, - const unsigned ndims, - const dim_t *const dims_, - const dim_t *const strides_, - const af_dtype ty, - const af_source location) +af_err af_create_strided_array(af_array *arr, + const void *data, + const dim_t offset, + const unsigned ndims, + const dim_t *const dims_, + const dim_t *const strides_, + const af_dtype ty, + const af_source location) { try { diff --git a/src/api/cpp/internal.cpp b/src/api/cpp/internal.cpp index f26f9f8bb4..bdce6e155c 100644 --- a/src/api/cpp/internal.cpp +++ b/src/api/cpp/internal.cpp @@ -13,15 +13,15 @@ namespace af { - array 
createArray(const void *data, const dim_t offset, - const dim4 dims, const dim4 strides, - const af::dtype ty, - const af::source location) + array createStridedArray(const void *data, const dim_t offset, + const dim4 dims, const dim4 strides, + const af::dtype ty, + const af::source location) { af_array res; - AF_THROW(af_create_array_with_strides(&res, data, offset, - dims.ndims(), dims.get(), strides.get(), - ty, location)); + AF_THROW(af_create_strided_array(&res, data, offset, + dims.ndims(), dims.get(), strides.get(), + ty, location)); return array(res); } diff --git a/src/api/unified/internal.cpp b/src/api/unified/internal.cpp index 7c223e741c..b9ac0ac277 100644 --- a/src/api/unified/internal.cpp +++ b/src/api/unified/internal.cpp @@ -11,14 +11,14 @@ #include "symbol_manager.hpp" -af_err af_create_array_with_strides(af_array *arr, - const void *data, - const dim_t offset, - const unsigned ndims, - const dim_t *const dims_, - const dim_t *const strides_, - const af_dtype ty, - const af_source location) +af_err af_create_strided_array(af_array *arr, + const void *data, + const dim_t offset, + const unsigned ndims, + const dim_t *const dims_, + const dim_t *const strides_, + const af_dtype ty, + const af_source location) { return CALL(arr, data, offset, ndims, dims_, strides_, ty, location); } From f728c03f5fec4d446b35a3bee05bae544cfe50cd Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 11 Feb 2016 16:05:57 -0500 Subject: [PATCH 252/288] Adding additional constraints when creating strided array --- src/api/c/internal.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/api/c/internal.cpp b/src/api/c/internal.cpp index ac7a374f95..8afdefa30f 100644 --- a/src/api/c/internal.cpp +++ b/src/api/c/internal.cpp @@ -37,6 +37,10 @@ af_err af_create_strided_array(af_array *arr, ARG_ASSERT(5, strides_ != NULL); ARG_ASSERT(5, strides_[0] == 1); + for (int i = 1; i < (int)ndims; i++) { + ARG_ASSERT(5, strides_[i] > 0); + } + dim4 dims(ndims, dims_); 
dim4 strides(ndims, strides_); From 7bc56a78cb31bff14bac6b5aa9d08c181c2be65c Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 11 Feb 2016 16:45:20 -0500 Subject: [PATCH 253/288] Adding functions to get raw pointer out of Array --- src/api/c/internal.cpp | 24 ++++++++++++------------ src/backend/cpu/Array.hpp | 7 +++++++ src/backend/cuda/Array.hpp | 7 +++++++ src/backend/opencl/Array.hpp | 7 +++++++ 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/src/api/c/internal.cpp b/src/api/c/internal.cpp index 8afdefa30f..47c62c6478 100644 --- a/src/api/c/internal.cpp +++ b/src/api/c/internal.cpp @@ -109,18 +109,18 @@ af_err af_get_raw_ptr(void **ptr, const af_array arr) af_dtype ty = getInfo(arr).getType(); switch (ty) { - case f32: res = (void *)getArray(arr).get(); break; - case f64: res = (void *)getArray(arr).get(); break; - case c32: res = (void *)getArray(arr).get(); break; - case c64: res = (void *)getArray(arr).get(); break; - case u32: res = (void *)getArray(arr).get(); break; - case s32: res = (void *)getArray(arr).get(); break; - case u64: res = (void *)getArray(arr).get(); break; - case s64: res = (void *)getArray(arr).get(); break; - case u16: res = (void *)getArray(arr).get(); break; - case s16: res = (void *)getArray(arr).get(); break; - case b8 : res = (void *)getArray(arr).get(); break; - case u8 : res = (void *)getArray(arr).get(); break; + case f32: res = (void *)getRawPtr(getArray(arr)); break; + case f64: res = (void *)getRawPtr(getArray(arr)); break; + case c32: res = (void *)getRawPtr(getArray(arr)); break; + case c64: res = (void *)getRawPtr(getArray(arr)); break; + case u32: res = (void *)getRawPtr(getArray(arr)); break; + case s32: res = (void *)getRawPtr(getArray(arr)); break; + case u64: res = (void *)getRawPtr(getArray(arr)); break; + case s64: res = (void *)getRawPtr(getArray(arr)); break; + case u16: res = (void *)getRawPtr(getArray(arr)); break; + case s16: res = (void *)getRawPtr(getArray(arr)); break; + case b8 : 
res = (void *)getRawPtr(getArray(arr)); break; + case u8 : res = (void *)getRawPtr(getArray(arr)); break; default: TYPE_ERROR(6, ty); } diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp index 0c6e701981..2a3afcf617 100644 --- a/src/backend/cpu/Array.hpp +++ b/src/backend/cpu/Array.hpp @@ -88,6 +88,12 @@ namespace cpu return (void *)ptr; } + template + void *getRawPtr(const Array& arr) + { + return (void *)(arr.get(false)); + } + // Array Array Implementation template class Array @@ -227,6 +233,7 @@ namespace cpu friend void destroyArray(Array *arr); friend void *getDevicePtr(const Array& arr); + friend void *getRawPtr(const Array& arr); }; } diff --git a/src/backend/cuda/Array.hpp b/src/backend/cuda/Array.hpp index 03bd8b3a29..7678754bc3 100644 --- a/src/backend/cuda/Array.hpp +++ b/src/backend/cuda/Array.hpp @@ -90,6 +90,12 @@ namespace cuda return (void *)ptr; } + template + void *getRawPtr(const Array& arr) + { + return (void *)(arr.get(false)); + } + template class Array { @@ -239,6 +245,7 @@ namespace cuda friend void destroyArray(Array *arr); friend void *getDevicePtr(const Array& arr); + friend void *getRawPtr(const Array& arr); }; } diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp index f2a217e001..8c5bda90de 100644 --- a/src/backend/opencl/Array.hpp +++ b/src/backend/opencl/Array.hpp @@ -82,6 +82,12 @@ namespace opencl return (void *)((*buf)()); } + template + void *getRawPtr(const Array& arr) + { + return (void *)(arr.get()); + } + template class Array { @@ -261,6 +267,7 @@ namespace opencl friend void destroyArray(Array *arr); friend void *getDevicePtr(const Array& arr); + friend void *getRawPtr(const Array& arr); }; } From 86ff134cac58bd104f10aebb51cd5bf4f897a160 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 11 Feb 2016 16:45:42 -0500 Subject: [PATCH 254/288] TEST: Adding tests for internal functions --- test/internal.cpp | 124 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 
insertions(+) create mode 100644 test/internal.cpp diff --git a/test/internal.cpp b/test/internal.cpp new file mode 100644 index 0000000000..75fa54fdb9 --- /dev/null +++ b/test/internal.cpp @@ -0,0 +1,124 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +TEST(Internal, CreateStrided) +{ + float ha[] = {1, + 101, 102, 103, 104, 105, + 201, 202, 203, 204, 205, + 301, 302, 303, 304, 305, + 401, 402, 403, 404, 405, + + 1010, 1020, 1030, 1040, 1050, + 2010, 2020, 2030, 2040, 2050, + 3010, 3020, 3030, 3040, 3050, + 4010, 4020, 4030, 4040, 4050}; + + dim_t offset = 1; + unsigned ndims = 3; + dim_t dims[] = {3, 3, 2}; + dim_t strides[] = {1, 5, 20}; + af::array a = createStridedArray((void *)ha, + offset, + af::dim4(ndims, dims), + af::dim4(ndims, strides), + f32, + afHost); + + af::dim4 astrides = getStrides(a); + af::dim4 adims = a.dims(); + + ASSERT_EQ(offset, getOffset(a)); + for (int i = 0; i < (int)ndims; i++) { + ASSERT_EQ(strides[i], astrides[i]); + ASSERT_EQ(dims[i], adims[i]); + } + + std::vector va(a.elements()); + a.host(&va[0]); + + int o = offset; + for (int k = 0; k < dims[2]; k++) { + for (int j = 0; j < dims[1]; j++) { + for (int i = 0; i < dims[0]; i++) { + ASSERT_EQ(va[i + j * dims[0] + k * dims[0] * dims[1]], + ha[i * strides[0] + j * strides[1] + k * strides[2] + o]) + << "at (" + << i << "," + << j << "," + << k << ")"; + } + } + } +} + +TEST(Internal, CheckInfo) +{ + int xdim = 10; + int ydim = 8; + + int xoff = 1; + int yoff = 2; + + int xnum = 5; + int ynum = 3; + + af::array a = af::randu(10, 8); + + af::array b = a(af::seq(xoff, xoff + xnum - 1), + af::seq(yoff, yoff + ynum - 
1)); + + af::dim4 strides = getStrides(b); + af::dim4 dims = b.dims(); + + dim_t offset = xoff + yoff * xdim; + + ASSERT_EQ(dims[0], xnum); + ASSERT_EQ(dims[1], ynum); + ASSERT_EQ(isOwner(a), true); + ASSERT_EQ(isOwner(b), false); + + ASSERT_EQ(getOffset(b), offset); + ASSERT_EQ(strides[0], 1); + ASSERT_EQ(strides[1], xdim); + ASSERT_EQ(strides[2], xdim * ydim); + ASSERT_EQ(getRawPtr(a), getRawPtr(b)); +} + +TEST(Internal, Linear) +{ + af::array c; + { + af::array a = af::randu(10, 8); + + // b is just pointing to same underlying data + // b is an owner; + af::array b = a; + ASSERT_EQ(isOwner(b), true); + + // C is considered sub array + // C will not be an owner + c = a(af::span); + ASSERT_EQ(isOwner(c), false); + } + + // Even though a and b are out of scope, c is still not an owner + { + ASSERT_EQ(isOwner(c), false); + } +} From 5be6cd78a869ad8fb3a147fb3ec793ca019391f8 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Fri, 12 Feb 2016 17:29:26 -0500 Subject: [PATCH 255/288] BUGFIX: Fixed bug in CUDA and OpenCL when re-using same JIT nodes. 
--- src/backend/cuda/JIT/BinaryNode.hpp | 6 +----- src/backend/cuda/JIT/BufferNode.hpp | 7 +------ src/backend/cuda/JIT/Node.hpp | 19 ++++++++++++++++++- src/backend/cuda/JIT/ScalarNode.hpp | 7 +------ src/backend/cuda/JIT/UnaryNode.hpp | 6 +----- src/backend/opencl/JIT/BinaryNode.hpp | 8 ++++---- src/backend/opencl/JIT/BufferNode.hpp | 9 +-------- src/backend/opencl/JIT/Node.hpp | 18 +++++++++++++++++- src/backend/opencl/JIT/ScalarNode.hpp | 11 ++--------- src/backend/opencl/JIT/UnaryNode.hpp | 7 +++---- 10 files changed, 49 insertions(+), 49 deletions(-) diff --git a/src/backend/cuda/JIT/BinaryNode.hpp b/src/backend/cuda/JIT/BinaryNode.hpp index 2a2abb0610..f916d85576 100644 --- a/src/backend/cuda/JIT/BinaryNode.hpp +++ b/src/backend/cuda/JIT/BinaryNode.hpp @@ -126,11 +126,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; - m_set_arg = false; + resetCommonFlags(); m_lhs->resetFlags(); m_rhs->resetFlags(); } diff --git a/src/backend/cuda/JIT/BufferNode.hpp b/src/backend/cuda/JIT/BufferNode.hpp index efe32f8b72..342e1ed0b7 100644 --- a/src/backend/cuda/JIT/BufferNode.hpp +++ b/src/backend/cuda/JIT/BufferNode.hpp @@ -178,12 +178,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; - m_gen_name = false; - m_set_arg = false; + resetCommonFlags(); } void setArgs(std::vector &args, bool is_linear) diff --git a/src/backend/cuda/JIT/Node.hpp b/src/backend/cuda/JIT/Node.hpp index e30a1cf63b..00fed9fda7 100644 --- a/src/backend/cuda/JIT/Node.hpp +++ b/src/backend/cuda/JIT/Node.hpp @@ -37,6 +37,19 @@ namespace JIT bool m_set_arg; bool m_gen_name; + protected: + + void resetCommonFlags() + { + m_set_id = false; + m_gen_func = false; + m_gen_param = false; + m_gen_offset = false; + m_set_arg = false; + m_gen_name = false; + } + + public: Node(const char *type_str, const char *name_str) @@ -62,7 +75,11 @@ namespace JIT virtual 
void setArgs(std::vector &args, bool is_linear) { m_set_arg = true; } virtual bool isLinear(dim_t dims[4]) { return true; } - virtual void resetFlags() {} + virtual void resetFlags() + { + resetCommonFlags(); + } + virtual void getInfo(unsigned &len, unsigned &buf_count, unsigned &bytes) { len = 0; diff --git a/src/backend/cuda/JIT/ScalarNode.hpp b/src/backend/cuda/JIT/ScalarNode.hpp index 288af4dcdb..34f316d34b 100644 --- a/src/backend/cuda/JIT/ScalarNode.hpp +++ b/src/backend/cuda/JIT/ScalarNode.hpp @@ -87,12 +87,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; - m_gen_name = false; - m_set_arg = false; + resetCommonFlags(); } void setArgs(std::vector &args, bool is_linear) diff --git a/src/backend/cuda/JIT/UnaryNode.hpp b/src/backend/cuda/JIT/UnaryNode.hpp index caa573104b..94ee96ece7 100644 --- a/src/backend/cuda/JIT/UnaryNode.hpp +++ b/src/backend/cuda/JIT/UnaryNode.hpp @@ -118,11 +118,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; - m_set_arg = false; + resetCommonFlags(); m_child->resetFlags(); } diff --git a/src/backend/opencl/JIT/BinaryNode.hpp b/src/backend/opencl/JIT/BinaryNode.hpp index f087760b87..b1f6d112b7 100644 --- a/src/backend/opencl/JIT/BinaryNode.hpp +++ b/src/backend/opencl/JIT/BinaryNode.hpp @@ -51,6 +51,9 @@ namespace JIT int setArgs(cl::Kernel &ker, int id) { + if (m_set_arg) return id; + m_set_arg = true; + id = m_lhs->setArgs(ker, id); id = m_rhs->setArgs(ker, id); return id; @@ -120,10 +123,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; + resetCommonFlags(); m_lhs->resetFlags(); m_rhs->resetFlags(); } diff --git a/src/backend/opencl/JIT/BufferNode.hpp b/src/backend/opencl/JIT/BufferNode.hpp index 71723b99df..9306d59ef5 100644 --- a/src/backend/opencl/JIT/BufferNode.hpp +++ 
b/src/backend/opencl/JIT/BufferNode.hpp @@ -24,7 +24,6 @@ namespace JIT const std::shared_ptr m_data; const Param m_param; const unsigned m_bytes; - bool m_set_arg; bool m_linear; public: @@ -39,7 +38,6 @@ namespace JIT m_data(data), m_param(param), m_bytes(bytes), - m_set_arg(false), m_linear(is_linear) {} @@ -140,12 +138,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; - m_gen_name = false; - m_set_arg = false; + resetCommonFlags(); } }; diff --git a/src/backend/opencl/JIT/Node.hpp b/src/backend/opencl/JIT/Node.hpp index fedf7fb9bd..fc34c09c19 100644 --- a/src/backend/opencl/JIT/Node.hpp +++ b/src/backend/opencl/JIT/Node.hpp @@ -32,8 +32,20 @@ namespace JIT bool m_gen_func; bool m_gen_param; bool m_gen_offset; + bool m_set_arg; bool m_gen_name; + protected: + void resetCommonFlags() + { + m_set_id = false; + m_gen_func = false; + m_gen_param = false; + m_gen_offset = false; + m_set_arg = false; + m_gen_name = false; + } + public: Node(const char *type_str, const char *name_str) @@ -44,6 +56,7 @@ namespace JIT m_gen_func(false), m_gen_param(false), m_gen_offset(false), + m_set_arg(false), m_gen_name(false) {} @@ -64,7 +77,10 @@ namespace JIT } - virtual void resetFlags() {} + virtual void resetFlags() + { + resetCommonFlags(); + } virtual bool isLinear(dim_t dims[4]) { return true; } diff --git a/src/backend/opencl/JIT/ScalarNode.hpp b/src/backend/opencl/JIT/ScalarNode.hpp index 9eaa544134..0bba7a2fc9 100644 --- a/src/backend/opencl/JIT/ScalarNode.hpp +++ b/src/backend/opencl/JIT/ScalarNode.hpp @@ -24,14 +24,12 @@ namespace JIT { private: const T m_val; - bool m_set_arg; public: ScalarNode(T val) : Node(dtype_traits::getName(), shortname(false)), - m_val(val), - m_set_arg(false) + m_val(val) { } @@ -101,12 +99,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; - m_gen_name = false; - m_set_arg = false; + 
resetCommonFlags(); } }; diff --git a/src/backend/opencl/JIT/UnaryNode.hpp b/src/backend/opencl/JIT/UnaryNode.hpp index 78fda23e92..e1f32ded8f 100644 --- a/src/backend/opencl/JIT/UnaryNode.hpp +++ b/src/backend/opencl/JIT/UnaryNode.hpp @@ -49,6 +49,8 @@ namespace JIT int setArgs(cl::Kernel &ker, int id) { + if (m_set_arg) return id; + m_set_arg = true; return m_child->setArgs(ker, id); } @@ -108,10 +110,7 @@ namespace JIT void resetFlags() { - m_set_id = false; - m_gen_func = false; - m_gen_param = false; - m_gen_offset = false; + resetCommonFlags(); m_child->resetFlags(); } }; From 3faa83dc14c657e5ebd6de4d60d5d8a67a8fa138 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Fri, 12 Feb 2016 18:42:06 -0500 Subject: [PATCH 256/288] Adding tests for to check for resetting in JIT --- test/jit.cpp | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/test/jit.cpp b/test/jit.cpp index 3c2308d5eb..a20b0f4b19 100644 --- a/test/jit.cpp +++ b/test/jit.cpp @@ -65,3 +65,53 @@ TEST(JIT, CPP_JIT_HASH) delete[] hF2; } } + +TEST(JIT, CPP_JIT_Reset_Binary) +{ + using af::array; + + af::array a = af::constant(2, 5,5); + af::array b = af::constant(1, 5,5); + af::array c = a + b; + af::array d = a - b; + af::array e = c * d; + e.eval(); + af::array f = c - d; + f.eval(); + af::array g = d - c; + g.eval(); + + std::vector hf(f.elements()); + std::vector hg(g.elements()); + f.host(&hf[0]); + g.host(&hg[0]); + + for (int i = 0; i < (int)f.elements(); i++) { + ASSERT_EQ(hf[i], -hg[i]); + } +} + +TEST(JIT, CPP_JIT_Reset_Unary) +{ + using af::array; + + af::array a = af::constant(2, 5,5); + af::array b = af::constant(1, 5,5); + af::array c = af::sin(a); + af::array d = af::cos(b); + af::array e = c * d; + e.eval(); + af::array f = c - d; + f.eval(); + af::array g = d - c; + g.eval(); + + std::vector hf(f.elements()); + std::vector hg(g.elements()); + f.host(&hf[0]); + g.host(&hg[0]); + + for (int i = 0; i < (int)f.elements(); i++) { + 
ASSERT_EQ(hf[i], -hg[i]); + } +} From 199ea82b15314a3be3e28675185395b972a13ff0 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Sun, 14 Feb 2016 21:46:48 -0500 Subject: [PATCH 257/288] Moving af_get_version to version.cpp --- src/api/c/device.cpp | 9 --------- src/api/c/version.cpp | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index 304d0c753b..937b0a66c5 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -95,15 +95,6 @@ af_err af_info_string(char **str, const bool verbose) return AF_SUCCESS; } -af_err af_get_version(int *major, int *minor, int *patch) -{ - *major = AF_VERSION_MAJOR; - *minor = AF_VERSION_MINOR; - *patch = AF_VERSION_PATCH; - - return AF_SUCCESS; -} - af_err af_device_info(char* d_name, char* d_platform, char *d_toolkit, char* d_compute) { try { diff --git a/src/api/c/version.cpp b/src/api/c/version.cpp index 4eb7883a41..91d24cb823 100644 --- a/src/api/c/version.cpp +++ b/src/api/c/version.cpp @@ -10,6 +10,15 @@ #include #include +af_err af_get_version(int *major, int *minor, int *patch) +{ + *major = AF_VERSION_MAJOR; + *minor = AF_VERSION_MINOR; + *patch = AF_VERSION_PATCH; + + return AF_SUCCESS; +} + const char *af_get_revision() { return AF_REVISION; From 11aa9339fc60a8888b76e140ce933e773386fab1 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Sun, 14 Feb 2016 21:47:48 -0500 Subject: [PATCH 258/288] Fixing af_get_last_error for unified backend The way it was implemented, it was getting the last error generated from unified backend. This change gets the last error generated from a particular backend instead. 
--- src/api/c/err_common.cpp | 61 +++------------------- src/api/c/err_common.hpp | 4 ++ src/api/c/error.cpp | 65 ++++++++++++++++++++++++ src/api/c/index.cpp | 68 +++++++++++++++++++++++++ src/api/c/util.cpp | 81 ------------------------------ src/api/unified/CMakeLists.txt | 13 +++-- src/api/unified/error.cpp | 26 ++++++++++ src/api/unified/index.cpp | 34 +++++++++++++ src/api/unified/symbol_manager.hpp | 8 +++ src/api/unified/util.cpp | 5 -- 10 files changed, 218 insertions(+), 147 deletions(-) create mode 100644 src/api/c/error.cpp delete mode 100644 src/api/c/util.cpp create mode 100644 src/api/unified/error.cpp diff --git a/src/api/c/err_common.cpp b/src/api/c/err_common.cpp index 495967a891..e95ece6d4b 100644 --- a/src/api/c/err_common.cpp +++ b/src/api/c/err_common.cpp @@ -150,9 +150,6 @@ int DimensionError::getArgIndex() const return argIndex; } -static const int MAX_ERR_SIZE = 1024; -static std::string global_err_string; - void print_error(const string &msg) { @@ -161,57 +158,7 @@ print_error(const string &msg) if(perr != "0") fprintf(stderr, "%s\n", msg.c_str()); } - global_err_string = msg; -} - -void af_get_last_error(char **str, dim_t *len) -{ - dim_t slen = std::min(MAX_ERR_SIZE, (int)global_err_string.size()); - - if (len && slen == 0) { - *len = 0; - *str = NULL; - return; - } - - af_alloc_host((void**)str, sizeof(char) * (slen + 1)); - global_err_string.copy(*str, slen); - - (*str)[slen] = '\0'; - global_err_string = std::string(""); - - if(len) *len = slen; -} - -const char *af_err_to_string(const af_err err) -{ - switch (err) { - case AF_SUCCESS: return "Success"; - case AF_ERR_NO_MEM: return "Device out of memory"; - case AF_ERR_DRIVER: return "Driver not available or incompatible"; - case AF_ERR_RUNTIME: return "Runtime error "; - case AF_ERR_INVALID_ARRAY: return "Invalid array"; - case AF_ERR_ARG: return "Invalid input argument"; - case AF_ERR_SIZE: return "Invalid input size"; - case AF_ERR_TYPE: return "Function does not support this data 
type"; - case AF_ERR_DIFF_TYPE: return "Input types are not the same"; - case AF_ERR_BATCH: return "Invalid batch configuration"; - case AF_ERR_NOT_SUPPORTED: return "Function not supported"; - case AF_ERR_NOT_CONFIGURED: return "Function not configured to build"; - case AF_ERR_NONFREE: return "Function unavailable. " - "ArrayFire compiled without Non-Free algorithms support"; - case AF_ERR_NO_DBL: return "Double precision not supported for this device"; - case AF_ERR_NO_GFX: return "Graphics functionality unavailable. " - "ArrayFire compiled without Graphics support"; - case AF_ERR_LOAD_LIB: return "Failed to load dynamic library. " - "See http://www.arrayfire.com/docs/unifiedbackend.htm " - "for instructions to set up environment for Unified backend"; - case AF_ERR_LOAD_SYM: return "Failed to load symbol"; - case AF_ERR_ARR_BKND_MISMATCH: return "There was a mismatch between an array and the current backend"; - case AF_ERR_INTERNAL: return "Internal error"; - case AF_ERR_UNKNOWN: - default: return "Unknown error"; - } + get_global_error_string() = msg; } af_err processException() @@ -271,3 +218,9 @@ af_err processException() return err; } + +std::string& get_global_error_string() +{ + static std::string global_error_string = std::string(""); + return global_error_string; +} diff --git a/src/api/c/err_common.hpp b/src/api/c/err_common.hpp index c8eb90a7f6..60ef64276b 100644 --- a/src/api/c/err_common.hpp +++ b/src/api/c/err_common.hpp @@ -203,3 +203,7 @@ void print_error(const std::string &msg); __AF_FILENAME__, __LINE__, \ "\n", __err); \ } while(0) + + +static const int MAX_ERR_SIZE = 1024; +std::string& get_global_error_string(); diff --git a/src/api/c/error.cpp b/src/api/c/error.cpp new file mode 100644 index 0000000000..4a7d4b29b9 --- /dev/null +++ b/src/api/c/error.cpp @@ -0,0 +1,65 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. 
+ * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include + +void af_get_last_error(char **str, dim_t *len) +{ + std::string &global_error_string = get_global_error_string(); + dim_t slen = std::min(MAX_ERR_SIZE, (int)global_error_string.size()); + + if (len && slen == 0) { + *len = 0; + *str = NULL; + return; + } + + af_alloc_host((void**)str, sizeof(char) * (slen + 1)); + global_error_string.copy(*str, slen); + + (*str)[slen] = '\0'; + global_error_string = std::string(""); + + if(len) *len = slen; +} + +const char *af_err_to_string(const af_err err) +{ + switch (err) { + case AF_SUCCESS: return "Success"; + case AF_ERR_NO_MEM: return "Device out of memory"; + case AF_ERR_DRIVER: return "Driver not available or incompatible"; + case AF_ERR_RUNTIME: return "Runtime error "; + case AF_ERR_INVALID_ARRAY: return "Invalid array"; + case AF_ERR_ARG: return "Invalid input argument"; + case AF_ERR_SIZE: return "Invalid input size"; + case AF_ERR_TYPE: return "Function does not support this data type"; + case AF_ERR_DIFF_TYPE: return "Input types are not the same"; + case AF_ERR_BATCH: return "Invalid batch configuration"; + case AF_ERR_NOT_SUPPORTED: return "Function not supported"; + case AF_ERR_NOT_CONFIGURED: return "Function not configured to build"; + case AF_ERR_NONFREE: return "Function unavailable. " + "ArrayFire compiled without Non-Free algorithms support"; + case AF_ERR_NO_DBL: return "Double precision not supported for this device"; + case AF_ERR_NO_GFX: return "Graphics functionality unavailable. " + "ArrayFire compiled without Graphics support"; + case AF_ERR_LOAD_LIB: return "Failed to load dynamic library. 
" + "See http://www.arrayfire.com/docs/unifiedbackend.htm " + "for instructions to set up environment for Unified backend"; + case AF_ERR_LOAD_SYM: return "Failed to load symbol"; + case AF_ERR_ARR_BKND_MISMATCH: return "There was a mismatch between an array and the current backend"; + case AF_ERR_INTERNAL: return "Internal error"; + case AF_ERR_UNKNOWN: + default: return "Unknown error"; + } +} diff --git a/src/api/c/index.cpp b/src/api/c/index.cpp index f5a214f8e5..4a20ca2b34 100644 --- a/src/api/c/index.cpp +++ b/src/api/c/index.cpp @@ -230,3 +230,71 @@ af_err af_index_gen(af_array *out, const af_array in, const dim_t ndims, const a return AF_SUCCESS; } + +af_seq af_make_seq(double begin, double end, double step) +{ + af_seq seq = {begin, end, step}; + return seq; +} + +af_err af_create_indexers(af_index_t** indexers) +{ + try { + af_index_t* out = new af_index_t[4]; + std::swap(*indexers, out); + } + CATCHALL; + return AF_SUCCESS; +} + +af_err af_set_array_indexer(af_index_t* indexer, const af_array idx, const dim_t dim) +{ + try { + ARG_ASSERT(0, (indexer!=NULL)); + ARG_ASSERT(1, (idx!=NULL)); + ARG_ASSERT(2, (dim>=0 && dim<=3)); + indexer[dim].idx.arr = idx; + indexer[dim].isBatch = false; + indexer[dim].isSeq = false; + } + CATCHALL + return AF_SUCCESS; +} + +af_err af_set_seq_indexer(af_index_t* indexer, const af_seq* idx, const dim_t dim, const bool is_batch) +{ + try { + ARG_ASSERT(0, (indexer!=NULL)); + ARG_ASSERT(1, (idx!=NULL)); + ARG_ASSERT(2, (dim>=0 && dim<=3)); + indexer[dim].idx.seq = *idx; + indexer[dim].isBatch = is_batch; + indexer[dim].isSeq = true; + } + CATCHALL + return AF_SUCCESS; +} + +af_err af_set_seq_param_indexer(af_index_t* indexer, + const double begin, const double end, const double step, + const dim_t dim, const bool is_batch) +{ + try { + ARG_ASSERT(0, (indexer!=NULL)); + ARG_ASSERT(4, (dim>=0 && dim<=3)); + indexer[dim].idx.seq = af_make_seq(begin, end, step); + indexer[dim].isBatch = is_batch; + indexer[dim].isSeq = true; + } + 
CATCHALL + return AF_SUCCESS; +} + +af_err af_release_indexers(af_index_t* indexers) +{ + try { + delete[] indexers; + } + CATCHALL; + return AF_SUCCESS; +} diff --git a/src/api/c/util.cpp b/src/api/c/util.cpp deleted file mode 100644 index 9b16fe98df..0000000000 --- a/src/api/c/util.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include -// The following should be included using double quotes -// to enable it's use in unified wrapper -#include "err_common.hpp" - -af_seq af_make_seq(double begin, double end, double step) -{ - af_seq seq = {begin, end, step}; - return seq; -} - -af_err af_create_indexers(af_index_t** indexers) -{ - try { - af_index_t* out = new af_index_t[4]; - std::swap(*indexers, out); - } - CATCHALL; - return AF_SUCCESS; -} - -af_err af_set_array_indexer(af_index_t* indexer, const af_array idx, const dim_t dim) -{ - try { - ARG_ASSERT(0, (indexer!=NULL)); - ARG_ASSERT(1, (idx!=NULL)); - ARG_ASSERT(2, (dim>=0 && dim<=3)); - indexer[dim].idx.arr = idx; - indexer[dim].isBatch = false; - indexer[dim].isSeq = false; - } - CATCHALL - return AF_SUCCESS; -} - -af_err af_set_seq_indexer(af_index_t* indexer, const af_seq* idx, const dim_t dim, const bool is_batch) -{ - try { - ARG_ASSERT(0, (indexer!=NULL)); - ARG_ASSERT(1, (idx!=NULL)); - ARG_ASSERT(2, (dim>=0 && dim<=3)); - indexer[dim].idx.seq = *idx; - indexer[dim].isBatch = is_batch; - indexer[dim].isSeq = true; - } - CATCHALL - return AF_SUCCESS; -} - -af_err af_set_seq_param_indexer(af_index_t* indexer, - const double begin, const double end, const double step, - const dim_t dim, const bool is_batch) -{ - try { - ARG_ASSERT(0, (indexer!=NULL)); - ARG_ASSERT(4, (dim>=0 && 
dim<=3)); - indexer[dim].idx.seq = af_make_seq(begin, end, step); - indexer[dim].isBatch = is_batch; - indexer[dim].isSeq = true; - } - CATCHALL - return AF_SUCCESS; -} - -af_err af_release_indexers(af_index_t* indexers) -{ - try { - delete[] indexers; - } - CATCHALL; - return AF_SUCCESS; -} diff --git a/src/api/unified/CMakeLists.txt b/src/api/unified/CMakeLists.txt index 6ed95d088c..c44e43b5fc 100644 --- a/src/api/unified/CMakeLists.txt +++ b/src/api/unified/CMakeLists.txt @@ -15,13 +15,12 @@ FILE(GLOB cpp_sources SOURCE_GROUP(api\\cpp\\Sources FILES ${cpp_sources}) FILE(GLOB common_sources - "../c/util.cpp" - "../c/err_common.cpp" - "../c/type_util.cpp" - "../c/version.cpp" - "../../backend/dim4.cpp" - "../../backend/util.cpp" - ) + "../c/version.cpp" + "../c/err_common.cpp" + "../c/type_util.cpp" + "../../backend/dim4.cpp" + "../../backend/util.cpp" + ) SOURCE_GROUP(common FILES ${common_sources}) diff --git a/src/api/unified/error.cpp b/src/api/unified/error.cpp new file mode 100644 index 0000000000..00b07396a1 --- /dev/null +++ b/src/api/unified/error.cpp @@ -0,0 +1,26 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include "symbol_manager.hpp" + +void af_get_last_error(char **str, dim_t *len) +{ + typedef void(*af_func)(char **, dim_t *); + af_func func = (af_func)LOAD_SYMBOL(); + return func(str, len); +} + +const char *af_err_to_string(const af_err err) +{ + typedef char *(*af_func)(af_err); + af_func func = (af_func)LOAD_SYMBOL(); + return func(err); +} diff --git a/src/api/unified/index.cpp b/src/api/unified/index.cpp index 0927dd8b71..4df5926d62 100644 --- a/src/api/unified/index.cpp +++ b/src/api/unified/index.cpp @@ -52,3 +52,37 @@ af_err af_assign_gen( af_array *out, CHECK_ARRAYS(lhs, rhs); return CALL(out, lhs, ndims, indices, rhs); } + +af_seq af_make_seq(double begin, double end, double step) +{ + af_seq seq = {begin, end, step}; + return seq; +} + +af_err af_create_indexers(af_index_t** indexers) +{ + return CALL(indexers); +} + +af_err af_set_array_indexer(af_index_t* indexer, const af_array idx, const dim_t dim) +{ + CHECK_ARRAYS(idx); + return CALL(indexer, idx, dim); +} + +af_err af_set_seq_indexer(af_index_t* indexer, const af_seq* idx, const dim_t dim, const bool is_batch) +{ + return CALL(indexer, idx, dim, is_batch); +} + +af_err af_set_seq_param_indexer(af_index_t* indexer, + const double begin, const double end, const double step, + const dim_t dim, const bool is_batch) +{ + return CALL(indexer, begin, end, step, dim, is_batch); +} + +af_err af_release_indexers(af_index_t* indexers) +{ + return CALL(indexers); +} diff --git a/src/api/unified/symbol_manager.hpp b/src/api/unified/symbol_manager.hpp index 048d1843c7..1530102022 100644 --- a/src/api/unified/symbol_manager.hpp +++ b/src/api/unified/symbol_manager.hpp @@ -59,6 +59,8 @@ class AFSymbolManager { return funcHandle(args...); } + LibHandle getHandle() { return activeHandle; } + protected: AFSymbolManager(); @@ -108,3 
+110,9 @@ bool checkArrays(af_backend activeBackend, T a, Args... arg) #define CALL(...) unified::AFSymbolManager::getInstance().call(__func__, __VA_ARGS__) #define CALL_NO_PARAMS() unified::AFSymbolManager::getInstance().call(__func__) #endif + +#if defined(OS_WIN) +#define LOAD_SYMBOL() GetProcAddress(unified::AFSymbolManager::getInstance().getHandle(), __FUNCTION__) +#else +#define LOAD_SYMBOL() dlsym(unified::AFSymbolManager::getInstance().getHandle(), __func__) +#endif diff --git a/src/api/unified/util.cpp b/src/api/unified/util.cpp index 155c4f81b9..178ac87ad8 100644 --- a/src/api/unified/util.cpp +++ b/src/api/unified/util.cpp @@ -56,8 +56,3 @@ af_err af_example_function(af_array* out, const af_array in, const af_someenum_t CHECK_ARRAYS(in); return CALL(out, in, param); } - -af_err af_get_version(int *major, int *minor, int *patch) -{ - return CALL(major, minor, patch); -} From fd87af4a91e302efdebc71e43655c1bb16ccbea7 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 15 Feb 2016 19:32:09 -0500 Subject: [PATCH 259/288] Add better error messages coming out of unified api --- src/api/c/error.cpp | 4 +--- src/api/unified/array.cpp | 1 + src/api/unified/error.cpp | 31 +++++++++++++++++++++++++++--- src/api/unified/symbol_manager.cpp | 7 ++++--- src/api/unified/symbol_manager.hpp | 29 ++++++++++++++++++++-------- 5 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/api/c/error.cpp b/src/api/c/error.cpp index 4a7d4b29b9..521ca9bef5 100644 --- a/src/api/c/error.cpp +++ b/src/api/c/error.cpp @@ -53,9 +53,7 @@ const char *af_err_to_string(const af_err err) case AF_ERR_NO_DBL: return "Double precision not supported for this device"; case AF_ERR_NO_GFX: return "Graphics functionality unavailable. " "ArrayFire compiled without Graphics support"; - case AF_ERR_LOAD_LIB: return "Failed to load dynamic library. 
" - "See http://www.arrayfire.com/docs/unifiedbackend.htm " - "for instructions to set up environment for Unified backend"; + case AF_ERR_LOAD_LIB: return "Failed to load dynamic library. "; case AF_ERR_LOAD_SYM: return "Failed to load symbol"; case AF_ERR_ARR_BKND_MISMATCH: return "There was a mismatch between an array and the current backend"; case AF_ERR_INTERNAL: return "Internal error"; diff --git a/src/api/unified/array.cpp b/src/api/unified/array.cpp index 59158ca195..7d4f9486f0 100644 --- a/src/api/unified/array.cpp +++ b/src/api/unified/array.cpp @@ -8,6 +8,7 @@ ********************************************************/ #include +#include #include "symbol_manager.hpp" af_err af_create_array(af_array *arr, const void * const data, const unsigned ndims, const dim_t * const dims, const af_dtype type) diff --git a/src/api/unified/error.cpp b/src/api/unified/error.cpp index 00b07396a1..0224876ec3 100644 --- a/src/api/unified/error.cpp +++ b/src/api/unified/error.cpp @@ -9,13 +9,38 @@ #include #include +#include +#include #include "symbol_manager.hpp" void af_get_last_error(char **str, dim_t *len) { - typedef void(*af_func)(char **, dim_t *); - af_func func = (af_func)LOAD_SYMBOL(); - return func(str, len); + // Set error message from unified backend + std::string &global_error_string = get_global_error_string(); + dim_t slen = std::min(MAX_ERR_SIZE, (int)global_error_string.size()); + + // If this is true, the error is coming from the unified backend. + if (slen != 0) { + + if (len && slen == 0) { + *len = 0; + *str = NULL; + return; + } + + af_alloc_host((void**)str, sizeof(char) * (slen + 1)); + global_error_string.copy(*str, slen); + + (*str)[slen] = '\0'; + global_error_string = std::string(""); + + if (len) *len = slen; + } else { + // If false, the error is coming from active backend. 
+ typedef void(*af_func)(char **, dim_t *); + af_func func = (af_func)LOAD_SYMBOL(); + func(str, len); + } } const char *af_err_to_string(const af_err err) diff --git a/src/api/unified/symbol_manager.cpp b/src/api/unified/symbol_manager.cpp index 94fef2d541..96cec0b6ac 100644 --- a/src/api/unified/symbol_manager.cpp +++ b/src/api/unified/symbol_manager.cpp @@ -203,8 +203,9 @@ af_err AFSymbolManager::setBackend(af::Backend bknd) activeHandle = defaultHandle; activeBackend = defaultBackend; return AF_SUCCESS; - } else - return AF_ERR_LOAD_LIB; + } else { + UNIFIED_ERROR_LOAD_LIB(); + } } int idx = bknd >> 1; // Convert 1, 2, 4 -> 0, 1, 2 if(bkndHandles[idx]) { @@ -212,7 +213,7 @@ af_err AFSymbolManager::setBackend(af::Backend bknd) activeBackend = bknd; return AF_SUCCESS; } else { - return AF_ERR_LOAD_LIB; + UNIFIED_ERROR_LOAD_LIB(); } } diff --git a/src/api/unified/symbol_manager.hpp b/src/api/unified/symbol_manager.hpp index 1530102022..658ac74b64 100644 --- a/src/api/unified/symbol_manager.hpp +++ b/src/api/unified/symbol_manager.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #if defined(OS_WIN) #include @@ -27,6 +28,13 @@ namespace unified const int NUM_BACKENDS = 3; const int NUM_ENV_VARS = 2; +#define UNIFIED_ERROR_LOAD_LIB() \ + AF_RETURN_ERROR("Failed to load dynamic library. " \ + "See http://www.arrayfire.com/docs/unifiedbackend.htm " \ + "for instructions to set up environment for Unified backend.", \ + AF_ERR_LOAD_LIB) + + class AFSymbolManager { public: static AFSymbolManager& getInstance(); @@ -43,8 +51,9 @@ class AFSymbolManager { template af_err call(const char* symbolName, CalleeArgs... 
args) { - if (!activeHandle) - return AF_ERR_LOAD_LIB; + if (!activeHandle) { + UNIFIED_ERROR_LOAD_LIB(); + } typedef af_err(*af_func)(CalleeArgs...); af_func funcHandle; #if defined(OS_WIN) @@ -53,7 +62,10 @@ class AFSymbolManager { funcHandle = (af_func)dlsym(activeHandle, symbolName); #endif if (!funcHandle) { - return AF_ERR_LOAD_SYM; + std::string str = "Failed to load symbol: "; + str += symbolName; + AF_RETURN_ERROR(str.c_str(), + AF_ERR_LOAD_SYM); } return funcHandle(args...); @@ -97,11 +109,12 @@ bool checkArrays(af_backend activeBackend, T a, Args... arg) // Macro to check af_array as inputs. The arguments to this macro should be // only input af_arrays. Not outputs or other types. -#define CHECK_ARRAYS(...) do { \ - af_backend backendId = unified::AFSymbolManager::getInstance().getActiveBackend(); \ - if(!unified::checkArrays(backendId, __VA_ARGS__)) \ - return AF_ERR_ARR_BKND_MISMATCH; \ -} while(0) +#define CHECK_ARRAYS(...) do { \ + af_backend backendId = unified::AFSymbolManager::getInstance().getActiveBackend(); \ + if(!unified::checkArrays(backendId, __VA_ARGS__)) \ + AF_RETURN_ERROR("Input array does not belong to current backend", \ + AF_ERR_ARR_BKND_MISMATCH); \ + } while(0) #if defined(OS_WIN) #define CALL(...) 
unified::AFSymbolManager::getInstance().call(__FUNCTION__, __VA_ARGS__) From 52158efc3a6ff3c14343f033e2fec7d9c0e24e55 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 16 Feb 2016 13:26:16 -0500 Subject: [PATCH 260/288] Properly handle af_release_array when using a different backend --- src/api/unified/array.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/api/unified/array.cpp b/src/api/unified/array.cpp index 7d4f9486f0..809c9d4e6b 100644 --- a/src/api/unified/array.cpp +++ b/src/api/unified/array.cpp @@ -41,8 +41,16 @@ af_err af_get_data_ptr(void *data, const af_array arr) af_err af_release_array(af_array arr) { - CHECK_ARRAYS(arr); - return CALL(arr); + af_backend curr = unified::AFSymbolManager::getInstance().getActiveBackend(); + af_backend other = curr; + + af_err err = af_get_backend_id(&other, arr); + if (err != AF_SUCCESS) return err; + + unified::AFSymbolManager::getInstance().setBackend(other); + err = CALL(arr); + unified::AFSymbolManager::getInstance().setBackend(curr); + return err; } af_err af_retain_array(af_array *out, const af_array in) From 4c045b1188cb5ee952a88a8618cb0766c10e9224 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Tue, 16 Feb 2016 14:41:16 -0500 Subject: [PATCH 261/288] Set minimum CMake version to 2.8.12 (previously 2.8) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8bdf93cd52..0def888f6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -CMAKE_MINIMUM_REQUIRED(VERSION 2.8) +CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12) PROJECT(ARRAYFIRE) SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON) From 3c06fa081f781d5d590fc6f9f00a76ea09b1d80a Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Tue, 16 Feb 2016 14:59:18 -0500 Subject: [PATCH 262/288] Force offload OSX LAPACK on unified memory devices --- src/backend/opencl/blas.cpp | 2 +- src/backend/opencl/platform.cpp | 17 ++++++++++++++--- 
src/backend/opencl/platform.hpp | 2 +- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/backend/opencl/blas.cpp b/src/backend/opencl/blas.cpp index 365e6e5680..77531154e5 100644 --- a/src/backend/opencl/blas.cpp +++ b/src/backend/opencl/blas.cpp @@ -121,7 +121,7 @@ Array matmul(const Array &lhs, const Array &rhs, af_mat_prop optLhs, af_mat_prop optRhs) { #if defined(WITH_OPENCL_LINEAR_ALGEBRA) - if(OpenCLCPUOffload()) { + if(OpenCLCPUOffload(false)) { // Do not force offload gemm on OSX Intel devices return cpu::matmul(lhs, rhs, optLhs, optRhs); } #endif diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index 6855e79f66..c2c13c7ae3 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -514,11 +514,22 @@ bool isHostUnifiedMemory(const cl::Device &device) return device.getInfo(); } -bool OpenCLCPUOffload() +bool OpenCLCPUOffload(bool forceOffloadOSX) { - static const bool sync = getEnvVar("AF_OPENCL_CPU_OFFLOAD") == "1"; + static const bool offloadEnv = getEnvVar("AF_OPENCL_CPU_OFFLOAD") == "1"; bool offload = false; - if(sync) offload = isHostUnifiedMemory(getDevice()); + if(offloadEnv) offload = isHostUnifiedMemory(getDevice()); +#if OS_MAC + // FORCED OFFLOAD FOR LAPACK FUNCTIONS ON OSX UNIFIED MEMORY DEVICES + // + // On OSX Unified Memory devices (Intel), always offload LAPACK but not GEMM + // irrespective of the AF_OPENCL_CPU_OFFLOAD value + // From GEMM, OpenCLCPUOffload(false) is called which will render the + // variable inconsequential to the returned result. 
+ // + // Issue https://github.com/arrayfire/arrayfire/issues/662 + offload = offload || forceOffloadOSX; +#endif return offload; } diff --git a/src/backend/opencl/platform.hpp b/src/backend/opencl/platform.hpp index 4c745e0c91..095fdf9ae7 100644 --- a/src/backend/opencl/platform.hpp +++ b/src/backend/opencl/platform.hpp @@ -114,7 +114,7 @@ cl_device_type getDeviceType(); bool isHostUnifiedMemory(const cl::Device &device); -bool OpenCLCPUOffload(); +bool OpenCLCPUOffload(bool forceOffloadOSX = true); bool isGLSharingSupported(); From 3c385b3909d7588eea30e16984f2c3c9069105bc Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Tue, 16 Feb 2016 16:05:29 -0500 Subject: [PATCH 263/288] Add BUILD_* Options for examples and tests when building standalone --- examples/CMakeLists.txt | 29 +++++++++++++-------------- test/CMakeLists.txt | 43 ++++++++++++++++++++++++----------------- 2 files changed, 40 insertions(+), 32 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 4710d1b739..be0f6407be 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -61,13 +61,17 @@ ENDMACRO() # and TARGET_LINK_LIBRARIES(... 
${ARRAYFIRE_LIBRARIES}) are needed MACRO(BUILD_ALL FILES BACKEND_NAME BACKEND_LIBRARIES OTHER_LIBRARIES) - FOREACH(FILE ${FILES}) - GET_FILENAME_COMPONENT(EXAMPLE ${FILE} NAME_WE) - GET_FILENAME_COMPONENT(FULL_DIR_NAME ${FILE} PATH) - GET_FILENAME_COMPONENT(DIR_NAME ${FULL_DIR_NAME} NAME) + STRING(TOUPPER ${BACKEND_NAME} BACKEND_NAME_UPPER) + MESSAGE(STATUS "EXAMPLES: ${BACKEND_NAME_UPPER} backend is ${BUILD_${BACKEND_NAME_UPPER}}.") + IF(${BUILD_${BACKEND_NAME_UPPER}}) + FOREACH(FILE ${FILES}) + GET_FILENAME_COMPONENT(EXAMPLE ${FILE} NAME_WE) + GET_FILENAME_COMPONENT(FULL_DIR_NAME ${FILE} PATH) + GET_FILENAME_COMPONENT(DIR_NAME ${FULL_DIR_NAME} NAME) - BUILD_EXAMPLE(${EXAMPLE} ${FILE} ${BACKEND_NAME} "${BACKEND_LIBRARIES}" "${OTHER_LIBRARIES}" ${DIR_NAME}) - ENDFOREACH() + BUILD_EXAMPLE(${EXAMPLE} ${FILE} ${BACKEND_NAME} "${BACKEND_LIBRARIES}" "${OTHER_LIBRARIES}" ${DIR_NAME}) + ENDFOREACH() + ENDIF() ENDMACRO() # Collect the source @@ -76,10 +80,9 @@ ADD_DEFINITIONS("-DASSETS_DIR=\"${ASSETS_DIR}\"") # Next we build each example using every backend. IF(${ArrayFire_CPU_FOUND}) # variable defined by FIND(ArrayFire ...) - MESSAGE(STATUS "EXAMPLES: CPU backend is ON.") + OPTION(BUILD_CPU "Build ArrayFire Examples for CPU backend" ON) BUILD_ALL("${FILES}" cpu ${ArrayFire_CPU_LIBRARIES} "") ELSEIF(TARGET afcpu) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "EXAMPLES: CPU backend is ON.") BUILD_ALL("${FILES}" cpu afcpu "") ELSE() MESSAGE(STATUS "EXAMPLES: CPU backend is OFF. afcpu was not found.") @@ -87,10 +90,9 @@ ENDIF() # Next we build each example using every backend. IF(${ArrayFire_Unified_FOUND}) # variable defined by FIND(ArrayFire ...) 
- MESSAGE(STATUS "EXAMPLES: UNIFIED backend is ON.") + OPTION(BUILD_UNIFIED "Build ArrayFire Examples for Unified backend" ON) BUILD_ALL("${FILES}" unified ${ArrayFire_Unified_LIBRARIES} "${CMAKE_DL_LIBS}") ELSEIF(TARGET af) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "EXAMPLES: UNIFIED backend is ON.") BUILD_ALL("${FILES}" unified af "${CMAKE_DL_LIBS}") ELSE() MESSAGE(STATUS "EXAMPLES: UNIFIED backend is OFF. af was not found.") @@ -104,10 +106,10 @@ IF (${CUDA_FOUND}) PATHS ${CUDA_TOOLKIT_ROOT_DIR} DOC "CUDA NVVM Library" ) - MESSAGE(STATUS "EXAMPLES: CUDA backend is ON.") + MARK_AS_ADVANCED(CUDA_NVVM_LIBRARY) + OPTION(BUILD_CUDA "Build ArrayFire Examples for CUDA backend" ON) BUILD_ALL("${FILES}" cuda ${ArrayFire_CUDA_LIBRARIES} "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") ELSEIF(TARGET afcuda) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "EXAMPLES: CUDA backend is ON.") BUILD_ALL("${FILES}" cuda afcuda "") ELSE() MESSAGE(STATUS "EXAMPLES: CUDA backend is OFF. afcuda was not found") @@ -118,10 +120,9 @@ ENDIF() IF (${OpenCL_FOUND}) IF(${ArrayFire_OpenCL_FOUND}) # variable defined by FIND(ArrayFire ...) - MESSAGE(STATUS "EXAMPLES: OpenCL backend is ON.") + OPTION(BUILD_OPENCL "Build ArrayFire Examples for OpenCL backend" ON) BUILD_ALL("${FILES}" opencl ${ArrayFire_OpenCL_LIBRARIES} "${OpenCL_LIBRARIES}") ELSEIF(TARGET afopencl) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "EXAMPLES: OpenCL backend is ON.") BUILD_ALL("${FILES}" opencl afopencl "${OpenCL_LIBRARIES}") ELSE() MESSAGE(STATUS "EXAMPLES: OpenCL backend is OFF. 
afopencl was not found") diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bea93d554e..5db23714d3 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -110,6 +110,14 @@ MACRO(CREATE_TESTS BACKEND AFLIBNAME GTEST_LIBS OTHER_LIBS) ENDMACRO(CREATE_TESTS) +MACRO(CHECK_AND_CREATE_TESTS BACKEND AFLIBNAME GTEST_LIBS OTHER_LIBS) + STRING(TOUPPER ${BACKEND} BACKEND_NAME_UPPER) + MESSAGE(STATUS "TESTS: ${BACKEND_NAME_UPPER} backend is ${BUILD_${BACKEND_NAME_UPPER}}.") + IF(${BUILD_${BACKEND_NAME_UPPER}}) + CREATE_TESTS(${BACKEND} ${AFLIBNAME} "${GTEST_LIBS}" "${OTHER_LIBS}") + ENDIF() +ENDMACRO(CHECK_AND_CREATE_TESTS) + FIND_PACKAGE(Threads REQUIRED) IF(CMAKE_USE_PTHREADS_INIT AND NOT "${APPLE}") SET(THREAD_LIB_FLAG "-pthread") @@ -170,11 +178,10 @@ LIST(SORT UNIFIED_FILES) # Tests execute in alphabetical order # Next we build each example using every backend. IF(${ArrayFire_CPU_FOUND}) # variable defined by FIND(ArrayFire ...) - MESSAGE(STATUS "TESTS: CPU backend is ON.") - CREATE_TESTS(cpu ${ArrayFire_CPU_LIBRARIES} "${GTEST_LIBRARIES}" "") + OPTION(BUILD_CPU "Build ArrayFire Tests for CPU backend" ON) + CHECK_AND_CREATE_TESTS(cpu ${ArrayFire_CPU_LIBRARIES} "${GTEST_LIBRARIES}" "") ELSEIF(TARGET afcpu) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "TESTS: CPU backend is ON.") - CREATE_TESTS(cpu afcpu "${GTEST_LIBRARIES}" "") + CHECK_AND_CREATE_TESTS(cpu afcpu "${GTEST_LIBRARIES}" "") ELSE() MESSAGE(STATUS "TESTS: CPU backend is OFF. 
afcpu was not found.") ENDIF() @@ -188,10 +195,11 @@ IF (${CUDA_FOUND}) PATHS ${CUDA_TOOLKIT_ROOT_DIR} DOC "CUDA NVVM Library" ) - MESSAGE(STATUS "TESTS: CUDA backend is ON.") + MARK_AS_ADVANCED(CUDA_NVVM_LIBRARY) # If OSX && CLANG && CUDA < 7 IF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7) - CREATE_TESTS(cuda ${ArrayFire_CUDA_LIBRARIES} "${GTEST_LIBRARIES_STDLIB}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") + OPTION(BUILD_CUDA "Build ArrayFire Tests for CUDA backend" ON) + CHECK_AND_CREATE_TESTS(cuda ${ArrayFire_CUDA_LIBRARIES} "${GTEST_LIBRARIES_STDLIB}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") FOREACH(FILE ${FILES}) GET_FILENAME_COMPONENT(FNAME ${FILE} NAME_WE) @@ -202,15 +210,15 @@ IF (${CUDA_FOUND}) # ELSE OSX && CLANG && CUDA < 7 ELSE("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7) - CREATE_TESTS(cuda ${ArrayFire_CUDA_LIBRARIES} "${GTEST_LIBRARIES}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") + OPTION(BUILD_CUDA "Build ArrayFire Tests for CUDA backend" ON) + CHECK_AND_CREATE_TESTS(cuda ${ArrayFire_CUDA_LIBRARIES} "${GTEST_LIBRARIES}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") ENDIF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7) ELSEIF(TARGET afcuda) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "TESTS: CUDA backend is ON.") # If OSX && CLANG && CUDA < 7 IF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7) - CREATE_TESTS(cuda afcuda "${GTEST_LIBRARIES_STDLIB}" 
"${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") + CHECK_AND_CREATE_TESTS(cuda afcuda "${GTEST_LIBRARIES_STDLIB}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") FOREACH(FILE ${FILES}) GET_FILENAME_COMPONENT(FNAME ${FILE} NAME_WE) @@ -221,7 +229,7 @@ IF (${CUDA_FOUND}) # ELSE OSX && CLANG && CUDA < 7 ELSE("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7) - CREATE_TESTS(cuda afcuda "${GTEST_LIBRARIES}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") + CHECK_AND_CREATE_TESTS(cuda afcuda "${GTEST_LIBRARIES}" "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES};${CUDA_cusolver_LIBRARY};${CUDA_CUFFT_LIBRARIES};${CUDA_NVVM_LIBRARY};${CUDA_CUDA_LIBRARY}") ENDIF("${APPLE}" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CUDA_VERSION_MAJOR} VERSION_LESS 7) ELSE() @@ -235,11 +243,11 @@ ENDIF() IF (${OpenCL_FOUND}) INCLUDE_DIRECTORIES(${OpenCL_INCLUDE_DIRS}) IF(${ArrayFire_OpenCL_FOUND}) # variable defined by FIND(ArrayFire ...) - MESSAGE(STATUS "TESTS: OpenCL backend is ON.") - CREATE_TESTS(opencl ${ArrayFire_OpenCL_LIBRARIES} "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}") + OPTION(BUILD_OPENCL "Build ArrayFire Tests for OpenCL backend" ON) + MESSAGE(${OpenCL_LIBRARIES}) + CHECK_AND_CREATE_TESTS(opencl ${ArrayFire_OpenCL_LIBRARIES} "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}") ELSEIF(TARGET afopencl) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "TESTS: OpenCL backend is ON.") - CREATE_TESTS(opencl afopencl "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}") + CHECK_AND_CREATE_TESTS(opencl afopencl "${GTEST_LIBRARIES}" "${OpenCL_LIBRARIES}") ELSE() MESSAGE(STATUS "TESTS: OpenCL backend is OFF. 
afopencl was not found") ENDIF() @@ -249,11 +257,10 @@ ENDIF() # Unified Backend IF(${ArrayFire_Unified_FOUND}) # variable defined by FIND(ArrayFire ...) - MESSAGE(STATUS "TESTS: UNIFIED backend is ON.") - CREATE_TESTS(unified ${ArrayFire_Unified_LIBRARIES} "${GTEST_LIBRARIES}" "${CMAKE_DL_LIBS}") + OPTION(BUILD_UNIFIED "Build ArrayFire Tests for Unified backend" ON) + CHECK_AND_CREATE_TESTS(unified ${ArrayFire_Unified_LIBRARIES} "${GTEST_LIBRARIES}" "${CMAKE_DL_LIBS}") ELSEIF(TARGET af) # variable defined by the ArrayFire build tree - MESSAGE(STATUS "TESTS: UNIFIED backend is ON.") - CREATE_TESTS(unified af "${GTEST_LIBRARIES}" "${CMAKE_DL_LIBS}") + CHECK_AND_CREATE_TESTS(unified af "${GTEST_LIBRARIES}" "${CMAKE_DL_LIBS}") ELSE() MESSAGE(STATUS "TESTS: UNIFIED backend is OFF. af was not found.") ENDIF() From 9b10c0e7c8ff267582e65700e9efcdfdb986414a Mon Sep 17 00:00:00 2001 From: Youssef Nashed Date: Tue, 16 Feb 2016 14:40:45 -0600 Subject: [PATCH 264/288] Added support for loading 32 bit integer images --- src/api/c/imageio.cpp | 40 ++++++++++++++++++++++++++++++++++------ src/api/c/imageio2.cpp | 26 ++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/src/api/c/imageio.cpp b/src/api/c/imageio.cpp index 748ddbc58e..d990b10904 100644 --- a/src/api/c/imageio.cpp +++ b/src/api/c/imageio.cpp @@ -186,7 +186,10 @@ af_err af_load_image(af_array *out, const char* filename, const bool isColor) if(fi_bpc != 8 && fi_bpc != 16 && fi_bpc != 32) { AF_ERROR("FreeImage Error: Bits per channel not supported", AF_ERR_NOT_SUPPORTED); } - + + // data type + FREE_IMAGE_TYPE image_type = FreeImage_GetImageType(pBitmap); + // sizes uint fi_w = FreeImage_GetWidth(pBitmap); uint fi_h = FreeImage_GetHeight(pBitmap); @@ -204,21 +207,36 @@ af_err af_load_image(af_array *out, const char* filename, const bool isColor) else if(fi_bpc == 16) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage)(&rImage, 
pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } else if (fi_color == 1) { if(fi_bpc == 8) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 16) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } else { //3 channel image if(fi_bpc == 8) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 16) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } } else { //output gray irrespective if(fi_color == 1) { //4 channel image @@ -227,14 +245,24 @@ af_err af_load_image(af_array *out, const char* filename, const bool isColor) else if(fi_bpc == 16) AF_CHECK((readImage)(&rImage, 
pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } else if (fi_color == 3 || fi_color == 4) { if(fi_bpc == 8) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 16) AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } } diff --git a/src/api/c/imageio2.cpp b/src/api/c/imageio2.cpp index ff7a4a8d34..44886aac50 100644 --- a/src/api/c/imageio2.cpp +++ b/src/api/c/imageio2.cpp @@ -162,7 +162,10 @@ af_err af_load_image_native(af_array *out, const char* filename) if(fi_bpc != 8 && fi_bpc != 16 && fi_bpc != 32) { AF_ERROR("FreeImage Error: Bits per channel not supported", AF_ERR_NOT_SUPPORTED); } - + + // data type + FREE_IMAGE_TYPE image_type = FreeImage_GetImageType(pBitmap); + // sizes uint fi_w = FreeImage_GetWidth(pBitmap); uint fi_h = FreeImage_GetHeight(pBitmap); @@ -179,21 +182,36 @@ af_err af_load_image_native(af_array *out, const char* filename) else if(fi_bpc == 16) AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage_t)(&rImage, pSrcLine, 
nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } else if (fi_color == 1) { if(fi_bpc == 8) AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 16) AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } else { //3 channel imag if(fi_bpc == 8) AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 16) AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); else if(fi_bpc == 32) - AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); + switch(image_type) { + case FIT_UINT32: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_INT32: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + case FIT_FLOAT: AF_CHECK((readImage_t)(&rImage, pSrcLine, nSrcPitch, fi_w, fi_h)); break; + default: AF_ERROR("FreeImage Error: Unknown image type", AF_ERR_NOT_SUPPORTED); break; + } } std::swap(*out,rImage); From 0a0f6e3680a21948c677796c89e0dc94d83ccef8 Mon Sep 17 00:00:00 2001 From: Johan Pauwels Date: Sat, 20 Feb 2016 22:29:43 +0100 Subject: [PATCH 265/288] Search for GLEWmx in default paths 
too Is there a reason the search needs to be limited to those specific locations? CMake now can't find the GLEW I installed under $HOME (even though I added it to CMAKE_SYSTEM_PREFIX_PATH). --- CMakeModules/FindGLEWmx.cmake | 2 -- 1 file changed, 2 deletions(-) diff --git a/CMakeModules/FindGLEWmx.cmake b/CMakeModules/FindGLEWmx.cmake index b90919eb98..a6da72bbf2 100644 --- a/CMakeModules/FindGLEWmx.cmake +++ b/CMakeModules/FindGLEWmx.cmake @@ -55,7 +55,6 @@ ELSE (WIN32) /sw/lib /opt/local/lib ${GLEW_ROOT_DIR}/lib - NO_DEFAULT_PATH DOC "The GLEWmx library") SET(PX ${CMAKE_STATIC_LIBRARY_PREFIX}) @@ -72,7 +71,6 @@ ELSE (WIN32) /sw/lib /opt/local/lib ${GLEW_ROOT_DIR}/lib - NO_DEFAULT_PATH DOC "The GLEWmx library") UNSET(PX) UNSET(SX) From e4facbb73c5c06095514d177e75adff467c7111e Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Sun, 21 Feb 2016 22:21:27 -0500 Subject: [PATCH 266/288] Adding function to check if memory usage is approaching the limits --- src/backend/MemoryManager.cpp | 9 +++++++-- src/backend/MemoryManager.hpp | 2 ++ src/backend/cpu/memory.cpp | 5 +++++ src/backend/cpu/memory.hpp | 1 + src/backend/cuda/memory.cpp | 5 +++++ src/backend/cuda/memory.hpp | 2 ++ src/backend/opencl/memory.cpp | 5 +++++ src/backend/opencl/memory.hpp | 1 + 8 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp index b66dfc33e7..0879e98cea 100644 --- a/src/backend/MemoryManager.cpp +++ b/src/backend/MemoryManager.cpp @@ -153,8 +153,7 @@ void *MemoryManager::alloc(const size_t bytes, bool user_lock) // FIXME: Add better checks for garbage collection // Perhaps look at total memory available as a metric - if (current.lock_bytes >= current.max_bytes || - current.total_buffers >= this->max_buffers) { + if (this->checkMemoryLimit()) { this->garbageCollect(); } @@ -305,4 +304,10 @@ unsigned MemoryManager::getMaxBuffers() return this->max_buffers; } +bool MemoryManager::checkMemoryLimit() +{ + memory_info& 
current = this->getCurrentMemoryInfo(); + return current.lock_bytes >= current.max_bytes || current.total_buffers >= this->max_buffers; +} + } diff --git a/src/backend/MemoryManager.hpp b/src/backend/MemoryManager.hpp index 015fa6db3d..0db70b572d 100644 --- a/src/backend/MemoryManager.hpp +++ b/src/backend/MemoryManager.hpp @@ -111,6 +111,8 @@ class MemoryManager { } + bool checkMemoryLimit(); + protected: mutex_t memory_mutex; diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 016428a6d9..8837e27da1 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -160,6 +160,11 @@ void pinnedFree(T* ptr) return getMemoryManager().unlock((void *)ptr, false); } +bool checkMemoryLimit() +{ + return getMemoryManager().checkMemoryLimit(); +} + #define INSTANTIATE(T) \ template T* memAlloc(const size_t &elements); \ template void memFree(T* ptr); \ diff --git a/src/backend/cpu/memory.hpp b/src/backend/cpu/memory.hpp index 80ee86ddc8..91116fbcfc 100644 --- a/src/backend/cpu/memory.hpp +++ b/src/backend/cpu/memory.hpp @@ -39,4 +39,5 @@ namespace cpu void setMemStepSize(size_t step_bytes); size_t getMemStepSize(void); + bool checkMemoryLimit(); } diff --git a/src/backend/cuda/memory.cpp b/src/backend/cuda/memory.cpp index ff62661601..51eb507320 100644 --- a/src/backend/cuda/memory.cpp +++ b/src/backend/cuda/memory.cpp @@ -229,6 +229,11 @@ void pinnedFree(T* ptr) return getMemoryManagerPinned().unlock((void *)ptr, false); } +bool checkMemoryLimit() +{ + return getMemoryManager().checkMemoryLimit(); +} + #define INSTANTIATE(T) \ template T* memAlloc(const size_t &elements); \ template void memFree(T* ptr); \ diff --git a/src/backend/cuda/memory.hpp b/src/backend/cuda/memory.hpp index 9bf69df9d4..80478c13dc 100644 --- a/src/backend/cuda/memory.hpp +++ b/src/backend/cuda/memory.hpp @@ -39,4 +39,6 @@ namespace cuda void setMemStepSize(size_t step_bytes); size_t getMemStepSize(void); + + bool checkMemoryLimit(); } diff --git 
a/src/backend/opencl/memory.cpp b/src/backend/opencl/memory.cpp index 756d18749e..5df64d6d86 100644 --- a/src/backend/opencl/memory.cpp +++ b/src/backend/opencl/memory.cpp @@ -259,6 +259,11 @@ void pinnedFree(T* ptr) return getMemoryManagerPinned().unlock((void *)ptr, false); } +bool checkMemoryLimit() +{ + return getMemoryManager().checkMemoryLimit(); +} + #define INSTANTIATE(T) \ template T* memAlloc(const size_t &elements); \ template void memFree(T* ptr); \ diff --git a/src/backend/opencl/memory.hpp b/src/backend/opencl/memory.hpp index f4d06a3324..a02d387591 100644 --- a/src/backend/opencl/memory.hpp +++ b/src/backend/opencl/memory.hpp @@ -44,4 +44,5 @@ namespace opencl void setMemStepSize(size_t step_bytes); size_t getMemStepSize(void); + bool checkMemoryLimit(); } From e1abe128ae19b769c1b48855df3306d87afcc847 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 22 Feb 2016 12:27:44 -0500 Subject: [PATCH 267/288] Changes to make sure cpu backend does not enqueue too many functions. This fix synchronizes when the queue hits 25 functions or when the memory used is approaching the device limit. 
--- src/backend/MemoryManager.cpp | 4 +-- src/backend/cpu/memory.cpp | 21 ++++++++++++-- src/backend/cpu/queue.hpp | 52 ++++++++++++++++++++++------------- 3 files changed, 54 insertions(+), 23 deletions(-) diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp index 0879e98cea..379c2e2af2 100644 --- a/src/backend/MemoryManager.cpp +++ b/src/backend/MemoryManager.cpp @@ -257,7 +257,7 @@ void MemoryManager::printInfo(const char *msg, const int device) unit = "MB"; } - std::cout << " | " << std::right << std::setw(14) << kv.first << " " + std::cout << "| " << std::right << std::setw(14) << kv.first << " " << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit << " | " << std::setw(9) << status_mngr << " | " << std::setw(9) << status_user @@ -277,7 +277,7 @@ void MemoryManager::printInfo(const char *msg, const int device) } for (auto &ptr : kv.second) { - std::cout << " | " << std::right << std::setw(14) << ptr << " " + std::cout << "| " << std::right << std::setw(14) << ptr << " " << " | " << std::setw(7) << std::setprecision(4) << size << " " << unit << " | " << std::setw(9) << status_mngr << " | " << std::setw(9) << status_user diff --git a/src/backend/cpu/memory.cpp b/src/backend/cpu/memory.cpp index 8837e27da1..b4b1b450d9 100644 --- a/src/backend/cpu/memory.cpp +++ b/src/backend/cpu/memory.cpp @@ -112,13 +112,30 @@ void printMemInfo(const char *msg, const int device) template T* memAlloc(const size_t &elements) { - return (T *)getMemoryManager().alloc(elements * sizeof(T), false); + T *ptr = nullptr; + + try { + ptr = (T *)getMemoryManager().alloc(elements * sizeof(T), false); + } catch(...) { + getQueue().sync(); + ptr = (T *)getMemoryManager().alloc(elements * sizeof(T), false); + } + return ptr; } void* memAllocUser(const size_t &bytes) { - return getMemoryManager().alloc(bytes, true); + void *ptr = nullptr; + + try { + ptr = getMemoryManager().alloc(bytes, true); + } catch(...) 
{ + getQueue().sync(); + ptr = getMemoryManager().alloc(bytes, true); + } + return ptr; } + template void memFree(T *ptr) { diff --git a/src/backend/cpu/queue.hpp b/src/backend/cpu/queue.hpp index 6d32b85a65..2f32b4d852 100644 --- a/src/backend/cpu/queue.hpp +++ b/src/backend/cpu/queue.hpp @@ -8,6 +8,7 @@ ********************************************************/ #include +#include //FIXME: Is there a better way to check for std::future not being supported ? #if defined(AF_DISABLE_CPU_ASYNC) || (defined(__GNUC__) && (__GCC_ATOMIC_INT_LOCK_FREE < 2 || __GCC_ATOMIC_POINTER_LOCK_FREE < 2)) @@ -48,32 +49,45 @@ typedef async_queue queue_impl; namespace cpu { /// Wraps the async_queue class -class queue { +class queue +{ public: - queue() - : sync_calls( __SYNCHRONOUS_ARCH == 1 || getEnvVar("AF_SYNCHRONOUS_CALLS") == "1") {} - - template - void enqueue(const F func, Args... args) { + queue() + : + count(0), + sync_calls( __SYNCHRONOUS_ARCH == 1 || getEnvVar("AF_SYNCHRONOUS_CALLS") == "1") + {} - if(sync_calls) { func( args... ); } - else { aQueue.enqueue( func, args... ); } + template + void enqueue(const F func, Args... args) + { + count++; + if(sync_calls) { func( args... ); } + else { aQueue.enqueue( func, args... ); } #ifndef NDEBUG - sync(); + sync(); +#else + if (checkMemoryLimit() || count >= 25) { + sync(); + } #endif + } - } - void sync() { - if(!sync_calls) aQueue.sync(); - } + void sync() + { + count = 0; + if(!sync_calls) aQueue.sync(); + } - bool is_worker() const { - return (!sync_calls) ? aQueue.is_worker() : false; - } + bool is_worker() const + { + return (!sync_calls) ? 
aQueue.is_worker() : false; + } -private: - const bool sync_calls; - queue_impl aQueue; + private: + int count; + const bool sync_calls; + queue_impl aQueue; }; } From e59df758ad1116ff56d369edb9fa889be5528a55 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 22 Feb 2016 12:28:09 -0500 Subject: [PATCH 268/288] Making copyArray from cpu backend asynchronous --- src/backend/cpu/copy.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/cpu/copy.cpp b/src/backend/cpu/copy.cpp index 0da304b3ca..27e80f8afb 100644 --- a/src/backend/cpu/copy.cpp +++ b/src/backend/cpu/copy.cpp @@ -42,8 +42,9 @@ void copyData(T *to, const Array &from) template Array copyArray(const Array &A) { + A.eval(); Array out = createEmptyArray(A.dims()); - copyData(out.get(), A); + getQueue().enqueue(kernel::copy, out, A, scalar(0), 1.0); return out; } From cfe76f376f4f594a3ba340e09bd5177c44e4e72c Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 22 Feb 2016 12:28:48 -0500 Subject: [PATCH 269/288] Adding missing evals in cpu backend --- src/backend/cpu/histogram.cpp | 3 ++- src/backend/cpu/kernel/sift_nonfree.hpp | 1 + src/backend/cpu/lu.cpp | 3 +++ src/backend/cpu/nearest_neighbour.cpp | 2 ++ src/backend/cpu/qr.cpp | 3 +++ src/backend/cpu/scan.cpp | 3 +-- src/backend/cpu/solve.cpp | 3 +++ src/backend/cpu/triangle.cpp | 1 + 8 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/backend/cpu/histogram.cpp b/src/backend/cpu/histogram.cpp index 6aa60e59e4..3c30402b47 100644 --- a/src/backend/cpu/histogram.cpp +++ b/src/backend/cpu/histogram.cpp @@ -23,7 +23,8 @@ namespace cpu template Array histogram(const Array &in, - const unsigned &nbins, const double &minval, const double &maxval) + const unsigned &nbins, + const double &minval, const double &maxval) { in.eval(); diff --git a/src/backend/cpu/kernel/sift_nonfree.hpp b/src/backend/cpu/kernel/sift_nonfree.hpp index c1c92a97e6..e7ca19175c 100644 --- a/src/backend/cpu/kernel/sift_nonfree.hpp 
+++ b/src/backend/cpu/kernel/sift_nonfree.hpp @@ -969,6 +969,7 @@ unsigned sift_impl(Array& x, Array& y, Array& score, const bool compute_GLOH) { in.eval(); + getQueue().sync(); af::dim4 idims = in.dims(); const unsigned min_dim = (double_input) ? min(idims[0]*2, idims[1]*2) diff --git a/src/backend/cpu/lu.cpp b/src/backend/cpu/lu.cpp index 265fdfaec5..24ca4acd78 100644 --- a/src/backend/cpu/lu.cpp +++ b/src/backend/cpu/lu.cpp @@ -44,6 +44,9 @@ LU_FUNC(getrf , cdouble, z) template void lu(Array &lower, Array &upper, Array &pivot, const Array &in) { + lower.eval(); + upper.eval(); + pivot.eval(); in.eval(); dim4 iDims = in.dims(); diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp index f1daba7526..17e892f492 100644 --- a/src/backend/cpu/nearest_neighbour.cpp +++ b/src/backend/cpu/nearest_neighbour.cpp @@ -31,6 +31,8 @@ void nearest_neighbour(Array& idx, Array& dist, CPU_NOT_SUPPORTED(); } + idx.eval(); + dist.eval(); query.eval(); train.eval(); diff --git a/src/backend/cpu/qr.cpp b/src/backend/cpu/qr.cpp index 34a39f64b8..f8dbfa2013 100644 --- a/src/backend/cpu/qr.cpp +++ b/src/backend/cpu/qr.cpp @@ -59,6 +59,9 @@ GQR_FUNC(gqr , cdouble, zungqr) template void qr(Array &q, Array &r, Array &t, const Array &in) { + q.eval(); + r.eval(); + t.eval(); in.eval(); dim4 iDims = in.dims(); diff --git a/src/backend/cpu/scan.cpp b/src/backend/cpu/scan.cpp index 08431f8baa..78de4142c8 100644 --- a/src/backend/cpu/scan.cpp +++ b/src/backend/cpu/scan.cpp @@ -27,8 +27,7 @@ template Array scan(const Array& in, const int dim) { dim4 dims = in.dims(); - Array out = createValueArray(dims, 0); - out.eval(); + Array out = createEmptyArray(dims); in.eval(); switch (in.ndims()) { diff --git a/src/backend/cpu/solve.cpp b/src/backend/cpu/solve.cpp index 48ea4de3c5..367afa3884 100644 --- a/src/backend/cpu/solve.cpp +++ b/src/backend/cpu/solve.cpp @@ -96,6 +96,9 @@ Array solveLU(const Array &A, const Array &pivot, template Array triangleSolve(const 
Array &A, const Array &b, const af_mat_prop options) { + A.eval(); + b.eval(); + Array B = copyArray(b); int N = B.dims()[0]; int NRHS = B.dims()[1]; diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp index 57f61b1331..eaad1b9f86 100644 --- a/src/backend/cpu/triangle.cpp +++ b/src/backend/cpu/triangle.cpp @@ -21,6 +21,7 @@ namespace cpu template void triangle(Array &out, const Array &in) { + in.eval(); getQueue().enqueue(kernel::triangle, out, in); } From c66da4028452e5adcba2d7e6c3adff37e95b8b33 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 22 Feb 2016 12:45:10 -0500 Subject: [PATCH 270/288] BUGFIX: Fixing array.write for all backends --- src/backend/cpu/Array.cpp | 5 +++-- src/backend/cuda/Array.cpp | 4 ++-- src/backend/opencl/Array.cpp | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/backend/cpu/Array.cpp b/src/backend/cpu/Array.cpp index 3edca877cd..2c296d02d3 100644 --- a/src/backend/cpu/Array.cpp +++ b/src/backend/cpu/Array.cpp @@ -228,8 +228,9 @@ void writeHostDataArray(Array &arr, const T * const data, const size_t bytes) { if(!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); + arr = copyArray(arr); } + arr.eval(); memcpy(arr.get(), data, bytes); } @@ -238,7 +239,7 @@ void writeDeviceDataArray(Array &arr, const void * const data, const size_t bytes) { if(!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); + arr = copyArray(arr); } memcpy(arr.get(), (const T * const)data, bytes); } diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp index c1cf8102eb..786574129b 100644 --- a/src/backend/cuda/Array.cpp +++ b/src/backend/cuda/Array.cpp @@ -256,7 +256,7 @@ namespace cuda writeHostDataArray(Array &arr, const T * const data, const size_t bytes) { if (!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); + arr = copyArray(arr); } T *ptr = arr.get(); @@ -273,7 +273,7 @@ namespace cuda writeDeviceDataArray(Array &arr, const void * const data, const size_t bytes) { if 
(!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); + arr = copyArray(arr); } T *ptr = arr.get(); diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp index bd576ca88a..002c1d5b82 100644 --- a/src/backend/opencl/Array.cpp +++ b/src/backend/opencl/Array.cpp @@ -286,7 +286,7 @@ namespace opencl writeHostDataArray(Array &arr, const T * const data, const size_t bytes) { if (!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); + arr = copyArray(arr); } getQueue().enqueueWriteBuffer(*arr.get(), CL_TRUE, @@ -302,7 +302,7 @@ namespace opencl writeDeviceDataArray(Array &arr, const void * const data, const size_t bytes) { if (!arr.isOwner()) { - arr = createEmptyArray(arr.dims()); + arr = copyArray(arr); } cl::Buffer& buf = *arr.get(); From 49a18e04f17de4e3f2d8a08fa2a6b9ff7c120a18 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 22 Feb 2016 14:12:45 -0500 Subject: [PATCH 271/288] Change clBLAS commit tag to af3.3.0 --- CMakeModules/build_clBLAS.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeModules/build_clBLAS.cmake b/CMakeModules/build_clBLAS.cmake index d486b31801..2289c26393 100644 --- a/CMakeModules/build_clBLAS.cmake +++ b/CMakeModules/build_clBLAS.cmake @@ -14,7 +14,7 @@ ENDIF() ExternalProject_Add( clBLAS-ext GIT_REPOSITORY https://github.com/arrayfire/clBLAS.git - GIT_TAG arrayfire-release-test + GIT_TAG af3.3.0 PREFIX "${prefix}" INSTALL_DIR "${prefix}" UPDATE_COMMAND "" From 31b864314ca4a772e7620d53363769a7fbfa06aa Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 22 Feb 2016 14:12:53 -0500 Subject: [PATCH 272/288] Change clFFT commit tag to af3.3.0 --- CMakeModules/build_clFFT.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeModules/build_clFFT.cmake b/CMakeModules/build_clFFT.cmake index 961347f913..2ab9ccc1ea 100644 --- a/CMakeModules/build_clFFT.cmake +++ b/CMakeModules/build_clFFT.cmake @@ -14,7 +14,7 @@ ENDIF() ExternalProject_Add( clFFT-ext 
GIT_REPOSITORY https://github.com/arrayfire/clFFT.git - GIT_TAG arrayfire-release-test + GIT_TAG af3.3.0 PREFIX "${prefix}" INSTALL_DIR "${prefix}" UPDATE_COMMAND "" From 513e7115b2699f15aff17fafbbd5f9fde38e383c Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 22 Feb 2016 16:34:38 -0500 Subject: [PATCH 273/288] Fixes for using MKL on OSX --- src/backend/cblas.cpp | 8 +++--- src/backend/cpu/CMakeLists.txt | 13 +++++---- src/backend/cpu/blas.hpp | 16 +++++------ src/backend/cpu/lapack_helper.hpp | 20 ++++++------- src/backend/cuda/CMakeLists.txt | 9 ++++-- src/backend/cuda/cpu_lapack/lapack_helper.hpp | 20 ++++++------- src/backend/opencl/CMakeLists.txt | 13 +++++---- src/backend/opencl/cpu/cpu_helper.hpp | 28 +++++++++---------- src/backend/opencl/magma/magma_cpu_blas.h | 16 +++++------ src/backend/opencl/magma/magma_cpu_lapack.h | 18 +++++++----- 10 files changed, 86 insertions(+), 75 deletions(-) diff --git a/src/backend/cblas.cpp b/src/backend/cblas.cpp index 4d99d457c2..1be15e47c9 100644 --- a/src/backend/cblas.cpp +++ b/src/backend/cblas.cpp @@ -12,11 +12,11 @@ #ifdef AF_CPU #include #else - #ifdef __APPLE__ - #include + #ifdef USE_MKL + #include #else - #ifdef USE_MKL - #include + #ifdef __APPLE__ + #include #else extern "C" { #include diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt index 8ada1d6935..2032f0b7e9 100644 --- a/src/backend/cpu/CMakeLists.txt +++ b/src/backend/cpu/CMakeLists.txt @@ -14,9 +14,14 @@ IF(USE_CPU_F77_BLAS) ADD_DEFINITIONS(-DUSE_F77_BLAS) ENDIF() -IF(USE_CPU_MKL) - MESSAGE("Using MKL") +IF(USE_CPU_MKL) # Manual MKL Setup + MESSAGE("CPU Backend Using MKL") ADD_DEFINITIONS(-DUSE_MKL) +ELSE(USE_CPU_MKL) + IF(${MKL_FOUND}) # Automatic MKL Setup from BLAS + MESSAGE("CPU Backend Using MKL RT") + ADD_DEFINITIONS(-DUSE_MKL) + ENDIF() ENDIF() IF (NOT CBLAS_LIBRARIES) @@ -29,10 +34,6 @@ IF(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU" AND "${APPLE}") ADD_DEFINITIONS(-flax-vector-conversions) ENDIF() 
-IF(${MKL_FOUND}) - ADD_DEFINITIONS(-DUSE_MKL) -ENDIF() - FIND_PACKAGE(FFTW REQUIRED) MESSAGE(STATUS "FFTW Found ? ${FFTW_FOUND}") MESSAGE(STATUS "FFTW Library: ${FFTW_LIBRARIES}") diff --git a/src/backend/cpu/blas.hpp b/src/backend/cpu/blas.hpp index 05484338cd..3f5b7451ad 100644 --- a/src/backend/cpu/blas.hpp +++ b/src/backend/cpu/blas.hpp @@ -12,16 +12,16 @@ #include #include -#ifdef __APPLE__ -#include -#else #ifdef USE_MKL -#include + #include #else -extern "C" { -#include -} -#endif + #ifdef __APPLE__ + #include + #else + extern "C" { + #include + } + #endif #endif // TODO: Ask upstream for a more official way to detect it diff --git a/src/backend/cpu/lapack_helper.hpp b/src/backend/cpu/lapack_helper.hpp index f978ecb92b..c5ed4fa83f 100644 --- a/src/backend/cpu/lapack_helper.hpp +++ b/src/backend/cpu/lapack_helper.hpp @@ -17,17 +17,17 @@ #define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR #define LAPACK_NAME(fn) LAPACKE_##fn -#ifdef __APPLE__ -#include -#include -#undef AF_LAPACK_COL_MAJOR -#define AF_LAPACK_COL_MAJOR 0 -#else #ifdef USE_MKL -#include -#else // NETLIB LAPACKE -#include -#endif + #include +#else + #ifdef __APPLE__ + #include + #include + #undef AF_LAPACK_COL_MAJOR + #define AF_LAPACK_COL_MAJOR 0 + #else // NETLIB LAPACKE + #include + #endif #endif #endif diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index 81d6ba243c..8cecd812f2 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -112,9 +112,14 @@ ELSE(CUDA_cusolver_LIBRARY) ELSE(NOT LAPACK_FOUND) MESSAGE(STATUS "CUDA Version ${CUDA_VERSION_STRING} does not contain cusolver library. But CPU LAPACK libraries are available. 
Will fallback to using host side code.") ADD_DEFINITIONS(-DWITH_CPU_LINEAR_ALGEBRA) - IF(USE_CUDA_MKL) - MESSAGE("Using MKL") + IF(USE_CUDA_MKL) # Manual MKL Setup + MESSAGE("CUDA LAPACK CPU Fallback Using MKL") ADD_DEFINITIONS(-DUSE_MKL) + ELSE(USE_CUDA_MKL) + IF(${MKL_FOUND}) # Automatic MKL Setup from BLAS + MESSAGE("CUDA LAPACK CPU Fallback Using MKL RT") + ADD_DEFINITIONS(-DUSE_MKL) + ENDIF() ENDIF() ENDIF() ELSE() diff --git a/src/backend/cuda/cpu_lapack/lapack_helper.hpp b/src/backend/cuda/cpu_lapack/lapack_helper.hpp index 58265871c2..b85a80b10c 100644 --- a/src/backend/cuda/cpu_lapack/lapack_helper.hpp +++ b/src/backend/cuda/cpu_lapack/lapack_helper.hpp @@ -19,17 +19,17 @@ #define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR #define LAPACK_NAME(fn) LAPACKE_##fn -#ifdef __APPLE__ -#include -#include -#undef AF_LAPACK_COL_MAJOR -#define AF_LAPACK_COL_MAJOR 0 -#else #ifdef USE_MKL -#include -#else // NETLIB LAPACKE -#include -#endif + #include +#else + #ifdef __APPLE__ + #include + #include + #undef AF_LAPACK_COL_MAJOR + #define AF_LAPACK_COL_MAJOR 0 + #else // NETLIB LAPACKE + #include + #endif #endif #endif diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt index e598a973df..ce45c4bdaa 100644 --- a/src/backend/opencl/CMakeLists.txt +++ b/src/backend/opencl/CMakeLists.txt @@ -10,9 +10,14 @@ IF(USE_OPENCL_F77_BLAS) ADD_DEFINITIONS(-DUSE_F77_BLAS) ENDIF() -IF(USE_OPENCL_MKL) - MESSAGE("Using MKL") +IF(USE_OPENCL_MKL) # Manual MKL Setup + MESSAGE("OpenCL Backend Using MKL") ADD_DEFINITIONS(-DUSE_MKL) +ELSE(USE_OPENCL_MKL) + IF(${MKL_FOUND}) # Automatic MKL Setup from BLAS + MESSAGE("OpenCL Backend Using MKL RT") + ADD_DEFINITIONS(-DUSE_MKL) + ENDIF() ENDIF() IF(APPLE) @@ -42,10 +47,6 @@ ELSE(NOT LAPACK_FOUND) ENDIF() ENDIF() -IF(${MKL_FOUND}) - ADD_DEFINITIONS(-DUSE_MKL) -ENDIF() - IF(NOT UNIX) ADD_DEFINITIONS(-DAFDLL) ENDIF() diff --git a/src/backend/opencl/cpu/cpu_helper.hpp b/src/backend/opencl/cpu/cpu_helper.hpp index 
cbdc470e19..f7f690322c 100644 --- a/src/backend/opencl/cpu/cpu_helper.hpp +++ b/src/backend/opencl/cpu/cpu_helper.hpp @@ -29,32 +29,32 @@ #define AF_LAPACK_COL_MAJOR LAPACK_COL_MAJOR #define LAPACK_NAME(fn) LAPACKE_##fn -#ifdef __APPLE__ - #include - #include - #undef AF_LAPACK_COL_MAJOR - #define AF_LAPACK_COL_MAJOR 0 +#ifdef USE_MKL + #include #else - #ifdef USE_MKL - #include - #else + #ifdef __APPLE__ + #include + #include + #undef AF_LAPACK_COL_MAJOR + #define AF_LAPACK_COL_MAJOR 0 + #else // NETLIB LAPACKE #include #endif -#endif //OS +#endif #endif // WITH_OPENCL_LINEAR_ALGEBRA //********************************************************/ // BLAS //********************************************************/ -#ifdef __APPLE__ - #include +#ifdef USE_MKL + #include #else - #ifdef USE_MKL - #include + #ifdef __APPLE__ + #include #else extern "C" { - #include + #include } #endif #endif diff --git a/src/backend/opencl/magma/magma_cpu_blas.h b/src/backend/opencl/magma/magma_cpu_blas.h index b3cba096b5..6661aad657 100644 --- a/src/backend/opencl/magma/magma_cpu_blas.h +++ b/src/backend/opencl/magma/magma_cpu_blas.h @@ -13,16 +13,16 @@ #include #include "magma_types.h" -#ifdef __APPLE__ -#include -#else #ifdef USE_MKL -#include + #include #else -extern "C" { -#include -} -#endif + #ifdef __APPLE__ + #include + #else + extern "C" { + #include + } + #endif #endif // Todo: Ask upstream for a more official way to detect it diff --git a/src/backend/opencl/magma/magma_cpu_lapack.h b/src/backend/opencl/magma/magma_cpu_lapack.h index 5974dab8a9..54c26ae0e9 100644 --- a/src/backend/opencl/magma/magma_cpu_lapack.h +++ b/src/backend/opencl/magma/magma_cpu_lapack.h @@ -39,16 +39,20 @@ int LAPACKE_dlacgv_work(Args... 
args) { return 0; } #define ORDER_TYPE int #define LAPACK_NAME(fn) LAPACKE_##fn -#if defined(__APPLE__) - #define LAPACK_COL_MAJOR 102 - #include "../../lapacke.hpp" +#ifdef USE_MKL + #include #else - #ifdef USE_MKL - #include + #ifdef __APPLE__ + #include + #include + #undef LAPACK_COL_MAJOR + #define LAPACK_COL_MAJOR 102 + #undef AF_LAPACK_COL_MAJOR + #define AF_LAPACK_COL_MAJOR 0 #else // NETLIB LAPACKE #include - #endif // MKL/NETLIB -#endif //APPLE + #endif +#endif #define LAPACKE_CHECK(fn) do { \ int __info = fn; \ From cc59efa6f88dbf03682a7e3e3af09587adc98dc8 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 22 Feb 2016 16:40:32 -0500 Subject: [PATCH 274/288] Fix to MemoryManager in debug mode --- src/backend/MemoryManager.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp index b66dfc33e7..a761162cdc 100644 --- a/src/backend/MemoryManager.cpp +++ b/src/backend/MemoryManager.cpp @@ -122,6 +122,8 @@ void MemoryManager::unlock(void *ptr, bool user_unlock) // Just free memory in debug mode if ((iter->second).bytes > 0) { this->nativeFree(iter->first); + current.total_buffers--; + current.total_bytes -= iter->second.bytes; } } else { // In regular mode, move buffer to free map From d6d08f96384a1b1b441f61caafe3a41001eb0482 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 22 Feb 2016 16:40:54 -0500 Subject: [PATCH 275/288] Clear the free_map after calling garbageCollect in MemoryManager --- src/backend/MemoryManager.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp index a761162cdc..a2bbb7d628 100644 --- a/src/backend/MemoryManager.cpp +++ b/src/backend/MemoryManager.cpp @@ -87,6 +87,7 @@ void MemoryManager::garbageCollect() kv.second.pop_back(); } } + current.free_map.clear(); } void MemoryManager::unlock(void *ptr, bool user_unlock) From 6325406afa60d5d98a2c4e349fb92e994b33fbab Mon Sep 17 00:00:00 2001 From: 
Pavan Yalamanchili Date: Mon, 22 Feb 2016 16:42:11 -0500 Subject: [PATCH 276/288] Exit early when destructor is called on empty arrays. This should speed things up when a lot of buffers are present in the MemoryManager. --- src/backend/MemoryManager.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/backend/MemoryManager.cpp b/src/backend/MemoryManager.cpp index a2bbb7d628..5910eabeea 100644 --- a/src/backend/MemoryManager.cpp +++ b/src/backend/MemoryManager.cpp @@ -92,6 +92,9 @@ void MemoryManager::garbageCollect() void MemoryManager::unlock(void *ptr, bool user_unlock) { + // Shortcut for empty arrays + if (!ptr) return; + lock_guard_t lock(this->memory_mutex); memory_info& current = this->getCurrentMemoryInfo(); From d4fb656e09dd89994ae65bf643cfefe2f58f3f47 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 22 Feb 2016 17:59:53 -0500 Subject: [PATCH 277/288] Added support for finding MKL RT on OSX for BLAS, LAPACKE, FFTW * Uses INTEL_MKL_ROOT as enviornment variable. 
Commonly /opt/intel/mkl * If using RT, then add /opt/intel/mkl/lib and /opt/intel/compilers_and_libraries/mac/lib/ to DYLD_LIBRARY_PATH --- CMakeModules/FindCBLAS.cmake | 26 +++++++++++++++++--------- CMakeModules/FindFFTW.cmake | 29 ++++++++++++++++++++++++----- CMakeModules/FindLAPACKE.cmake | 8 ++++++-- src/backend/cpu/CMakeLists.txt | 10 +++++++++- src/backend/cuda/CMakeLists.txt | 10 +++++++++- src/backend/opencl/CMakeLists.txt | 10 +++++++++- 6 files changed, 74 insertions(+), 19 deletions(-) diff --git a/CMakeModules/FindCBLAS.cmake b/CMakeModules/FindCBLAS.cmake index efef36b093..db1d783e9e 100644 --- a/CMakeModules/FindCBLAS.cmake +++ b/CMakeModules/FindCBLAS.cmake @@ -62,28 +62,36 @@ IF(NOT CBLAS_ROOT_DIR) IF (ENV{CBLASDIR}) SET(CBLAS_ROOT_DIR $ENV{CBLASDIR}) IF ("${SIZE_OF_VOIDP}" EQUAL 8) - SET(CBLAS_LIB64_DIR "${INTEL_MKL_ROOT_DIR}/lib64") + SET(CBLAS_LIB64_DIR "${CBLAS_ROOT_DIR}/lib64") ELSE() - SET(CBLAS_LIB32_DIR "${INTEL_MKL_ROOT_DIR}/lib") + SET(CBLAS_LIB32_DIR "${CBLAS_ROOT_DIR}/lib") ENDIF() ENDIF() IF (ENV{CBLAS_ROOT_DIR}) SET(CBLAS_ROOT_DIR $ENV{CBLAS_ROOT_DIR}) IF ("${SIZE_OF_VOIDP}" EQUAL 8) - SET(CBLAS_LIB64_DIR "${INTEL_MKL_ROOT_DIR}/lib64") + SET(CBLAS_LIB64_DIR "${CBLAS_ROOT_DIR}/lib64") ELSE() - SET(CBLAS_LIB32_DIR "${INTEL_MKL_ROOT_DIR}/lib") + SET(CBLAS_LIB32_DIR "${CBLAS_ROOT_DIR}/lib") ENDIF() ENDIF() IF (INTEL_MKL_ROOT_DIR) SET(CBLAS_ROOT_DIR ${INTEL_MKL_ROOT_DIR}) - IF ("${SIZE_OF_VOIDP}" EQUAL 8) - SET(CBLAS_LIB64_DIR "${INTEL_MKL_ROOT_DIR}/lib/intel64") - ELSE() - SET(CBLAS_LIB32_DIR "${INTEL_MKL_ROOT_DIR}/lib/ia32") - ENDIF() + IF(APPLE) + IF ("${SIZE_OF_VOIDP}" EQUAL 8) + SET(CBLAS_LIB64_DIR "${CBLAS_ROOT_DIR}/lib") + ELSE() + SET(CBLAS_LIB32_DIR "${CBLAS_ROOT_DIR}/lib") + ENDIF() + ELSE(APPLE) # Windows and Linux + IF ("${SIZE_OF_VOIDP}" EQUAL 8) + SET(CBLAS_LIB64_DIR "${CBLAS_ROOT_DIR}/lib/intel64") + ELSE() + SET(CBLAS_LIB32_DIR "${CBLAS_ROOT_DIR}/lib/ia32") + ENDIF() + ENDIF(APPLE) ENDIF() SET(CBLAS_INCLUDE_DIR 
"${CBLAS_ROOT_DIR}/include") diff --git a/CMakeModules/FindFFTW.cmake b/CMakeModules/FindFFTW.cmake index a725f64ecd..3156cec89b 100644 --- a/CMakeModules/FindFFTW.cmake +++ b/CMakeModules/FindFFTW.cmake @@ -24,6 +24,25 @@ IF(NOT FFTW_ROOT AND ENV{FFTWDIR}) SET(FFTW_ROOT $ENV{FFTWDIR}) ENDIF() +IF (NOT INTEL_MKL_ROOT_DIR) + SET(INTEL_MKL_ROOT_DIR $ENV{INTEL_MKL_ROOT}) +ENDIF() + +IF(NOT FFTW_ROOT) + + IF (ENV{FFTWDIR}) + SET(FFTW_ROOT $ENV{FFTWDIR}) + ENDIF() + + IF (ENV{FFTW_ROOT_DIR}) + SET(FFTW_ROOT $ENV{FFTW_ROOT_DIR}) + ENDIF() + + IF (INTEL_MKL_ROOT_DIR) + SET(FFTW_ROOT ${INTEL_MKL_ROOT_DIR}) + ENDIF() +ENDIF() + # Check if we can use PkgConfig FIND_PACKAGE(PkgConfig) @@ -44,14 +63,14 @@ IF(FFTW_ROOT) #find libs FIND_LIBRARY( FFTW_LIB - NAMES "fftw3" "libfftw3-3" "fftw3-3" + NAMES "fftw3" "libfftw3-3" "fftw3-3" "mkl_rt" PATHS ${FFTW_ROOT} PATH_SUFFIXES "lib" "lib64" NO_DEFAULT_PATH ) FIND_LIBRARY( FFTWF_LIB - NAMES "fftw3f" "libfftw3f-3" "fftw3f-3" + NAMES "fftw3f" "libfftw3f-3" "fftw3f-3" "mkl_rt" PATHS ${FFTW_ROOT} PATH_SUFFIXES "lib" "lib64" NO_DEFAULT_PATH @@ -62,18 +81,18 @@ IF(FFTW_ROOT) FFTW_INCLUDES NAMES "fftw3.h" PATHS ${FFTW_ROOT} - PATH_SUFFIXES "include" + PATH_SUFFIXES "include" "include/fftw" NO_DEFAULT_PATH ) ELSE() FIND_LIBRARY( FFTW_LIB - NAMES "fftw3" + NAMES "fftw3" "mkl_rt" PATHS ${PKG_FFTW_LIBRARY_DIRS} ${LIB_INSTALL_DIR} ) FIND_LIBRARY( FFTWF_LIB - NAMES "fftw3f" + NAMES "fftw3f" "mkl_rt" PATHS ${PKG_FFTW_LIBRARY_DIRS} ${LIB_INSTALL_DIR} ) FIND_PATH( diff --git a/CMakeModules/FindLAPACKE.cmake b/CMakeModules/FindLAPACKE.cmake index dc4a045370..0732cfaa83 100644 --- a/CMakeModules/FindLAPACKE.cmake +++ b/CMakeModules/FindLAPACKE.cmake @@ -141,8 +141,12 @@ ELSE(PC_LAPACKE_FOUND) ENDIF(LAPACKE_ROOT_DIR) ENDIF(PC_LAPACKE_FOUND) -SET(LAPACK_LIBRARIES ${LAPACKE_LIB} ${LAPACK_LIB}) -SET(LAPACK_INCLUDE_DIR ${LAPACKE_INCLUDES}) +IF(LAPACKE_LIB AND LAPACK_LIB) + SET(LAPACK_LIBRARIES ${LAPACKE_LIB} ${LAPACK_LIB}) +ENDIF() +IF(LAPACKE_INCLUDES) + 
SET(LAPACK_INCLUDE_DIR ${LAPACKE_INCLUDES}) +ENDIF() INCLUDE(FindPackageHandleStandardArgs) FIND_PACKAGE_HANDLE_STANDARD_ARGS(LAPACK DEFAULT_MSG diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt index 2032f0b7e9..9387323592 100644 --- a/src/backend/cpu/CMakeLists.txt +++ b/src/backend/cpu/CMakeLists.txt @@ -39,7 +39,15 @@ MESSAGE(STATUS "FFTW Found ? ${FFTW_FOUND}") MESSAGE(STATUS "FFTW Library: ${FFTW_LIBRARIES}") IF(APPLE) - FIND_PACKAGE(LAPACK) + FIND_PACKAGE(LAPACKE QUIET) # For finding MKL + IF(NOT LAPACK_FOUND) + # UNSET THE VARIABLES FROM LAPACKE + UNSET(LAPACKE_LIB CACHE) + UNSET(LAPACK_LIB CACHE) + UNSET(LAPACKE_INCLUDES CACHE) + UNSET(LAPACKE_ROOT_DIR CACHE) + FIND_PACKAGE(LAPACK) + ENDIF() ELSE(APPLE) # Linux and Windows FIND_PACKAGE(LAPACKE) ENDIF(APPLE) diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index 8cecd812f2..ae0690dba2 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -102,7 +102,15 @@ ELSE(CUDA_cusolver_LIBRARY) IF(${CUDA_LAPACK_CPU_FALLBACK}) ## Try to use CPU side lapack IF(APPLE) - FIND_PACKAGE(LAPACK) + FIND_PACKAGE(LAPACKE QUIET) # For finding MKL + IF(NOT LAPACK_FOUND) + # UNSET THE VARIABLES FROM LAPACKE + UNSET(LAPACKE_LIB CACHE) + UNSET(LAPACK_LIB CACHE) + UNSET(LAPACKE_INCLUDES CACHE) + UNSET(LAPACKE_ROOT_DIR CACHE) + FIND_PACKAGE(LAPACK) + ENDIF() ELSE(APPLE) # Linux and Windows FIND_PACKAGE(LAPACKE) ENDIF(APPLE) diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt index ce45c4bdaa..bbe430df15 100644 --- a/src/backend/opencl/CMakeLists.txt +++ b/src/backend/opencl/CMakeLists.txt @@ -21,7 +21,15 @@ ELSE(USE_OPENCL_MKL) ENDIF() IF(APPLE) - FIND_PACKAGE(LAPACK) + FIND_PACKAGE(LAPACKE QUIET) # For finding MKL + IF(NOT LAPACK_FOUND) + # UNSET THE VARIABLES FROM LAPACKE + UNSET(LAPACKE_LIB CACHE) + UNSET(LAPACK_LIB CACHE) + UNSET(LAPACKE_INCLUDES CACHE) + UNSET(LAPACKE_ROOT_DIR CACHE) + FIND_PACKAGE(LAPACK) + 
ENDIF() ELSE(APPLE) # Linux and Windows FIND_PACKAGE(LAPACKE) ENDIF(APPLE) From 96baaf9b33bc33a661cde292e3aed9074b85251e Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Tue, 23 Feb 2016 14:31:33 -0500 Subject: [PATCH 278/288] Fixes to and reorganization of OSX Installer * ArrayFire.config and forge are now installed with the library component * ArrayFireConfig.cmake files are installed by all repos * Examples and Documentation and now independent components * No changes to install paths * When make osx_installer is called, it creates a new directory called osx_install_files which are then used to generate the installers * osx_installer target depends on make install being called first --- CMakeModules/osx_install/OSXInstaller.cmake | 109 ++++++++++++++++---- CMakeModules/osx_install/distribution.dist | 21 ++-- CMakeModules/osx_install/readme.html | 13 +-- 3 files changed, 102 insertions(+), 41 deletions(-) diff --git a/CMakeModules/osx_install/OSXInstaller.cmake b/CMakeModules/osx_install/OSXInstaller.cmake index dc3a8b2491..d79d68f2b6 100644 --- a/CMakeModules/osx_install/OSXInstaller.cmake +++ b/CMakeModules/osx_install/OSXInstaller.cmake @@ -8,8 +8,62 @@ SET(BIN2CPP_PROGRAM "bin2cpp") SET(OSX_INSTALL_DIR ${CMAKE_MODULE_PATH}/osx_install) +################################################################################ +## Create Directory Structure +################################################################################ +SET(OSX_TEMP "${CMAKE_BINARY_DIR}/osx_install_files") + +FILE(GLOB COMMONLIB "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_LIB_DIR}/libforge*.dylib") +FILE(GLOB COMMONCMAKE "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_CMAKE_DIR}/ArrayFireConfig*.cmake") + +MACRO(OSX_INSTALL_SETUP BACKEND LIB) + FILE(GLOB ${BACKEND}LIB "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_LIB_DIR}/lib${LIB}*.dylib") + FILE(GLOB ${BACKEND}CMAKE "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_CMAKE_DIR}/ArrayFire${BACKEND}*.cmake") + + ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_${BACKEND}) + 
FOREACH(SRC ${${BACKEND}LIB} ${COMMONLIB} ${${BACKEND}CMAKE} ${COMMONCMAKE}) + FILE(RELATIVE_PATH SRC_REL ${CMAKE_INSTALL_PREFIX} ${SRC}) + ADD_CUSTOM_COMMAND(TARGET OSX_INSTALL_SETUP_${BACKEND} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${SRC} "${OSX_TEMP}/${BACKEND}/${SRC_REL}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Copying ${BACKEND} files to temporary OSX Install Dir" + ) + ENDFOREACH() +ENDMACRO(OSX_INSTALL_SETUP) + +OSX_INSTALL_SETUP(CPU afcpu) +OSX_INSTALL_SETUP(CUDA afcuda) +OSX_INSTALL_SETUP(OpenCL afopencl) +OSX_INSTALL_SETUP(Unified af) + +# Headers +ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_INCLUDE + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${CMAKE_INSTALL_PREFIX}/include "${OSX_TEMP}/include" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Copying header files to temporary OSX Install Dir" + ) + +# Examples +ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_EXAMPLES + COMMAND ${CMAKE_COMMAND} -E copy_directory + "${CMAKE_INSTALL_PREFIX}/share/ArrayFire/examples" "${OSX_TEMP}/examples" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Copying examples files to temporary OSX Install Dir" + ) + +# Documentation +ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_DOC + COMMAND ${CMAKE_COMMAND} -E copy_directory + "${CMAKE_INSTALL_PREFIX}/share/ArrayFire/doc" "${OSX_TEMP}/doc" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Copying documentation files to temporary OSX Install Dir" + ) +################################################################################ + FUNCTION(PKG_BUILD) - CMAKE_PARSE_ARGUMENTS(ARGS "" "INSTALL_LOCATION;IDENTIFIER;PATH_TO_FILES;PKG_NAME;TARGETS;SCRIPT_DIR" "FILTERS" ${ARGN}) + CMAKE_PARSE_ARGUMENTS(ARGS "" "DEPENDS;INSTALL_LOCATION;IDENTIFIER;PATH_TO_FILES;PKG_NAME;TARGETS;SCRIPT_DIR" "FILTERS" ${ARGN}) FOREACH(filter ${ARGS_FILTERS}) LIST(APPEND FILTER_LIST --filter ${filter}) @@ -70,50 +124,61 @@ ENDFUNCTION(PRODUCT_BUILD) PKG_BUILD( PKG_NAME ArrayFireCPU - DEPENDS afcpu + DEPENDS OSX_INSTALL_SETUP_CPU TARGETS cpu_package - 
INSTALL_LOCATION /usr/local/lib + INSTALL_LOCATION /usr/local SCRIPT_DIR ${OSX_INSTALL_DIR}/cpu_scripts IDENTIFIER com.arrayfire.pkg.arrayfire.cpu.lib - PATH_TO_FILES package/lib + PATH_TO_FILES ${OSX_TEMP}/CPU FILTERS opencl cuda unified) PKG_BUILD( PKG_NAME ArrayFireCUDA - DEPENDS afcuda + DEPENDS OSX_INSTALL_SETUP_CUDA TARGETS cuda_package - INSTALL_LOCATION /usr/local/lib + INSTALL_LOCATION /usr/local SCRIPT_DIR ${OSX_INSTALL_DIR}/cuda_scripts IDENTIFIER com.arrayfire.pkg.arrayfire.cuda.lib - PATH_TO_FILES package/lib + PATH_TO_FILES ${OSX_TEMP}/CUDA FILTERS cpu opencl unified) PKG_BUILD( PKG_NAME ArrayFireOPENCL - DEPENDS afopencl + DEPENDS OSX_INSTALL_SETUP_OpenCL TARGETS opencl_package - INSTALL_LOCATION /usr/local/lib + INSTALL_LOCATION /usr/local IDENTIFIER com.arrayfire.pkg.arrayfire.opencl.lib - PATH_TO_FILES package/lib + PATH_TO_FILES ${OSX_TEMP}/OpenCL FILTERS cpu cuda unified) PKG_BUILD( PKG_NAME ArrayFireUNIFIED - DEPENDS af + DEPENDS OSX_INSTALL_SETUP_Unified TARGETS unified_package - INSTALL_LOCATION /usr/local/lib + INSTALL_LOCATION /usr/local IDENTIFIER com.arrayfire.pkg.arrayfire.unified.lib - PATH_TO_FILES package/lib + PATH_TO_FILES ${OSX_TEMP}/Unified FILTERS cpu cuda opencl) PKG_BUILD( PKG_NAME ArrayFireHeaders + DEPENDS OSX_INSTALL_SETUP_INCLUDE TARGETS header_package INSTALL_LOCATION /usr/local/include IDENTIFIER com.arrayfire.pkg.arrayfire.inc - PATH_TO_FILES package/include) - -PKG_BUILD( PKG_NAME ArrayFireExtra - TARGETS extra_package - INSTALL_LOCATION /usr/local/share - IDENTIFIER com.arrayfire.pkg.arrayfire.extra - PATH_TO_FILES package/share) - -PRODUCT_BUILD(DEPENDS ${cpu_package} ${cuda_package} ${opencl_package} ${unified_package} ${header_package} ${extra_package}) + PATH_TO_FILES ${OSX_TEMP}/include) + +PKG_BUILD( PKG_NAME ArrayFireExamples + DEPENDS OSX_INSTALL_SETUP_EXAMPLES + TARGETS examples_package + INSTALL_LOCATION /usr/local/share/ArrayFire/examples + IDENTIFIER com.arrayfire.pkg.arrayfire.examples + PATH_TO_FILES 
${OSX_TEMP}/examples + FILTERS cmake) + +PKG_BUILD( PKG_NAME ArrayFireDoc + DEPENDS OSX_INSTALL_SETUP_DOC + TARGETS doc_package + INSTALL_LOCATION /usr/local/share/ArrayFire/doc + IDENTIFIER com.arrayfire.pkg.arrayfire.doc + PATH_TO_FILES ${OSX_TEMP}/doc + FILTERS cmake) + +PRODUCT_BUILD(DEPENDS ${cpu_package} ${cuda_package} ${opencl_package} ${unified_package} ${header_package} ${examples_package} ${doc_package}) diff --git a/CMakeModules/osx_install/distribution.dist b/CMakeModules/osx_install/distribution.dist index 3dc82379c9..6c460a6a26 100644 --- a/CMakeModules/osx_install/distribution.dist +++ b/CMakeModules/osx_install/distribution.dist @@ -17,7 +17,8 @@ ArrayFireOPENCL.pkg ArrayFireUNIFIED.pkg ArrayFireHeaders.pkg - ArrayFireExtra.pkg + ArrayFireExamples.pkg + ArrayFireDoc.pkg @@ -27,26 +28,27 @@ - + + - - - + + + + + diff --git a/CMakeModules/osx_install/readme.html b/CMakeModules/osx_install/readme.html index 41d4ab8cf0..482b7add7e 100644 --- a/CMakeModules/osx_install/readme.html +++ b/CMakeModules/osx_install/readme.html @@ -5,18 +5,9 @@

Install Directories

  • Libraries will be installed in /usr/local/lib
  • Headers will be installed in /usr/local/include
  • -
  • Docs and other files will be installed in /usr/local/share
  • -
- -

Major Updates

-
    -
  • ArrayFire is now open source
  • -
  • Major changes to the visualization library
  • -
  • Introducing handle based C API
  • -
  • New backend: CPU fallback available for systems without GPUs
  • -
  • Dense linear algebra functions available for all backends
  • -
  • Support for 64 bit integers
  • +
  • Examples, documentation and CMake config files will be installed in /usr/local/share
+

For complete list of updates, visit ArrayFire Release Notes

From 529b638ad1d0f683c093ca9945aa89ec3d1cc59f Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Tue, 23 Feb 2016 15:20:47 -0500 Subject: [PATCH 279/288] OSX Installer: Move libforge and ArrayFireConfig into common sub package * libforge and ArrayFireConfig.cmake files are now in a common package * This package is not visible at install time * The package is enabled if any of the backends are enabled (like unified) * This is done so that the common files are installed only once rather than by each backend package --- CMakeModules/osx_install/OSXInstaller.cmake | 25 +++++++++++++++++++-- CMakeModules/osx_install/distribution.dist | 14 ++++++++++-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/CMakeModules/osx_install/OSXInstaller.cmake b/CMakeModules/osx_install/OSXInstaller.cmake index d79d68f2b6..b2514f8e2a 100644 --- a/CMakeModules/osx_install/OSXInstaller.cmake +++ b/CMakeModules/osx_install/OSXInstaller.cmake @@ -13,15 +13,28 @@ SET(OSX_INSTALL_DIR ${CMAKE_MODULE_PATH}/osx_install) ################################################################################ SET(OSX_TEMP "${CMAKE_BINARY_DIR}/osx_install_files") +# Common files - libforge, ArrayFireConfig*.cmake FILE(GLOB COMMONLIB "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_LIB_DIR}/libforge*.dylib") FILE(GLOB COMMONCMAKE "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_CMAKE_DIR}/ArrayFireConfig*.cmake") +ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_COMMON) +FOREACH(SRC ${COMMONLIB} ${COMMONCMAKE}) + FILE(RELATIVE_PATH SRC_REL ${CMAKE_INSTALL_PREFIX} ${SRC}) + ADD_CUSTOM_COMMAND(TARGET OSX_INSTALL_SETUP_COMMON PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${SRC} "${OSX_TEMP}/common/${SRC_REL}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Copying Common files to temporary OSX Install Dir" + ) +ENDFOREACH() + +# Backends - CPU, CUDA, OpenCL, Unified MACRO(OSX_INSTALL_SETUP BACKEND LIB) FILE(GLOB ${BACKEND}LIB "${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_LIB_DIR}/lib${LIB}*.dylib") FILE(GLOB ${BACKEND}CMAKE 
"${CMAKE_INSTALL_PREFIX}/${AF_INSTALL_CMAKE_DIR}/ArrayFire${BACKEND}*.cmake") ADD_CUSTOM_TARGET(OSX_INSTALL_SETUP_${BACKEND}) - FOREACH(SRC ${${BACKEND}LIB} ${COMMONLIB} ${${BACKEND}CMAKE} ${COMMONCMAKE}) + FOREACH(SRC ${${BACKEND}LIB} ${${BACKEND}CMAKE}) FILE(RELATIVE_PATH SRC_REL ${CMAKE_INSTALL_PREFIX} ${SRC}) ADD_CUSTOM_COMMAND(TARGET OSX_INSTALL_SETUP_${BACKEND} PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy @@ -157,6 +170,14 @@ PKG_BUILD( PKG_NAME ArrayFireUNIFIED PATH_TO_FILES ${OSX_TEMP}/Unified FILTERS cpu cuda opencl) +PKG_BUILD( PKG_NAME ArrayFireCommon + DEPENDS OSX_INSTALL_SETUP_COMMON + TARGETS common_package + INSTALL_LOCATION /usr/local + IDENTIFIER com.arrayfire.pkg.arrayfire.libcommon + PATH_TO_FILES ${OSX_TEMP}/common + FILTERS cpu cuda opencl unified) + PKG_BUILD( PKG_NAME ArrayFireHeaders DEPENDS OSX_INSTALL_SETUP_INCLUDE TARGETS header_package @@ -180,5 +201,5 @@ PKG_BUILD( PKG_NAME ArrayFireDoc PATH_TO_FILES ${OSX_TEMP}/doc FILTERS cmake) -PRODUCT_BUILD(DEPENDS ${cpu_package} ${cuda_package} ${opencl_package} ${unified_package} ${header_package} ${examples_package} ${doc_package}) +PRODUCT_BUILD(DEPENDS ${cpu_package} ${cuda_package} ${opencl_package} ${unified_package} ${common_package} ${header_package} ${examples_package} ${doc_package}) diff --git a/CMakeModules/osx_install/distribution.dist b/CMakeModules/osx_install/distribution.dist index 6c460a6a26..b476bf013f 100644 --- a/CMakeModules/osx_install/distribution.dist +++ b/CMakeModules/osx_install/distribution.dist @@ -19,6 +19,7 @@ ArrayFireHeaders.pkg ArrayFireExamples.pkg ArrayFireDoc.pkg + ArrayFireCommon.pkg @@ -26,14 +27,15 @@ + - @@ -55,6 +57,14 @@ enabled="CheckBackendSelected()"> + + + From 571f0caed833a5fc1e2ff2629f47bd15b273c86c Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 23 Feb 2016 17:00:45 -0500 Subject: [PATCH 280/288] Renaming ambiguous getInfo to getDeviceInfo --- src/api/c/device.cpp | 6 +++--- src/backend/cpu/platform.cpp | 2 +- 
src/backend/cpu/platform.hpp | 2 +- src/backend/cuda/platform.cpp | 24 ++++++++++++------------ src/backend/cuda/platform.hpp | 7 ++----- src/backend/opencl/platform.cpp | 2 +- src/backend/opencl/platform.hpp | 4 ++-- 7 files changed, 22 insertions(+), 25 deletions(-) diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index 937b0a66c5..6c089f57c0 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -65,7 +65,7 @@ af_err af_init() try { static bool first = true; if(first) { - getInfo(); + getDeviceInfo(); first = false; } } CATCHALL; @@ -75,7 +75,7 @@ af_err af_init() af_err af_info() { try { - printf("%s", getInfo().c_str()); + printf("%s", getDeviceInfo().c_str()); } CATCHALL; return AF_SUCCESS; } @@ -83,7 +83,7 @@ af_err af_info() af_err af_info_string(char **str, const bool verbose) { try { - std::string infoStr = getInfo(); + std::string infoStr = getDeviceInfo(); af_alloc_host((void**)str, sizeof(char) * (infoStr.size() + 1)); // Need to do a deep copy diff --git a/src/backend/cpu/platform.cpp b/src/backend/cpu/platform.cpp index 7e6bc81e43..9474c792f3 100644 --- a/src/backend/cpu/platform.cpp +++ b/src/backend/cpu/platform.cpp @@ -224,7 +224,7 @@ static inline std::string &ltrim(std::string &s) return s; } -std::string getInfo() +std::string getDeviceInfo() { std::ostringstream info; static CPUInfo cinfo; diff --git a/src/backend/cpu/platform.hpp b/src/backend/cpu/platform.hpp index 82ed42c8f9..7caddccc72 100644 --- a/src/backend/cpu/platform.hpp +++ b/src/backend/cpu/platform.hpp @@ -16,7 +16,7 @@ namespace cpu { int getBackend(); - std::string getInfo(); + std::string getDeviceInfo(); bool isDoubleSupported(int device); diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp index 67f3f08428..10cfdc886c 100644 --- a/src/backend/cuda/platform.cpp +++ b/src/backend/cuda/platform.cpp @@ -150,18 +150,6 @@ int getBackend() return AF_BACKEND_CUDA; } -string getInfo() -{ - ostringstream info; - info << "ArrayFire v" << AF_VERSION 
- << " (CUDA, " << get_system() << ", build " << AF_REVISION << ")" << std::endl; - info << getPlatformInfo(); - for (int i = 0; i < getDeviceCount(); ++i) { - info << getDeviceInfo(i); - } - return info.str(); -} - string getDeviceInfo(int device) { cudaDeviceProp dev = getDeviceProp(device); @@ -186,6 +174,18 @@ string getDeviceInfo(int device) return info; } +string getDeviceInfo() +{ + ostringstream info; + info << "ArrayFire v" << AF_VERSION + << " (CUDA, " << get_system() << ", build " << AF_REVISION << ")" << std::endl; + info << getPlatformInfo(); + for (int i = 0; i < getDeviceCount(); ++i) { + info << getDeviceInfo(i); + } + return info.str(); +} + string getPlatformInfo() { string driverVersion = getDriverVersion(); diff --git a/src/backend/cuda/platform.hpp b/src/backend/cuda/platform.hpp index 6b4186b2c2..3fcc67ea5b 100644 --- a/src/backend/cuda/platform.hpp +++ b/src/backend/cuda/platform.hpp @@ -22,8 +22,7 @@ namespace cuda int getBackend(); -std::string getInfo(); - +std::string getDeviceInfo(); std::string getDeviceInfo(int device); std::string getPlatformInfo(); @@ -32,8 +31,6 @@ std::string getDriverVersion(); std::string getCUDARuntimeVersion(); -std::string getInfo(); - bool isDoubleSupported(int device); void devprop(char* d_name, char* d_platform, char *d_toolkit, char* d_compute); @@ -82,7 +79,7 @@ class DeviceManager friend std::string getCUDARuntimeVersion(); - friend std::string getInfo(); + friend std::string getDeviceInfo(); friend int getDeviceCount(); diff --git a/src/backend/opencl/platform.cpp b/src/backend/opencl/platform.cpp index c2c13c7ae3..dc8ab4ea65 100644 --- a/src/backend/opencl/platform.cpp +++ b/src/backend/opencl/platform.cpp @@ -383,7 +383,7 @@ static std::string platformMap(std::string &platStr) } } -std::string getInfo() +std::string getDeviceInfo() { ostringstream info; info << "ArrayFire v" << AF_VERSION diff --git a/src/backend/opencl/platform.hpp b/src/backend/opencl/platform.hpp index 095fdf9ae7..42579f89d1 100644 
--- a/src/backend/opencl/platform.hpp +++ b/src/backend/opencl/platform.hpp @@ -21,7 +21,7 @@ namespace opencl class DeviceManager { - friend std::string getInfo(); + friend std::string getDeviceInfo(); friend int getDeviceCount(); @@ -92,7 +92,7 @@ class DeviceManager int getBackend(); -std::string getInfo(); +std::string getDeviceInfo(); int getDeviceCount(); From f0d11b30427e2199a4121a4de4819783ebde15ee Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 23 Feb 2016 17:09:54 -0500 Subject: [PATCH 281/288] Make getInfo check if af_array belongs to current device. - This behavior can be turned off optionally --- include/af/defines.h | 7 +++ src/api/c/array.cpp | 96 +++++++++++++++++++++++++++++++++++++++ src/api/c/data.cpp | 52 --------------------- src/api/c/handle.hpp | 2 + src/api/c/imageio.cpp | 1 + src/api/c/imageio2.cpp | 1 + src/api/c/print.cpp | 1 + src/backend/ArrayInfo.cpp | 20 -------- src/backend/ArrayInfo.hpp | 6 --- 9 files changed, 108 insertions(+), 78 deletions(-) create mode 100644 src/api/c/array.cpp diff --git a/include/af/defines.h b/include/af/defines.h index 2b53baabed..77508f2870 100644 --- a/include/af/defines.h +++ b/include/af/defines.h @@ -120,6 +120,13 @@ typedef enum { AF_ERR_BATCH = 207, +#if AF_API_VERSION >= 33 + /// + /// Input does not belong to the current device. + /// + AF_ERR_DEVICE = 208, +#endif + // 300-399 Errors for missing software features /// diff --git a/src/api/c/array.cpp b/src/api/c/array.cpp new file mode 100644 index 0000000000..cefdde1d75 --- /dev/null +++ b/src/api/c/array.cpp @@ -0,0 +1,96 @@ +/******************************************************* + * Copyright (c) 2016, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. 
+ * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ +#include +#include +#include + +const ArrayInfo& +getInfo(const af_array arr, bool check) +{ + const ArrayInfo *info = static_cast(reinterpret_cast(arr)); + + if (check && info->getDevId() != detail::getActiveDeviceId()) { + AF_ERROR("Input Array not created on current device", AF_ERR_DEVICE); + } + + return *info; +} + +af_err af_get_elements(dim_t *elems, const af_array arr) +{ + try { + // Do not check for device mismatch + *elems = getInfo(arr, false).elements(); + } CATCHALL + return AF_SUCCESS; +} + +af_err af_get_type(af_dtype *type, const af_array arr) +{ + try { + // Do not check for device mismatch + *type = getInfo(arr, false).getType(); + } CATCHALL + return AF_SUCCESS; +} + +af_err af_get_dims(dim_t *d0, dim_t *d1, dim_t *d2, dim_t *d3, + const af_array in) +{ + try { + // Do not check for device mismatch + ArrayInfo info = getInfo(in, false); + *d0 = info.dims()[0]; + *d1 = info.dims()[1]; + *d2 = info.dims()[2]; + *d3 = info.dims()[3]; + } + CATCHALL + return AF_SUCCESS; +} + +af_err af_get_numdims(unsigned *nd, const af_array in) +{ + try { + // Do not check for device mismatch + ArrayInfo info = getInfo(in, false); + *nd = info.ndims(); + } + CATCHALL + return AF_SUCCESS; +} + + +#undef INSTANTIATE +#define INSTANTIATE(fn1, fn2) \ + af_err fn1(bool *result, const af_array in) \ + { \ + try { \ + ArrayInfo info = getInfo(in, false); \ + *result = info.fn2(); \ + } \ + CATCHALL \ + return AF_SUCCESS; \ + } + +INSTANTIATE(af_is_empty , isEmpty ) +INSTANTIATE(af_is_scalar , isScalar ) +INSTANTIATE(af_is_row , isRow ) +INSTANTIATE(af_is_column , isColumn ) +INSTANTIATE(af_is_vector , isVector ) +INSTANTIATE(af_is_complex , isComplex ) +INSTANTIATE(af_is_real , isReal ) +INSTANTIATE(af_is_double , isDouble ) +INSTANTIATE(af_is_single , isSingle ) +INSTANTIATE(af_is_realfloating, isRealFloating) 
+INSTANTIATE(af_is_floating , isFloating ) +INSTANTIATE(af_is_integer , isInteger ) +INSTANTIATE(af_is_bool , isBool ) + +#undef INSTANTIATE diff --git a/src/api/c/data.cpp b/src/api/c/data.cpp index 2de2f139e3..522eb7dfcb 100644 --- a/src/api/c/data.cpp +++ b/src/api/c/data.cpp @@ -539,58 +539,6 @@ af_err af_iota(af_array *result, const unsigned ndims, const dim_t * const dims, return AF_SUCCESS; } -#undef INSTANTIATE -#define INSTANTIATE(fn1, fn2) \ - af_err fn1(bool *result, const af_array in) \ - { \ - try { \ - ArrayInfo info = getInfo(in); \ - *result = info.fn2(); \ - } \ - CATCHALL \ - return AF_SUCCESS; \ - } - -INSTANTIATE(af_is_empty , isEmpty ) -INSTANTIATE(af_is_scalar , isScalar ) -INSTANTIATE(af_is_row , isRow ) -INSTANTIATE(af_is_column , isColumn ) -INSTANTIATE(af_is_vector , isVector ) -INSTANTIATE(af_is_complex , isComplex ) -INSTANTIATE(af_is_real , isReal ) -INSTANTIATE(af_is_double , isDouble ) -INSTANTIATE(af_is_single , isSingle ) -INSTANTIATE(af_is_realfloating, isRealFloating) -INSTANTIATE(af_is_floating , isFloating ) -INSTANTIATE(af_is_integer , isInteger ) -INSTANTIATE(af_is_bool , isBool ) - -#undef INSTANTIATE - -af_err af_get_dims(dim_t *d0, dim_t *d1, dim_t *d2, dim_t *d3, - const af_array in) -{ - try { - ArrayInfo info = getInfo(in); - *d0 = info.dims()[0]; - *d1 = info.dims()[1]; - *d2 = info.dims()[2]; - *d3 = info.dims()[3]; - } - CATCHALL - return AF_SUCCESS; -} - -af_err af_get_numdims(unsigned *nd, const af_array in) -{ - try { - ArrayInfo info = getInfo(in); - *nd = info.ndims(); - } - CATCHALL - return AF_SUCCESS; -} - template static inline void eval(af_array arr) { diff --git a/src/api/c/handle.hpp b/src/api/c/handle.hpp index 70f17eb18e..ac7b74a193 100644 --- a/src/api/c/handle.hpp +++ b/src/api/c/handle.hpp @@ -16,6 +16,8 @@ #include #include +const ArrayInfo& getInfo(const af_array arr, bool check = true); + template static const detail::Array & getArray(const af_array &arr) diff --git a/src/api/c/imageio.cpp 
b/src/api/c/imageio.cpp index 9f996eb64e..5e3f7a59cb 100644 --- a/src/api/c/imageio.cpp +++ b/src/api/c/imageio.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/src/api/c/imageio2.cpp b/src/api/c/imageio2.cpp index cfad2faa7b..76c53f4ab4 100644 --- a/src/api/c/imageio2.cpp +++ b/src/api/c/imageio2.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/src/api/c/print.cpp b/src/api/c/print.cpp index b243491832..66133503ef 100644 --- a/src/api/c/print.cpp +++ b/src/api/c/print.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include diff --git a/src/backend/ArrayInfo.cpp b/src/backend/ArrayInfo.cpp index 43d2627a84..a835353453 100644 --- a/src/backend/ArrayInfo.cpp +++ b/src/backend/ArrayInfo.cpp @@ -18,26 +18,6 @@ using af::dim4; -const ArrayInfo& -getInfo(af_array arr) -{ - const ArrayInfo *info = static_cast(reinterpret_cast(arr)); - return *info; -} - -af_err -af_get_elements(dim_t *elems, const af_array arr) -{ - *elems = getInfo(arr).elements(); - return AF_SUCCESS; //FIXME: Catch exceptions correctly -} - -af_err af_get_type(af_dtype *type, const af_array arr) -{ - *type = getInfo(arr).getType(); - return AF_SUCCESS; //FIXME: Catch exceptions correctly -} - dim4 calcStrides(const dim4 &parentDim) { dim4 out(1, 1, 1, 1); diff --git a/src/backend/ArrayInfo.hpp b/src/backend/ArrayInfo.hpp index 0983f06f28..88ba26b6aa 100644 --- a/src/backend/ArrayInfo.hpp +++ b/src/backend/ArrayInfo.hpp @@ -140,12 +140,6 @@ class ArrayInfo static_assert(std::is_standard_layout::value, "ArrayInfo must be a standard layout type"); #endif -// Returns size and time info for an array object. -// Note this doesn't require template parameters. 
-const ArrayInfo& -getInfo(const af_array arr); - - af::dim4 toDims(const std::vector& seqs, const af::dim4 &parentDims); af::dim4 toOffset(const std::vector& seqs, const af::dim4 &parentDims); From 0258883fe2a219c90456f97d86fd340c9f56940a Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 23 Feb 2016 18:19:36 -0500 Subject: [PATCH 282/288] BUGFIX: Fixing getId() from ArrayInfo - device id now occupies the last 8 bits. --- src/backend/ArrayInfo.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/backend/ArrayInfo.cpp b/src/backend/ArrayInfo.cpp index a835353453..0937641afc 100644 --- a/src/backend/ArrayInfo.cpp +++ b/src/backend/ArrayInfo.cpp @@ -35,33 +35,33 @@ int ArrayInfo::getDevId() const { // The actual device ID is only stored in the first 4 bits of devId // See ArrayInfo.hpp for more - return devId & 0xf; + return devId & 0xff; } void ArrayInfo::setId(int id) const { - // 1 << (backendId + 3) sets the 4th, 5th or 6th bit of devId to 1 + // 1 << (backendId + 8) sets the 9th, 10th or 11th bit of devId to 1 // for CPU, CUDA and OpenCL respectively // See ArrayInfo.hpp for more int backendId = detail::getBackend() >> 1; // Convert enums 1, 2, 4 to ints 0, 1, 2 - const_cast(this)->setId(id | 1 << (backendId + 3)); + const_cast(this)->setId(id | 1 << (backendId + 8)); } void ArrayInfo::setId(int id) { - // 1 << (backendId + 3) sets the 4th, 5th or 6th bit of devId to 1 + // 1 << (backendId + 3) sets the 9th, 10th or 11th bit of devId to 1 // for CPU, CUDA and OpenCL respectively // See ArrayInfo.hpp for more int backendId = detail::getBackend() >> 1; // Convert enums 1, 2, 4 to ints 0, 1, 2 - devId = id | 1 << (backendId + 3); + devId = id | 1 << (backendId + 8); } af_backend ArrayInfo::getBackendId() const { - // devId >> 3 converts the backend info to 1, 2, 4 which are enums + // devId >> 8 converts the backend info to 1, 2, 4 which are enums // for CPU, CUDA and OpenCL respectively // See ArrayInfo.hpp for more - 
int backendId = devId >> 3; + int backendId = devId >> 8; return (af_backend)backendId; } From c38cc2d989fae401daa6cd6a7621f4e6527844b0 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Wed, 24 Feb 2016 15:16:47 -0500 Subject: [PATCH 283/288] BUGFIX: Ensure set operations work on vectors only --- src/api/c/set.cpp | 24 +++++++++++++++++++----- src/backend/cuda/set.cu | 14 +++++++------- src/backend/opencl/set.cpp | 14 +++++++------- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/src/api/c/set.cpp b/src/api/c/set.cpp index 1643fad95b..db9b5782e5 100644 --- a/src/api/c/set.cpp +++ b/src/api/c/set.cpp @@ -28,7 +28,9 @@ af_err af_set_unique(af_array *out, const af_array in, const bool is_sorted) { try { - af_dtype type = getInfo(in).getType(); + ArrayInfo in_info = getInfo(in); + ARG_ASSERT(1, in_info.isVector()); + af_dtype type = in_info.getType(); af_array res; switch(type) { @@ -62,8 +64,14 @@ af_err af_set_union(af_array *out, const af_array first, const af_array second, { try { - af_dtype first_type = getInfo(first).getType(); - af_dtype second_type = getInfo(second).getType(); + ArrayInfo first_info = getInfo(first); + ArrayInfo second_info = getInfo(second); + + ARG_ASSERT(1, first_info.isVector()); + ARG_ASSERT(1, second_info.isVector()); + + af_dtype first_type = first_info.getType(); + af_dtype second_type = second_info.getType(); ARG_ASSERT(1, first_type == second_type); @@ -98,8 +106,14 @@ af_err af_set_intersect(af_array *out, const af_array first, const af_array seco { try { - af_dtype first_type = getInfo(first).getType(); - af_dtype second_type = getInfo(second).getType(); + ArrayInfo first_info = getInfo(first); + ArrayInfo second_info = getInfo(second); + + ARG_ASSERT(1, first_info.isVector()); + ARG_ASSERT(1, second_info.isVector()); + + af_dtype first_type = first_info.getType(); + af_dtype second_type = second_info.getType(); ARG_ASSERT(1, first_type == second_type); diff --git a/src/backend/cuda/set.cu 
b/src/backend/cuda/set.cu index 63501d3f2a..4629b8b3dc 100644 --- a/src/backend/cuda/set.cu +++ b/src/backend/cuda/set.cu @@ -32,7 +32,7 @@ namespace cuda Array out = copyArray(in); thrust::device_ptr out_ptr = thrust::device_pointer_cast(out.get()); - thrust::device_ptr out_ptr_end = out_ptr + out.dims()[0]; + thrust::device_ptr out_ptr_end = out_ptr + out.elements(); if(!is_sorted) THRUST_SELECT(thrust::sort, out_ptr, out_ptr_end); thrust::device_ptr out_ptr_last; @@ -55,14 +55,14 @@ namespace cuda unique_second = setUnique(second, false); } - dim_t out_size = unique_first.dims()[0] + unique_second.dims()[0]; + dim_t out_size = unique_first.elements() + unique_second.elements(); Array out = createEmptyArray(dim4(out_size)); thrust::device_ptr first_ptr = thrust::device_pointer_cast(unique_first.get()); - thrust::device_ptr first_ptr_end = first_ptr + unique_first.dims()[0]; + thrust::device_ptr first_ptr_end = first_ptr + unique_first.elements(); thrust::device_ptr second_ptr = thrust::device_pointer_cast(unique_second.get()); - thrust::device_ptr second_ptr_end = second_ptr + unique_second.dims()[0]; + thrust::device_ptr second_ptr_end = second_ptr + unique_second.elements(); thrust::device_ptr out_ptr = thrust::device_pointer_cast(out.get()); @@ -87,14 +87,14 @@ namespace cuda unique_second = setUnique(second, false); } - dim_t out_size = std::max(unique_first.dims()[0], unique_second.dims()[0]); + dim_t out_size = std::max(unique_first.elements(), unique_second.elements()); Array out = createEmptyArray(dim4(out_size)); thrust::device_ptr first_ptr = thrust::device_pointer_cast(unique_first.get()); - thrust::device_ptr first_ptr_end = first_ptr + unique_first.dims()[0]; + thrust::device_ptr first_ptr_end = first_ptr + unique_first.elements(); thrust::device_ptr second_ptr = thrust::device_pointer_cast(unique_second.get()); - thrust::device_ptr second_ptr_end = second_ptr + unique_second.dims()[0]; + thrust::device_ptr second_ptr_end = second_ptr + 
unique_second.elements(); thrust::device_ptr out_ptr = thrust::device_pointer_cast(out.get()); diff --git a/src/backend/opencl/set.cpp b/src/backend/opencl/set.cpp index 5604ff4ad9..c37b7c4c4e 100644 --- a/src/backend/opencl/set.cpp +++ b/src/backend/opencl/set.cpp @@ -53,7 +53,7 @@ namespace opencl compute::buffer out_data((*out.get())()); compute::buffer_iterator< type_t > begin(out_data, 0); - compute::buffer_iterator< type_t > end(out_data, out.dims()[0]); + compute::buffer_iterator< type_t > end(out_data, out.elements()); if (!is_sorted) { compute::sort(begin, end, queue); @@ -83,7 +83,7 @@ namespace opencl unique_second = setUnique(second, false); } - size_t out_size = unique_first.dims()[0] + unique_second.dims()[0]; + size_t out_size = unique_first.elements() + unique_second.elements(); Array out = createEmptyArray(dim4(out_size, 1, 1, 1)); compute::command_queue queue(getQueue()()); @@ -93,9 +93,9 @@ namespace opencl compute::buffer out_data((*out.get())()); compute::buffer_iterator< type_t > first_begin(first_data, 0); - compute::buffer_iterator< type_t > first_end(first_data, unique_first.dims()[0]); + compute::buffer_iterator< type_t > first_end(first_data, unique_first.elements()); compute::buffer_iterator< type_t > second_begin(second_data, 0); - compute::buffer_iterator< type_t > second_end(second_data, unique_second.dims()[0]); + compute::buffer_iterator< type_t > second_end(second_data, unique_second.elements()); compute::buffer_iterator< type_t > out_begin(out_data, 0); compute::buffer_iterator< type_t > out_end = compute::set_union( @@ -124,7 +124,7 @@ namespace opencl unique_second = setUnique(second, false); } - size_t out_size = std::max(unique_first.dims()[0], unique_second.dims()[0]); + size_t out_size = std::max(unique_first.elements(), unique_second.elements()); Array out = createEmptyArray(dim4(out_size, 1, 1, 1)); compute::command_queue queue(getQueue()()); @@ -134,9 +134,9 @@ namespace opencl compute::buffer out_data((*out.get())()); 
compute::buffer_iterator< type_t > first_begin(first_data, 0); - compute::buffer_iterator< type_t > first_end(first_data, unique_first.dims()[0]); + compute::buffer_iterator< type_t > first_end(first_data, unique_first.elements()); compute::buffer_iterator< type_t > second_begin(second_data, 0); - compute::buffer_iterator< type_t > second_end(second_data, unique_second.dims()[0]); + compute::buffer_iterator< type_t > second_end(second_data, unique_second.elements()); compute::buffer_iterator< type_t > out_begin(out_data, 0); compute::buffer_iterator< type_t > out_end = compute::set_intersection( From 483163123ddbb0c60e6456148fbb8ef05f91753a Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Wed, 24 Feb 2016 15:19:49 -0500 Subject: [PATCH 284/288] DOCS: Fixing documentation for exp --- docs/details/arith.dox | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/details/arith.dox b/docs/details/arith.dox index 50f82aafed..a75c3a2cc4 100644 --- a/docs/details/arith.dox +++ b/docs/details/arith.dox @@ -448,8 +448,6 @@ Raise an array to a power Exponential of input -\copydoc arith_real_only - \defgroup arith_func_expm1 expm1 From 9b793f00927d3253fbfe28840171e34b749ebdf7 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 23 Feb 2016 18:20:35 -0500 Subject: [PATCH 285/288] FEAT,TEST,DOC: Adding function to query which device an array was created. - Adding relevant tests and docs --- docs/details/backend.dox | 9 +++++++++ include/af/backend.h | 24 ++++++++++++++++++++++++ src/api/c/device.cpp | 12 +++++++++++- src/api/cpp/device.cpp | 7 +++++++ src/api/unified/device.cpp | 6 ++++++ test/array.cpp | 31 +++++++++++++++++++++++++++++++ 6 files changed, 88 insertions(+), 1 deletion(-) diff --git a/docs/details/backend.dox b/docs/details/backend.dox index 146cc14313..893567b696 100644 --- a/docs/details/backend.dox +++ b/docs/details/backend.dox @@ -80,5 +80,14 @@ The return value specifies which backend the array was created on. 
======================================================================= +\defgroup unified_func_getdeviceid getDeviceId + +\brief Get's the id of the device an array was created on. + +\ingroup unified_func +\ingroup arrayfire_func + +======================================================================= + @} */ diff --git a/include/af/backend.h b/include/af/backend.h index 0342ef0ade..0770feb5b1 100644 --- a/include/af/backend.h +++ b/include/af/backend.h @@ -66,6 +66,18 @@ AFAPI af_err af_get_backend_id(af_backend *backend, const af_array in); AFAPI af_err af_get_active_backend(af_backend *backend); #endif +#if AF_API_VERSION >= 33 +/** + \param[out] dev contains the device on which \p in was created. + \param[in] in is the array who's device is to be queried. + \returns \ref af_err error code + + \ingroup unified_func_getdeviceid + */ +AFAPI af_err af_get_device_id(int *device, const af_array in); +#endif + + #ifdef __cplusplus } #endif @@ -121,5 +133,17 @@ AFAPI af::Backend getBackendId(const array &in); AFAPI af::Backend getActiveBackend(); #endif +#if AF_API_VERSION >= 33 +/** + \param[in] in is the array who's device is to be queried. + \returns The id of the device on which this array was created. + + \note Device ID can be the same for arrays belonging to different backends. 
+ + \ingroup unified_func_getdeviceid + */ +AFAPI int getDeviceId(const array &in); +#endif + } #endif diff --git a/src/api/c/device.cpp b/src/api/c/device.cpp index 6c089f57c0..abe0b01e32 100644 --- a/src/api/c/device.cpp +++ b/src/api/c/device.cpp @@ -48,12 +48,22 @@ af_err af_get_backend_id(af_backend *result, const af_array in) { try { ARG_ASSERT(1, in != 0); - ArrayInfo info = getInfo(in); + ArrayInfo info = getInfo(in, false); *result = info.getBackendId(); } CATCHALL; return AF_SUCCESS; } +af_err af_get_device_id(int *device, const af_array in) +{ + try { + ARG_ASSERT(1, in != 0); + ArrayInfo info = getInfo(in, false); + *device = info.getDevId(); + } CATCHALL; + return AF_SUCCESS; +} + af_err af_get_active_backend(af_backend *result) { *result = (af_backend)getBackend(); diff --git a/src/api/cpp/device.cpp b/src/api/cpp/device.cpp index 5e4b0f7bf0..faf0b0e7dd 100644 --- a/src/api/cpp/device.cpp +++ b/src/api/cpp/device.cpp @@ -42,6 +42,13 @@ namespace af return result; } + int getDeviceId(const array &in) + { + int device = getDevice();; + AF_THROW(af_get_device_id(&device, in.get())); + return device; + } + af::Backend getActiveBackend() { af::Backend result = (af::Backend)0; diff --git a/src/api/unified/device.cpp b/src/api/unified/device.cpp index fbd8e32f90..ed8e6a37f6 100644 --- a/src/api/unified/device.cpp +++ b/src/api/unified/device.cpp @@ -35,6 +35,12 @@ af_err af_get_backend_id(af_backend *result, const af_array in) return CALL(result, in); } +af_err af_get_device_id(int *device, const af_array in) +{ + CHECK_ARRAYS(in); + return CALL(device, in); +} + af_err af_get_active_backend(af_backend *result) { *result = unified::AFSymbolManager::getInstance().getActiveBackend(); diff --git a/test/array.cpp b/test/array.cpp index 6c1f511410..293b888a8f 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -454,3 +454,34 @@ TEST(Device, unequal) ASSERT_EQ(ptr, b.device()); } } + +TEST(DeviceId, Same) +{ + array a = randu(5,5); + ASSERT_EQ(getDevice(), 
getDeviceId(a)); +} + +TEST(DeviceId, Different) +{ + int ndevices = getDeviceCount(); + if (ndevices < 2) return; + + int id0 = getDevice(); + int id1 = (id0 + 1) % ndevices; + + array a = randu(5,5); + ASSERT_EQ(getDeviceId(a), id0); + setDevice(id1); + + array b = randu(5,5); + + ASSERT_EQ(getDeviceId(a), id0); + ASSERT_EQ(getDeviceId(b), id1); + ASSERT_NE(getDevice(), getDeviceId(a)); + ASSERT_EQ(getDevice(), getDeviceId(b)); + + af_array c; + af_err err = af_matmul(&c, a.get(), b.get(), AF_MAT_NONE, AF_MAT_NONE); + ASSERT_EQ(err, AF_ERR_DEVICE); + setDevice(id0); +} From e83fcafacc03b40594bee736b736a0b76f5fa24a Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 25 Feb 2016 11:19:40 -0500 Subject: [PATCH 286/288] Added release notes --- docs/pages/release_notes.md | 56 +++++++++++++++++++++++++++++++++++-- include/af/backend.h | 2 +- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/docs/pages/release_notes.md b/docs/pages/release_notes.md index 1063b054e3..738d2b0a4f 100644 --- a/docs/pages/release_notes.md +++ b/docs/pages/release_notes.md @@ -21,14 +21,34 @@ Features * [Scatter plot](https://github.com/arrayfire/arrayfire/pull/1116) added to graphics. * \ref af::transform() now supports perspective transformation matrices. * \ref af::infoString(): Returns `af::info()` as a string. +* \ref af::printMemInfo(): Print a table showing information about buffer from the memory manager + * The \ref AF_MEM_INFO macro prints numbers and total sizes of all buffers (requires including af/macros.h) * \ref af::allocHost(): Allocates memory on host. * \ref af::freeHost(): Frees host side memory allocated by arrayfire. -* Functions specific to OpenCl backend. +* OpenCL functions can now use CPU implementation. + * Currently limited to Unified Memory devices (CPU and On-board Graphics). + * Functions: af::matmul() and all [LAPACK](\ref linalg_mat) functions. + * Takes advantage of optimized libraries such as MKL without doing memory copies. 
+ * Use the environment variable `AF_OPENCL_CPU_OFFLOAD=1` to take advantage of this feature. +* Functions specific to OpenCL backend. * \ref afcl::addDevice(): Adds an external device and context to ArrayFire's device manager. * \ref afcl::deleteDevice(): Removes an external device and context from ArrayFire's device manager. * \ref afcl::setDevice(): Sets an external device and context from ArrayFire's device manager. * \ref afcl::getDeviceType(): Gets the device type of the current device. * \ref afcl::getPlatform(): Gets the platform of the current device. +* \ref af::createStridedArray() allows [array creation with user-defined strides](https://github.com/arrayfire/arrayfire/issues/1177) and a device pointer. +* [Expose functions](https://github.com/arrayfire/arrayfire/issues/1131) that provide information + about memory layout of Arrays. + * \ref af::getStrides(): Gets the strides for each dimension of the array. + * \ref af::getOffset(): Gets the offsets for each dimension of the array. + * \ref af::getRawPtr(): Gets raw pointer to the location of the array on device. + * \ref af::isLinear(): Returns true if all elements in the array are contiguous. + * \ref af::isOwner(): Returns true if the array owns the raw pointer, false if it is a sub-array. + * \ref af::getStrides(): Gets the strides of the array. + * \ref af::getStrides(): Gets the strides of the array. +* \ref af::getDeviceId(): Gets the id of the device on which the array resides. +* \ref af::isImageIOAvailable(): Returns true if ArrayFire was compiled with FreeImage enabled +* \ref af::isLAPACKAvailable(): Returns true if ArrayFire was compiled with LAPACK functions enabled Bug Fixes -------------- @@ -38,6 +58,16 @@ Bug Fixes * Fixed [imageio bugs](https://github.com/arrayfire/arrayfire/pull/1229) for 16 bit images. * Fixed [bugs when loading and storing images](https://github.com/arrayfire/arrayfire/pull/1228) natively. 
* Fixed [bug in FFT for NVIDIA GPUs](https://github.com/arrayfire/arrayfire/issues/615) when using OpenCL backend. +* Fixed [bug when using external context](https://github.com/arrayfire/arrayfire/pull/1241) with OpenCL backend. +* Fixed [memory leak](https://github.com/arrayfire/arrayfire/issues/1269) in \ref af_median_all(). +* Fixed [memory leaks and performance](https://github.com/arrayfire/arrayfire/pull/1274) in graphics functions. +* Fixed [bugs when indexing followed by moddims](https://github.com/arrayfire/arrayfire/issues/1275). +* \ref af_get_revision() now returns actual commit rather than AF_REVISION. +* Fixed [releasing arrays](https://github.com/arrayfire/arrayfire/issues/1282) when using different backends. +* OS X OpenCL: [LAPACK functions](\ref linalg_mat) on CPU devices use OpenCL offload (previously threw errors). +* [Add support for 32-bit integer image types](https://github.com/arrayfire/arrayfire/pull/1287) in Image IO. +* Fixed [set operations for row vectors](https://github.com/arrayfire/arrayfire/issues/1300) +* Fixed [bugs](https://github.com/arrayfire/arrayfire/issues/1243) in \ref af::meanShift() and af::orb(). Improvements -------------- @@ -46,6 +76,10 @@ Improvements * Performance improvements to the memory manager. * Error messages are now more detailed. * Improved sorted order for OpenCL devices. +* JIT heuristics can now be tweaked using environment variables. See + [Environment Variables](\ref configuring_environment) tutorial. +* Add `BUILD_` [options to examples and tests](https://github.com/arrayfire/arrayfire/issues/1286) + to toggle backends when compiling independently. 
Examples ---------- @@ -57,6 +91,17 @@ Build * Support for Intel `icc` compiler * Support to compile with Intel MKL as a BLAS and LAPACK provider +* Tests are now available for building as standalone (like examples) +* Tests can now be built as a single file for each backend +* Better handling of NONFREE build options +* [Searching for GLEW in CMake default paths](https://github.com/arrayfire/arrayfire/pull/1292) +* Fixes for compiling with MKL on OSX. + +Installers +---------- +* Improvements to OSX Installer + * CMake config files are now installed with libraries + * Independent options for installing examples and documentation components Deprecations ----------- @@ -67,8 +112,15 @@ Deprecations Documentation -------------- -* Fixes to documentation for matchTemplate. +* Fixes to documentation for \ref matchTemplate(). * Improved documentation for deviceInfo. +* Fixes to documentation for \ref exp(). + +Known Issues +------------ + +* [Solve OpenCL fails on NVIDIA Maxwell devices](https://github.com/arrayfire/arrayfire/issues/1246) + for f32 and c32 when M > N and K % 4 is 1 or 2. v3.2.2 diff --git a/include/af/backend.h b/include/af/backend.h index 0770feb5b1..94c4951d45 100644 --- a/include/af/backend.h +++ b/include/af/backend.h @@ -68,7 +68,7 @@ AFAPI af_err af_get_active_backend(af_backend *backend); #if AF_API_VERSION >= 33 /** - \param[out] dev contains the device on which \p in was created. + \param[out] device contains the device on which \p in was created. \param[in] in is the array who's device is to be queried. 
 \returns \ref af_err error code From 58809cb68eed2b2896ffd57b969ac87a2433bd30 Mon Sep 17 00:00:00 2001 From: pradeep Date: Thu, 25 Feb 2016 23:36:05 +0530 Subject: [PATCH 287/288] Support to set visibility of windows programmatically --- include/af/graphics.h | 23 +++++++++++++++++++++++ src/api/c/image.cpp | 22 ++++++++++++++++++++++ src/api/cpp/graphics.cpp | 5 +++++ src/api/unified/graphics.cpp | 5 +++++ 4 files changed, 55 insertions(+) diff --git a/include/af/graphics.h b/include/af/graphics.h index 7485686479..b69a83854a 100644 --- a/include/af/graphics.h +++ b/include/af/graphics.h @@ -289,6 +289,17 @@ class AFAPI Window { */ bool close(); +#if AF_API_VERSION >= 33 + /** + Hide/Show the window + + \param[in] isVisible indicates if the window is to be hidden or brought into focus + + \ingroup gfx_func_window + */ + void setVisibility(const bool isVisible); +#endif + /** This function is used to keep track of which cell in the grid mode is being currently rendered. When a user does Window(0,0), we internally @@ -547,6 +558,18 @@ AFAPI af_err af_show(const af_window wind); */ AFAPI af_err af_is_window_closed(bool *out, const af_window wind); +#if AF_API_VERSION >= 33 +/** + Hide/Show a window + + \param[in] wind is the window whose visibility is to be changed + \param[in] is_visible indicates if the window is to be hidden or brought into focus + + \ingroup gfx_func_window + */ +AFAPI af_err af_set_visibility(const af_window wind, const bool is_visible); +#endif + /** C Interface wrapper for destroying a window handle diff --git a/src/api/c/image.cpp b/src/api/c/image.cpp index db40934e50..2c523d0947 100644 --- a/src/api/c/image.cpp +++ b/src/api/c/image.cpp @@ -264,6 +264,28 @@ af_err af_is_window_closed(bool *out, const af_window wind) #endif } +af_err af_set_visibility(const af_window wind, const bool is_visible) +{ +#if defined(WITH_GRAPHICS) + if(wind==0) { + std::cerr<<"Not a valid window"<<std::endl; + return AF_SUCCESS; + } + + try { + fg::Window* wnd = reinterpret_cast<fg::Window*>(wind); + if (is_visible) + wnd->show(); + else + wnd->hide(); + } 
+ CATCHALL; + return AF_SUCCESS; +#else + AF_RETURN_ERROR("ArrayFire compiled without graphics support", AF_ERR_NO_GFX); +#endif +} + af_err af_destroy_window(const af_window wind) { #if defined(WITH_GRAPHICS) diff --git a/src/api/cpp/graphics.cpp b/src/api/cpp/graphics.cpp index 162bacb4ab..8b53825c25 100644 --- a/src/api/cpp/graphics.cpp +++ b/src/api/cpp/graphics.cpp @@ -136,4 +136,9 @@ bool Window::close() return temp; } +void Window::setVisibility(const bool isVisible) +{ + AF_THROW(af_set_visibility(get(), isVisible)); +} + } diff --git a/src/api/unified/graphics.cpp b/src/api/unified/graphics.cpp index 2895cc7afc..9e3f1c8b38 100644 --- a/src/api/unified/graphics.cpp +++ b/src/api/unified/graphics.cpp @@ -89,6 +89,11 @@ af_err af_is_window_closed(bool *out, const af_window wind) return CALL(out, wind); } +af_err af_set_visibility(const af_window wind, const bool is_visible) +{ + return CALL(wind, is_visible); +} + af_err af_destroy_window(const af_window wind) { return CALL(wind); From 5a2267461cff78c9383d3e10d93e71cdbaa0d1d8 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 25 Feb 2016 15:55:08 -0500 Subject: [PATCH 288/288] DOC Typo corrections in Installation page --- docs/pages/INSTALL.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/pages/INSTALL.md b/docs/pages/INSTALL.md index 3565889571..d31affaefe 100644 --- a/docs/pages/INSTALL.md +++ b/docs/pages/INSTALL.md @@ -108,13 +108,14 @@ First install the prerequisite packages: # Prerequisite packages: sudo apt-get install libfreeimage-dev libatlas3gf-base libfftw3-dev cmake -Ubuntu 14.04 will not have the libglfw3-dev package in its repositories. You can either build the library from source (following the instructions listed) or install the library from a PPA as follows: - -``` -sudo apt-add repository ppa:keithw/glfw3 -sudo apt-get update -sudo apt-get install glfw3 -``` +Ubuntu 14.04 will not have the libglfw3-dev package in its repositories. 
You can either build the +library from source (following the +[instructions listed here](https://github.com/arrayfire/arrayfire/wiki/GLFW-for-ArrayFire)) or +install the library from a PPA as follows: + + sudo apt-add-repository ppa:keithw/glfw3 + sudo apt-get update + sudo apt-get install glfw3 After this point, the installation should proceed identically to Ubuntu 14.10 or newer.