diff --git a/src/backend/cuda/kernel/fftconvolve.hpp b/src/backend/cuda/kernel/fftconvolve.hpp index 2acf2db621..186fb117de 100644 --- a/src/backend/cuda/kernel/fftconvolve.hpp +++ b/src/backend/cuda/kernel/fftconvolve.hpp @@ -262,8 +262,13 @@ void packDataHelper(Param sig_packed, { dim_t *sd = sig.dims; - int sig_packed_elem = sig_packed.strides[3] * sig_packed.dims[3]; - int filter_packed_elem = filter_packed.strides[3] * filter_packed.dims[3]; + int sig_packed_elem = 1; + int filter_packed_elem = 1; + + for (int i = 0; i < 4; i++) { + sig_packed_elem *= sig_packed.dims[i]; + filter_packed_elem *= filter_packed.dims[i]; + } // Number of packed complex elements in dimension 0 int sig_half_d0 = divup(sd[0], 2); @@ -292,8 +297,13 @@ void complexMultiplyHelper(Param out, CParam filter, ConvolveBatchKind kind) { - int sig_packed_elem = sig_packed.strides[3] * sig_packed.dims[3]; - int filter_packed_elem = filter_packed.strides[3] * filter_packed.dims[3]; + int sig_packed_elem = 1; + int filter_packed_elem = 1; + + for (int i = 0; i < 4; i++) { + sig_packed_elem *= sig_packed.dims[i]; + filter_packed_elem *= filter_packed.dims[i]; + } dim3 threads(THREADS); dim3 blocks(divup(sig_packed_elem / 2, threads.x)); diff --git a/src/backend/cuda/kernel/ireduce.hpp b/src/backend/cuda/kernel/ireduce.hpp index 4f9cac5baf..4a2ab14777 100644 --- a/src/backend/cuda/kernel/ireduce.hpp +++ b/src/backend/cuda/kernel/ireduce.hpp @@ -444,7 +444,7 @@ namespace kernel template T ireduce_all(uint *idx, CParam in) { - int in_elements = in.strides[3] * in.dims[3]; + int in_elements = in.dims[0] * in.dims[1] * in.dims[2] * in.dims[3]; // FIXME: Use better heuristics to get to the optimum number if (in_elements > 4096) { diff --git a/src/backend/cuda/kernel/reduce.hpp b/src/backend/cuda/kernel/reduce.hpp index be52375525..8cc720fb69 100644 --- a/src/backend/cuda/kernel/reduce.hpp +++ b/src/backend/cuda/kernel/reduce.hpp @@ -371,7 +371,7 @@ namespace kernel template To reduce_all(CParam in, bool change_nan, double nanval) { - int in_elements = in.strides[3] * in.dims[3]; + int in_elements = in.dims[0] * in.dims[1] * in.dims[2] * in.dims[3]; // FIXME: Use better heuristics to get to the optimum number if (in_elements > 4096) { diff --git a/src/backend/opencl/kernel/ireduce.hpp b/src/backend/opencl/kernel/ireduce.hpp index 5fa0fa6952..8bc06f2f5f 100644 --- a/src/backend/opencl/kernel/ireduce.hpp +++ b/src/backend/opencl/kernel/ireduce.hpp @@ -332,7 +332,7 @@ namespace kernel T ireduce_all(uint *loc, Param in) { try { - int in_elements = in.info.dims[3] * in.info.strides[3]; + int in_elements = in.info.dims[0] * in.info.dims[1] * in.info.dims[2] * in.info.dims[3]; // FIXME: Use better heuristics to get to the optimum number if (in_elements > 4096) { @@ -397,7 +397,9 @@ namespace kernel unique_ptr h_ptr(new T[in_elements]); T* h_ptr_raw = h_ptr.get(); - getQueue().enqueueReadBuffer(*in.data, CL_TRUE, 0, sizeof(T) * in_elements, h_ptr_raw); + + getQueue().enqueueReadBuffer(*in.data, CL_TRUE, sizeof(T) * in.info.offset, + sizeof(T) * in_elements, h_ptr_raw); MinMaxOp Op(h_ptr_raw[0], 0); diff --git a/src/backend/opencl/kernel/reduce.hpp b/src/backend/opencl/kernel/reduce.hpp index 5ac55b58da..e46d5e41ab 100644 --- a/src/backend/opencl/kernel/reduce.hpp +++ b/src/backend/opencl/kernel/reduce.hpp @@ -287,7 +287,7 @@ namespace kernel To reduce_all(Param in, int change_nan, double nanval) { try { - int in_elements = in.info.dims[3] * in.info.strides[3]; + int in_elements = in.info.dims[0] * in.info.dims[1] * in.info.dims[2] * in.info.dims[3]; // FIXME: Use better heuristics to get to the optimum number if (in_elements > 4096) { @@ -342,7 +342,8 @@ namespace kernel } else { unique_ptr h_ptr(new Ti[in_elements]); - getQueue().enqueueReadBuffer(*in.data, CL_TRUE, 0, sizeof(Ti) * in_elements, h_ptr.get()); + getQueue().enqueueReadBuffer(*in.data, CL_TRUE, sizeof(Ti) * in.info.offset, + sizeof(Ti) * in_elements, h_ptr.get()); Transform transform; Binary reduce; diff --git a/test/ireduce.cpp b/test/ireduce.cpp index 18461c5ee5..17abfcf8e4 100644 --- a/test/ireduce.cpp +++ b/test/ireduce.cpp @@ -12,6 +12,7 @@ #include #include #include +#include using namespace std; using namespace af; @@ -98,3 +99,47 @@ MINMAXOP(max, int) MINMAXOP(max, uint) MINMAXOP(max, char) MINMAXOP(max, uchar) + +TEST(ImaxAll, IndexedSmall) +{ + const int num = 1000; + const int st = 10; + const int en = num - 100; + af::array a = af::randu(num); + + float b; + unsigned idx; + af::max(&b, &idx, a(af::seq(st, en))); + + std::vector ha(num); + a.host(&ha[0]); + + float res = ha[st]; + for (int i = st; i <= en; i++) { + res = std::max(res, ha[i]); + } + + ASSERT_EQ(b, res); +} + +TEST(ImaxAll, IndexedBig) +{ + const int num = 100000; + const int st = 1000; + const int en = num - 1000; + af::array a = af::randu(num); + + float b; + unsigned idx; + af::max(&b, &idx, a(af::seq(st, en))); + + std::vector ha(num); + a.host(&ha[0]); + + float res = ha[st]; + for (int i = st; i <= en; i++) { + res = std::max(res, ha[i]); + } + + ASSERT_EQ(b, res); +} diff --git a/test/reduce.cpp b/test/reduce.cpp index ad99430e17..000f1ea961 100644 --- a/test/reduce.cpp +++ b/test/reduce.cpp @@ -15,6 +15,7 @@ #include #include #include +#include using std::vector; using std::string; @@ -522,3 +523,41 @@ TEST(AnyAll, NaN) ASSERT_EQ(af::anyTrue(A), true); ASSERT_EQ(af::allTrue(A), false); } + +TEST(MaxAll, IndexedSmall) +{ + const int num = 1000; + const int st = 10; + const int en = num - 100; + af::array a = af::randu(num); + float b = af::max(a(af::seq(st, en))); + + std::vector ha(num); + a.host(&ha[0]); + + float res = ha[st]; + for (int i = st; i <= en; i++) { + res = std::max(res, ha[i]); + } + + ASSERT_EQ(b, res); +} + +TEST(MaxAll, IndexedBig) +{ + const int num = 100000; + const int st = 1000; + const int en = num - 1000; + af::array a = af::randu(num); + float b = af::max(a(af::seq(st, en))); + + std::vector ha(num); + a.host(&ha[0]); + + float res = ha[st]; + for (int i = st; i <= en; i++) { + res = std::max(res, ha[i]); + } + + ASSERT_EQ(b, res); +}