diff --git a/src/backend/cuda/kernel/fftconvolve.hpp b/src/backend/cuda/kernel/fftconvolve.hpp
index 2acf2db621..186fb117de 100644
--- a/src/backend/cuda/kernel/fftconvolve.hpp
+++ b/src/backend/cuda/kernel/fftconvolve.hpp
@@ -262,8 +262,13 @@ void packDataHelper(Param<convT> sig_packed,
 {
     dim_t *sd = sig.dims;
 
-    int sig_packed_elem = sig_packed.strides[3] * sig_packed.dims[3];
-    int filter_packed_elem = filter_packed.strides[3] * filter_packed.dims[3];
+    int sig_packed_elem = 1;
+    int filter_packed_elem = 1;
+
+    for (int i = 0; i < 4; i++) {
+        sig_packed_elem *= sig_packed.dims[i];
+        filter_packed_elem *= filter_packed.dims[i];
+    }
 
     // Number of packed complex elements in dimension 0
     int sig_half_d0 = divup(sd[0], 2);
@@ -292,8 +297,13 @@ void complexMultiplyHelper(Param<T> out,
                            CParam<T> filter,
                            ConvolveBatchKind kind)
 {
-    int sig_packed_elem = sig_packed.strides[3] * sig_packed.dims[3];
-    int filter_packed_elem = filter_packed.strides[3] * filter_packed.dims[3];
+    int sig_packed_elem = 1;
+    int filter_packed_elem = 1;
+
+    for (int i = 0; i < 4; i++) {
+        sig_packed_elem *= sig_packed.dims[i];
+        filter_packed_elem *= filter_packed.dims[i];
+    }
 
     dim3 threads(THREADS);
     dim3 blocks(divup(sig_packed_elem / 2, threads.x));
diff --git a/src/backend/cuda/kernel/ireduce.hpp b/src/backend/cuda/kernel/ireduce.hpp
index 4f9cac5baf..4a2ab14777 100644
--- a/src/backend/cuda/kernel/ireduce.hpp
+++ b/src/backend/cuda/kernel/ireduce.hpp
@@ -444,7 +444,7 @@ namespace kernel
     template<typename T, af_op_t op>
     T ireduce_all(uint *idx, CParam<T> in)
     {
-        int in_elements = in.strides[3] * in.dims[3];
+        int in_elements = in.dims[0] * in.dims[1] * in.dims[2] * in.dims[3];
 
         // FIXME: Use better heuristics to get to the optimum number
         if (in_elements > 4096) {
diff --git a/src/backend/cuda/kernel/reduce.hpp b/src/backend/cuda/kernel/reduce.hpp
index be52375525..8cc720fb69 100644
--- a/src/backend/cuda/kernel/reduce.hpp
+++ b/src/backend/cuda/kernel/reduce.hpp
@@ -371,7 +371,7 @@ namespace kernel
     template<typename Ti, typename To, af_op_t op>
     To reduce_all(CParam<Ti> in, bool change_nan, double nanval)
     {
-        int in_elements = in.strides[3] * in.dims[3];
+        int in_elements = in.dims[0] * in.dims[1] * in.dims[2] * in.dims[3];
 
         // FIXME: Use better heuristics to get to the optimum number
         if (in_elements > 4096) {
diff --git a/src/backend/opencl/kernel/ireduce.hpp b/src/backend/opencl/kernel/ireduce.hpp
index 5fa0fa6952..8bc06f2f5f 100644
--- a/src/backend/opencl/kernel/ireduce.hpp
+++ b/src/backend/opencl/kernel/ireduce.hpp
@@ -332,7 +332,7 @@ namespace kernel
     T ireduce_all(uint *loc, Param in)
     {
         try {
-            int in_elements = in.info.dims[3] * in.info.strides[3];
+            int in_elements = in.info.dims[0] * in.info.dims[1] * in.info.dims[2] * in.info.dims[3];
 
             // FIXME: Use better heuristics to get to the optimum number
             if (in_elements > 4096) {
@@ -397,7 +397,9 @@ namespace kernel
 
                 unique_ptr<T> h_ptr(new T[in_elements]);
                 T* h_ptr_raw = h_ptr.get();
-                getQueue().enqueueReadBuffer(*in.data, CL_TRUE, 0, sizeof(T) * in_elements, h_ptr_raw);
+
+                getQueue().enqueueReadBuffer(*in.data, CL_TRUE, sizeof(T) * in.info.offset,
+                                             sizeof(T) * in_elements, h_ptr_raw);
 
 
                 MinMaxOp<op, T> Op(h_ptr_raw[0], 0);
diff --git a/src/backend/opencl/kernel/reduce.hpp b/src/backend/opencl/kernel/reduce.hpp
index 5ac55b58da..e46d5e41ab 100644
--- a/src/backend/opencl/kernel/reduce.hpp
+++ b/src/backend/opencl/kernel/reduce.hpp
@@ -287,7 +287,7 @@ namespace kernel
     To reduce_all(Param in, int change_nan, double nanval)
     {
         try {
-            int in_elements = in.info.dims[3] * in.info.strides[3];
+            int in_elements = in.info.dims[0] * in.info.dims[1] * in.info.dims[2] * in.info.dims[3];
 
             // FIXME: Use better heuristics to get to the optimum number
             if (in_elements > 4096) {
@@ -342,7 +342,8 @@ namespace kernel
             } else {
 
                 unique_ptr<Ti> h_ptr(new Ti[in_elements]);
-                getQueue().enqueueReadBuffer(*in.data, CL_TRUE, 0, sizeof(Ti) * in_elements, h_ptr.get());
+                getQueue().enqueueReadBuffer(*in.data, CL_TRUE, sizeof(Ti) * in.info.offset,
+                                             sizeof(Ti) * in_elements, h_ptr.get());
 
                 Transform<Ti, To, op> transform;
                 Binary<To, op> reduce;
diff --git a/test/ireduce.cpp b/test/ireduce.cpp
index 18461c5ee5..17abfcf8e4 100644
--- a/test/ireduce.cpp
+++ b/test/ireduce.cpp
@@ -12,6 +12,7 @@
 #include <af/arith.h>
 #include <af/data.h>
 #include <testHelpers.hpp>
+#include <algorithm>
 
 using namespace std;
 using namespace af;
@@ -98,3 +99,47 @@ MINMAXOP(max, int)
 MINMAXOP(max, uint)
 MINMAXOP(max, char)
 MINMAXOP(max, uchar)
+
+// Value-and-index max over an indexed sub-range of a 1000-element array.
+TEST(ImaxAll, IndexedSmall)
+{
+    const int num = 1000;
+    const int st = 10;
+    const int en = num - 100;
+    af::array a = af::randu(num);
+    float b = 0;
+    unsigned idx = 0;
+    af::max<float>(&b, &idx, a(af::seq(st, en)));
+
+    std::vector<float> ha(num);
+    a.host(&ha[0]);
+    float res = ha[st];
+    for (int i = st; i <= en; i++) {
+        res = std::max(res, ha[i]);
+    }
+    ASSERT_EQ(b, res);
+    ASSERT_LE(idx, static_cast<unsigned>(en - st));  // idx is relative to the sub-range
+    ASSERT_EQ(ha[st + idx], res);  // compare values so a tie cannot fail
+}
+
+// Value-and-index max over an indexed sub-range of a 100000-element array.
+TEST(ImaxAll, IndexedBig)
+{
+    const int num = 100000;
+    const int st = 1000;
+    const int en = num - 1000;
+    af::array a = af::randu(num);
+    float b = 0;
+    unsigned idx = 0;
+    af::max<float>(&b, &idx, a(af::seq(st, en)));
+
+    std::vector<float> ha(num);
+    a.host(&ha[0]);
+    float res = ha[st];
+    for (int i = st; i <= en; i++) {
+        res = std::max(res, ha[i]);
+    }
+    ASSERT_EQ(b, res);
+    ASSERT_LE(idx, static_cast<unsigned>(en - st));  // idx is relative to the sub-range
+    ASSERT_EQ(ha[st + idx], res);  // compare values so a tie cannot fail
+}
diff --git a/test/reduce.cpp b/test/reduce.cpp
index ad99430e17..000f1ea961 100644
--- a/test/reduce.cpp
+++ b/test/reduce.cpp
@@ -15,6 +15,7 @@
 #include <iostream>
 #include <string>
 #include <testHelpers.hpp>
+#include <algorithm>
 
 using std::vector;
 using std::string;
@@ -522,3 +523,41 @@ TEST(AnyAll, NaN)
     ASSERT_EQ(af::anyTrue<bool>(A), true);
     ASSERT_EQ(af::allTrue<bool>(A), false);
 }
+
+// Checks the all-elements max of an indexed sub-range against a host-side scan.
+TEST(MaxAll, IndexedSmall)
+{
+    const int total = 1000;
+    const int first = 10;
+    const int last = total - 100;
+
+    af::array data = af::randu(total);
+    const float b = af::max<float>(data(af::seq(first, last)));
+
+    // Host copy of the full array; the reference covers [first, last] inclusive.
+    std::vector<float> host(total);
+    data.host(&host[0]);
+    const float res =
+        *std::max_element(host.begin() + first, host.begin() + last + 1);
+
+    ASSERT_EQ(b, res);
+}
+
+// Checks the all-elements max of an indexed sub-range against a host-side scan.
+TEST(MaxAll, IndexedBig)
+{
+    const int total = 100000;
+    const int first = 1000;
+    const int last = total - 1000;
+
+    af::array data = af::randu(total);
+    const float b = af::max<float>(data(af::seq(first, last)));
+
+    // Host copy of the full array; the reference covers [first, last] inclusive.
+    std::vector<float> host(total);
+    data.host(&host[0]);
+    const float res =
+        *std::max_element(host.begin() + first, host.begin() + last + 1);
+
+    ASSERT_EQ(b, res);
+}