feiyunwill
diff --git a/‎src/api/c/binary.cpp‎
Lines changed: 53 additions & 33 deletions b/‎src/api/c/binary.cpp‎
Lines changed: 53 additions & 33 deletions
diff --git a/‎src/backend/cpu/kernel/sparse_arith.hpp‎
Lines changed: 101 additions & 0 deletions b/‎src/backend/cpu/kernel/sparse_arith.hpp‎
Lines changed: 101 additions & 0 deletions
diff --git a/‎src/backend/cpu/sparse_arith.cpp‎
Lines changed: 49 additions & 7 deletions b/‎src/backend/cpu/sparse_arith.cpp‎
Lines changed: 49 additions & 7 deletions
diff --git a/‎src/backend/cpu/sparse_arith.hpp‎
Lines changed: 5 additions & 1 deletion b/‎src/backend/cpu/sparse_arith.hpp‎
Lines changed: 5 additions & 1 deletion
@@ -35,6 +35,14 @@ static inline af_array arithOp(const af_array lhs, const af_array rhs,
     return res;
 }
 
+template<typename T, af_op_t op>  // applies `op` to two sparse array handles, element type T
+static inline
+af_array sparseArithOp(const af_array lhs, const af_array rhs)
+{  // both handles are unwrapped with the same T before arithOp<T, op> is called
+    auto res = arithOp<T, op>(getSparseArray<T>(lhs), getSparseArray<T>(rhs));
+    return getHandle(res);  // wrap the result back into an af_array handle
+}
+
 template<typename T, af_op_t op>
 static inline af_array arithSparseDenseOp(const af_array lhs, const af_array rhs,
                                           const bool reverse)
@@ -80,10 +88,11 @@ static af_err af_arith(af_array *out, const af_array lhs, const af_array rhs, co
 }
 
 template<af_op_t op>
-static af_err af_arith_real(af_array *out, const af_array lhs, const af_array rhs, const bool batchMode)
+static
+af_err af_arith_real(af_array *out, const af_array lhs, const af_array rhs,
+                     const bool batchMode)
 {
     try {
-
         const ArrayInfo& linfo = getInfo(lhs);
         const ArrayInfo& rinfo = getInfo(rhs);
 
@@ -111,38 +120,41 @@ static af_err af_arith_real(af_array *out, const af_array lhs, const af_array rh
     return AF_SUCCESS;
 }
 
-//template<af_op_t op>
-//static af_err af_arith_sparse(af_array *out, const af_array lhs, const af_array rhs)
-//{
-//    try {
-//        SparseArrayBase linfo = getSparseArrayBase(lhs);
-//        SparseArrayBase rinfo = getSparseArrayBase(rhs);
-//
-//        dim4 odims = getOutDims(linfo.dims(), rinfo.dims(), batchMode);
-//
-//        const af_dtype otype = implicit(linfo.getType(), rinfo.getType());
-//        af_array res;
-//        switch (otype) {
-//        case f32: res = arithOp<float  , op>(lhs, rhs, odims); break;
-//        case f64: res = arithOp<double , op>(lhs, rhs, odims); break;
-//        case c32: res = arithOp<cfloat , op>(lhs, rhs, odims); break;
-//        case c64: res = arithOp<cdouble, op>(lhs, rhs, odims); break;
-//        default: TYPE_ERROR(0, otype);
-//        }
-//
-//        std::swap(*out, res);
-//    }
-//    CATCHALL;
-//    return AF_SUCCESS;
-//}
+template<af_op_t op>  // type-dispatching entry point for sparse (op) sparse arithmetic
+static af_err
+af_arith_sparse(af_array *out, const af_array lhs, const af_array rhs)
+{
+    try {
+        common::SparseArrayBase linfo = getSparseArrayBase(lhs);
+        common::SparseArrayBase rinfo = getSparseArrayBase(rhs);
+
+        ARG_ASSERT(1, (linfo.getStorage()==rinfo.getStorage()));  // same storage format on both sides
+        ARG_ASSERT(1, (linfo.dims()==rinfo.dims()));  // identical dimensions required
+        ARG_ASSERT(1, (linfo.getStorage()==AF_STORAGE_CSR));  // only CSR is accepted
+
+        const af_dtype otype = implicit(linfo.getType(), rinfo.getType());  // result type from both operand types
+        af_array res;  // set by each typed case below
+        switch (otype) {
+            case f32: res = sparseArithOp<float  , op>(lhs, rhs); break;
+            case f64: res = sparseArithOp<double , op>(lhs, rhs); break;
+            case c32: res = sparseArithOp<cfloat , op>(lhs, rhs); break;
+            case c64: res = sparseArithOp<cdouble, op>(lhs, rhs); break;
+            default: TYPE_ERROR(0, otype);  // any other result type goes through TYPE_ERROR
+        }
+
+        std::swap(*out, res);  // hand the result handle to the caller through *out
+    }
+    CATCHALL;
+    return AF_SUCCESS;
+}
 
 template<af_op_t op>
 static af_err af_arith_sparse_dense(af_array *out, const af_array lhs, const af_array rhs,
                                     const bool reverse = false)
 {
     using namespace common;
     try {
-        SparseArrayBase linfo = getSparseArrayBase(lhs);
+        common::SparseArrayBase linfo = getSparseArrayBase(lhs);
         ArrayInfo       rinfo = getInfo(rhs);
 
         const af_dtype otype = implicit(linfo.getType(), rinfo.getType());
@@ -161,18 +173,20 @@ static af_err af_arith_sparse_dense(af_array *out, const af_array lhs, const af_
     return AF_SUCCESS;
 }
 
-af_err af_add(af_array *out, const af_array lhs, const af_array rhs, const bool batchMode)
+af_err af_add(af_array *out, const af_array lhs, const af_array rhs,
+              const bool batchMode)
 {
     // Check if inputs are sparse
     ArrayInfo linfo = getInfo(lhs, false, true);
     ArrayInfo rinfo = getInfo(rhs, false, true);
 
     if(linfo.isSparse() && rinfo.isSparse()) {
-        return AF_ERR_NOT_SUPPORTED; //af_arith_sparse<af_add_t>(out, lhs, rhs);
+        return af_arith_sparse<af_add_t>(out, lhs, rhs);
     } else if(linfo.isSparse() && !rinfo.isSparse()) {
         return af_arith_sparse_dense<af_add_t>(out, lhs, rhs);
     } else if(!linfo.isSparse() && rinfo.isSparse()) {
-        return af_arith_sparse_dense<af_add_t>(out, rhs, lhs, true); // dense should be rhs
+        // second operand(Array) of af_arith_sparse_dense call should be dense
+        return af_arith_sparse_dense<af_add_t>(out, rhs, lhs, true);
     } else {
         return af_arith<af_add_t>(out, lhs, rhs, batchMode);
     }
@@ -185,7 +199,10 @@ af_err af_mul(af_array *out, const af_array lhs, const af_array rhs, const bool
     ArrayInfo rinfo = getInfo(rhs, false, true);
 
     if(linfo.isSparse() && rinfo.isSparse()) {
-        return AF_ERR_NOT_SUPPORTED; //af_arith_sparse<af_mul_t>(out, lhs, rhs);
+        //return af_arith_sparse<af_mul_t>(out, lhs, rhs);
+        // MKL has no sparse-sparse mul or div support yet, hence the call
+        // above stays commented out although alternative CPU code exists
+        return AF_ERR_NOT_SUPPORTED;
     } else if(linfo.isSparse() && !rinfo.isSparse()) {
         return af_arith_sparse_dense<af_mul_t>(out, lhs, rhs);
     } else if(!linfo.isSparse() && rinfo.isSparse()) {
@@ -202,7 +219,7 @@ af_err af_sub(af_array *out, const af_array lhs, const af_array rhs, const bool
     ArrayInfo rinfo = getInfo(rhs, false, true);
 
     if(linfo.isSparse() && rinfo.isSparse()) {
-        return AF_ERR_NOT_SUPPORTED; //af_arith_sparse<af_sub_t>(out, lhs, rhs);
+        return af_arith_sparse<af_sub_t>(out, lhs, rhs);
     } else if(linfo.isSparse() && !rinfo.isSparse()) {
         return af_arith_sparse_dense<af_sub_t>(out, lhs, rhs);
     } else if(!linfo.isSparse() && rinfo.isSparse()) {
@@ -219,7 +236,10 @@ af_err af_div(af_array *out, const af_array lhs, const af_array rhs, const bool
     ArrayInfo rinfo = getInfo(rhs, false, true);
 
     if(linfo.isSparse() && rinfo.isSparse()) {
-        return AF_ERR_NOT_SUPPORTED; //af_arith_sparse<af_div_t>(out, lhs, rhs);
+        //return af_arith_sparse<af_div_t>(out, lhs, rhs);
+        // MKL has no sparse-sparse mul or div support yet, hence the call
+        // above stays commented out although alternative CPU code exists
+        return AF_ERR_NOT_SUPPORTED;
     } else if(linfo.isSparse() && !rinfo.isSparse()) {
         return af_arith_sparse_dense<af_div_t>(out, lhs, rhs);
     } else if(!linfo.isSparse() && rinfo.isSparse()) {
 
@@ -11,6 +11,8 @@
 #include <Param.hpp>
 #include <math.hpp>
 
+#include <cmath>
+
 namespace cpu
 {
 namespace kernel
@@ -143,5 +145,104 @@ void sparseArithOpS(Param<T> values, Param<int> rowIdx, Param<int> colIdx,
     }
 }
 
+// The following functions can handle CSR
+// storage format only as of now.
+static  // first pass: computes the output row offsets for an element-wise op on two CSR inputs
+void calcOutNNZ(Param<int> outRowIdx,  // [out] M+1 row offsets of the result
+                const uint M, const uint N,  // M: number of rows; N is not used in this function
+                CParam<int> lRowIdx, CParam<int> lColIdx,  // lhs row offsets / column indices
+                CParam<int> rRowIdx, CParam<int> rColIdx)  // rhs row offsets / column indices
+{
+          int *orPtr = outRowIdx.get();
+    const int *lrPtr = lRowIdx.get();
+    const int *lcPtr = lColIdx.get();
+    const int *rrPtr = rRowIdx.get();
+    const int *rcPtr = rColIdx.get();
+
+    unsigned csrOutCount = 0;  // total output entries in the rows processed so far
+    for (uint row=0; row<M; ++row) {
+        const int lEnd = lrPtr[row+1];  // one past the last lhs entry of this row
+        const int rEnd = rrPtr[row+1];  // one past the last rhs entry of this row
+
+        uint rowNNZ = 0;
+        int l = lrPtr[row];
+        int r = rrPtr[row];
+        while (l < lEnd && r < rEnd) {  // NOTE(review): assumes ascending column indices per row - confirm
+            int lci = lcPtr[l];
+            int rci = rcPtr[r];
+
+            l += (lci <= rci);  // advance lhs unless its column is ahead of rhs
+            r += (lci >= rci);  // advance rhs likewise; equal columns advance both
+            rowNNZ++;  // one output entry per step, so a shared column counts once
+        }
+        // Elements from lhs or rhs are exhausted.
+        // Just count left over elements
+        rowNNZ += (lEnd-l);
+        rowNNZ += (rEnd-r);
+
+        orPtr[row] = csrOutCount;  // offset at which this row starts in the output
+        csrOutCount += rowNNZ;
+    }
+    //Write out the Rows+1 entry
+    orPtr[M] = csrOutCount;
+}
+
+template<typename T, af_op_t op>  // second pass: writes result values and column indices row by row
+void sparseArithOp(Param<T> oVals, Param<int> oColIdx,  // [out] result values and column indices
+                   CParam<int> oRowIdx, const uint Rows,  // precomputed output row offsets; row count
+                   CParam<T> lvals, CParam<int> lRowIdx, CParam<int> lColIdx,  // lhs values / row offsets / columns
+                   CParam<T> rvals, CParam<int> rRowIdx, CParam<int> rColIdx)  // rhs values / row offsets / columns
+{
+    const int *orPtr = oRowIdx.get();
+    const   T *lvPtr = lvals.get();
+    const int *lrPtr = lRowIdx.get();
+    const int *lcPtr = lColIdx.get();
+    const   T *rvPtr = rvals.get();
+    const int *rrPtr = rRowIdx.get();
+    const int *rcPtr = rColIdx.get();
+
+    arith_op<T, op> binOp;  // invoked below as binOp(lhs, rhs)
+
+    auto ZERO = scalar<T>(0);  // stands in for an operand with no entry at a column
+
+    for (uint row=0; row<Rows; ++row) {
+        const int lEnd = lrPtr[row+1];
+        const int rEnd = rrPtr[row+1];
+        const int offs = orPtr[row];  // where this row starts in the output arrays
+
+          T *ovPtr = oVals.get() + offs;
+        int *ocPtr = oColIdx.get() + offs;
+
+        uint rowNNZ = 0;  // entries written so far in this row
+        int l = lrPtr[row];
+        int r = rrPtr[row];
+        while (l < lEnd && r < rEnd) {  // NOTE(review): assumes ascending column indices per row - confirm
+            int lci = lcPtr[l];
+            int rci = rcPtr[r];
+
+            T lhs = (lci <= rci ? lvPtr[l] : ZERO);  // lhs contributes unless its column is ahead
+            T rhs = (lci >= rci ? rvPtr[r] : ZERO);  // rhs contributes unless its column is ahead
+
+            ovPtr[ rowNNZ ] = binOp(lhs, rhs);
+            ocPtr[ rowNNZ ] = (lci <= rci) ? lci : rci;  // the smaller of the two columns is emitted
+
+            l += (lci <= rci);
+            r += (lci >= rci);
+            rowNNZ++;
+        }
+        while (l < lEnd) {  // rhs row exhausted: remaining lhs entries are paired with ZERO
+            ovPtr[ rowNNZ ] = binOp(lvPtr[l], ZERO);
+            ocPtr[ rowNNZ ] = lcPtr[l];
+            l++;
+            rowNNZ++;
+        }
+        while (r < rEnd) {  // lhs row exhausted: remaining rhs entries are paired with ZERO
+            ovPtr[ rowNNZ ] = binOp(ZERO, rvPtr[r]);
+            ocPtr[ rowNNZ ] = rcPtr[r];
+            r++;
+            rowNNZ++;
+        }
+    }
+}
 }
 }
@@ -9,14 +9,8 @@
 
 #include <sparse_arith.hpp>
 #include <common/SparseArray.hpp>
-#include <optypes.hpp>
 #include <sparse.hpp>
-
-#include <kernel/sparse_arith.hpp>
-
-#include <stdexcept>
-#include <string>
-
+#include <optypes.hpp>
 #include <af/dim4.hpp>
 #include <arith.hpp>
 #include <complex.hpp>
@@ -26,6 +20,13 @@
 #include <platform.hpp>
 #include <queue.hpp>
 
+#include <kernel/sparse_arith.hpp>
+
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
 namespace cpu
 {
 
@@ -115,6 +116,39 @@ SparseArray<T> arithOpS(const SparseArray<T> &lhs, const Array<T> &rhs, const bo
     return out;
 }
 
+template<typename T, af_op_t op>  // sparse (op) sparse: count output entries, allocate, then fill
+SparseArray<T> arithOp(const SparseArray<T> &lhs, const SparseArray<T> &rhs)
+{
+    af::storage sfmt = lhs.getStorage();  // the output is created with lhs's storage format
+
+    lhs.eval();
+    rhs.eval();
+
+    const dim4 dims = lhs.dims();  // output dims are taken from lhs
+    const uint M = dims[0];  // used as the row count by both kernels
+    const uint N = dims[1];
+
+    auto rowArr = createEmptyArray<int>(dim4(M+1));  // M+1 row offsets, filled by calcOutNNZ
+
+    getQueue().enqueue(kernel::calcOutNNZ, rowArr, M, N,
+                       lhs.getRowIdx(), lhs.getColIdx(),
+                       rhs.getRowIdx(), rhs.getColIdx());
+    getQueue().sync();  // sync before rowArr is read on the host below
+
+    uint nnz = rowArr.get()[M];  // last offset, i.e. the total written by calcOutNNZ
+    auto out = createEmptySparseArray<T>(dims, nnz, sfmt);
+    out.eval();
+
+    copyArray(out.getRowIdx(), rowArr);  // output row offsets come from the counting pass
+
+    getQueue().enqueue(kernel::sparseArithOp<T, op>,
+                       out.getValues(), out.getColIdx(),
+                       out.getRowIdx(), M,
+                       lhs.getValues(), lhs.getRowIdx(), lhs.getColIdx(),
+                       rhs.getValues(), rhs.getRowIdx(), rhs.getColIdx());
+    return out;  // NOTE(review): no sync after this enqueue - presumably later queue use orders it; confirm
+}
+
 #define INSTANTIATE(T)                                                                              \
     template Array<T> arithOpD<T, af_add_t>(const SparseArray<T> &lhs, const Array<T> &rhs,         \
                                             const bool reverse);                                    \
@@ -132,6 +166,14 @@ SparseArray<T> arithOpS(const SparseArray<T> &lhs, const Array<T> &rhs, const bo
                                                   const bool reverse);                              \
     template SparseArray<T> arithOpS<T, af_div_t>(const SparseArray<T> &lhs, const Array<T> &rhs,   \
                                                  const bool reverse);                               \
+    template SparseArray<T> arithOp<T, af_add_t>(const common::SparseArray<T> &lhs,                 \
+                                                 const common::SparseArray<T> &rhs);                \
+    template SparseArray<T> arithOp<T, af_sub_t>(const common::SparseArray<T> &lhs,                 \
+                                                 const common::SparseArray<T> &rhs);                \
+    template SparseArray<T> arithOp<T, af_mul_t>(const common::SparseArray<T> &lhs,                 \
+                                                 const common::SparseArray<T> &rhs);                \
+    template SparseArray<T> arithOp<T, af_div_t>(const common::SparseArray<T> &lhs,                 \
+                                                 const common::SparseArray<T> &rhs);
 
 INSTANTIATE(float  )
 INSTANTIATE(double )
 
@@ -7,14 +7,15 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
+
 #include <Array.hpp>
 #include <common/SparseArray.hpp>
 #include <sparse.hpp>
 #include <optypes.hpp>
 
 namespace cpu
 {
-
 // These two functions cannot be overloaded by return type.
 // So have to give them separate names.
 template<typename T, af_op_t op>
@@ -25,4 +26,7 @@ template<typename T, af_op_t op>
 common::SparseArray<T> arithOpS(const common::SparseArray<T> &lhs, const Array<T> &rhs,
                                 const bool reverse = false);
 
+template<typename T, af_op_t op>  // applies `op` to two sparse arrays and returns a sparse array
+common::SparseArray<T> arithOp(const common::SparseArray<T> &lhs,
+                               const common::SparseArray<T> &rhs);
 }