Skip to content

Commit b1dc770

Browse files
committed
trunk: changes to Dan's neural net setup, with new preconditioning method (speed roughly doubled if you use train_pnorm_online.sh, which uses the new preconditioning method). Various bug-fixes, optimizations and cleanups in matrix code, cuda-matrix code and thread code. Still tuning this, so recipes are not checked in yet.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4077 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
1 parent b03ef02 commit b1dc770

36 files changed

+1985
-725
lines changed

src/cudamatrix/cu-kernels-ansi.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,6 @@ void cudaF_vec_min(const float* v, float* value, int dim);
9999
void cudaF_vec_max(const float* v, float* value, int dim);
100100
void cudaF_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
101101
void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
102-
void cudaF_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
103-
void cudaF_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
104102
void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M,
105103
int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride,
106104
int N_col_stride, int threads_per_element, float beta);
@@ -229,8 +227,6 @@ void cudaD_vec_min(const double* v, double* value, int dim);
229227
void cudaD_vec_max(const double* v, double* value, int dim);
230228
void cudaD_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
231229
void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
232-
void cudaD_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
233-
void cudaD_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
234230
void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
235231
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
236232
int N_col_stride, int threads_per_element, double beta);

src/cudamatrix/cu-kernels.cu

Lines changed: 5 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,8 @@ static void _add_diag_vec_mat(Real alpha, Real *mat, MatrixDim mat_dim,
178178
// Note from Dan: in this kernel, we make the x dimension correspond to the
179179
// row index and y to the column index. That was not always the case for
180180
// earlier kernels written by others.
181-
int i = blockIdx.x * blockDim.x + threadIdx.x; // row index
182-
int j = blockIdx.y * blockDim.y + threadIdx.y; // column index
181+
int i = blockIdx.y * blockDim.y + threadIdx.y; // row index
182+
int j = blockIdx.x * blockDim.x + threadIdx.x; // column index
183183

184184
int index = i * mat_dim.stride + j,
185185
index2 = i * mat2_row_stride + j * mat2_col_stride;
@@ -809,47 +809,13 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, int
809809
}
810810

811811

812-
template<typename Real>
813-
__global__
814-
static void _add_diag_mat(Real alpha, Real* v, const Real* mat, Real beta, MatrixDim dmat, int dim) {
815-
int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
816-
817-
if (i < dim) {
818-
Real sum = 0.0;
819-
for (int32_cuda j = 0; j < dmat.cols; j++) {
820-
int32_cuda index = j + i * dmat.stride;
821-
sum += mat[index] * mat[index];
822-
}
823-
v[i] = beta * v[i] + alpha * sum;
824-
}
825-
}
826-
827-
828-
template<typename Real>
829-
__global__
830-
static void _add_diag_mat_trans(Real alpha, Real* v, const Real* mat, Real beta, MatrixDim dmat, int dim) {
831-
int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
832-
// if (blockIdx.y > 0) return;
833-
834-
if (i < dim) {
835-
Real sum = 0.0;
836-
for (int32_cuda j = 0; j < dmat.rows; j++) {
837-
int32_cuda index = i + j * dmat.stride;
838-
sum += mat[index] * mat[index];
839-
}
840-
v[i] = beta * v[i] + alpha * sum;
841-
}
842-
}
843-
844812
// Adds diag(M N) to v, where M and N are matrices. We supply row_stride and
845813
// col_stride arguments for M and N, and swapping them allows us to transpose
846814
// those matrices. Note: we imagine row-major indexing here, just like Kaldi
847815
// and CBLAS (but unlike CUBLAS).
848816
// This kernel expects the blockDim to be (CU1DBLOCK, 1) and the
849-
// gridDim times CU1DBLOCK to be at least num-rows-of-v, but if the gridDim
850-
// times CU1DBLOCK is larger than that, it will make good use of the
851-
// extra threads. Note: for best efficiency, the gridDim should be approximately
852-
// (num-rows-of-v / CU1DBLOCK) times a power of 2.
817+
// gridDim times CU1DBLOCK to be at least num-rows-of-v * threads_per_element.
818+
// threads_per_element should be a power of 2.
853819
template<typename Real>
854820
__global__
855821
static void _add_diag_mat_mat(
@@ -862,7 +828,7 @@ static void _add_diag_mat_mat(
862828
__shared__ Real temp_data[CU1DBLOCK];
863829

864830
int i = blockIdx.x * blockDim.x + threadIdx.x;
865-
int v_idx = i / threads_per_element, // v_ids is the index into v that we are supposed to
831+
int v_idx = i / threads_per_element, // v_idx is the index into v that we are supposed to
866832
sub_idx = i % threads_per_element; // add to; 0 <= sub_idx < threads_per_element tells
867833
// us which block of elements we sum up.
868834
if (v_idx >= v_dim) return;
@@ -2150,13 +2116,6 @@ void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_str
21502116
_trace_mat_mat<float,2> <<<2,CU1DBLOCK>>>(A,B,dA,B_stride,value);
21512117
}
21522118

2153-
void cudaF_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) {
2154-
_add_diag_mat_trans<<<Gr,Bl>>>(alpha,v,mat,beta,dmat,dim);
2155-
}
2156-
2157-
void cudaF_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) {
2158-
_add_diag_mat<<<Gr,Bl>>>(alpha,v,mat,beta,dmat,dim);
2159-
}
21602119

21612120
void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M,
21622121
int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride,
@@ -2571,14 +2530,6 @@ void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_s
25712530
_trace_mat_mat<double,2> <<<2,CU1DBLOCK>>>(A,B,dA,B_stride,value);
25722531
}
25732532

2574-
void cudaD_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) {
2575-
_add_diag_mat_trans<<<Gr,Bl>>>(alpha,v,mat,beta,dmat,dim);
2576-
}
2577-
2578-
void cudaD_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) {
2579-
_add_diag_mat<<<Gr,Bl>>>(alpha,v,mat,beta,dmat,dim);
2580-
}
2581-
25822533
void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
25832534
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
25842535
int N_col_stride, int threads_per_element, double beta) {

src/cudamatrix/cu-kernels.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -141,14 +141,12 @@ inline void cuda_vec_min(const float* v, float* value, int dim) { cudaF_vec_min(
141141
inline void cuda_vec_max(const float* v, float* value, int dim) { cudaF_vec_max(v,value,dim); }
142142
inline void cuda_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat_trans(A,B,dA,B_stride,value); }
143143
inline void cuda_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat(A,B,dA,B_stride,value); }
144-
inline void cuda_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) { cudaF_add_diag_mat_trans(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
145144
inline void cuda_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M,
146145
int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride,
147146
int N_col_stride, int threads_per_element, float beta) {
148147
cudaF_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride,
149148
N_col_stride, threads_per_element, beta);
150149
}
151-
inline void cuda_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) { cudaF_add_diag_mat(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
152150
inline void cuda_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim) { cudaF_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); }
153151
inline void cuda_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); }
154152
inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); }
@@ -311,14 +309,12 @@ inline void cuda_vec_min(const double* v, double* value, int dim) { cudaD_vec_mi
311309
inline void cuda_vec_max(const double* v, double* value, int dim) { cudaD_vec_max(v,value,dim); }
312310
inline void cuda_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat_trans(A,B,dA,B_stride,value); }
313311
inline void cuda_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat(A,B,dA,B_stride,value); }
314-
inline void cuda_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat_trans(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
315312
inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
316313
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
317314
int N_col_stride, int threads_per_element, double beta) {
318315
cudaD_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride,
319316
N_col_stride, threads_per_element, beta);
320317
}
321-
inline void cuda_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
322318
inline void cuda_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim) { cudaD_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); }
323319
inline void cuda_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); }
324320
inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); }

src/cudamatrix/cu-matrix-speed-test.cc

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,26 @@ template<typename Real> void TestCuMatrixMatMat(int32 dim) {
5757
<< dim << ", speed was " << gflops << " gigaflops.";
5858
}
5959

60+
template<typename Real> void TestCuMatrixAddDiagVecMat(int32 dim, MatrixTransposeType trans) {
61+
BaseFloat time_in_secs = 0.015;
62+
CuMatrix<Real> M(dim, dim), N(dim, dim);
63+
CuVector<Real> v(dim);
64+
M.SetRandn();
65+
v.SetRandn();
66+
Timer tim;
67+
int32 iter = 0;
68+
for (;tim.Elapsed() < time_in_secs; iter++)
69+
N.AddDiagVecMat(1.0, v, M, trans, 0.0);
70+
71+
BaseFloat fdim = dim;
72+
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
73+
KALDI_LOG << "For CuMatrix::AddDiagVecMat" << NameOf<Real>()
74+
<< (trans == kTrans ? "[trans]" : "[no-trans]")
75+
<< ", for dim = " << dim << ", speed was "
76+
<< gflops << " gigaflops.";
77+
}
78+
79+
6080

6181
template<typename Real> void TestSymInvertPosDef(int32 dim) {
6282
BaseFloat time_in_secs = 0.025;
@@ -222,7 +242,7 @@ template<typename Real> void TestCuMatrixMulRowsGroupMat(int32 dim) {
222242

223243
template<typename Real> void TestCuMatrixSoftmax(int32 dim) {
224244
BaseFloat time_in_secs = 0.025;
225-
CuMatrix<Real> M(256, dim), N(256, dim);
245+
CuMatrix<Real> M(dim, dim), N(dim, dim);
226246
M.SetRandn();
227247
N.SetRandn();
228248
Timer tim;
@@ -237,6 +257,42 @@ template<typename Real> void TestCuMatrixSoftmax(int32 dim) {
237257
<< dim << ", speed was " << gflops << " gigaflops.";
238258
}
239259

260+
261+
template<typename Real> void TestCuMatrixGroupPnorm(int32 dim) {
262+
BaseFloat time_in_secs = 0.025;
263+
int32 group_size = 4;
264+
CuMatrix<Real> M(dim, dim), N(dim, dim / group_size);
265+
M.SetRandn();
266+
Timer tim;
267+
int32 iter = 0;
268+
for (;tim.Elapsed() < time_in_secs; iter++)
269+
N.GroupPnorm(M, 2.0);
270+
271+
BaseFloat fdim = dim;
272+
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
273+
KALDI_LOG << "For CuMatrix::GroupPnorm" << NameOf<Real>() << ", for dim = "
274+
<< dim << ", speed was " << gflops << " gigaflops.";
275+
}
276+
277+
template<typename Real> void TestCuMatrixGroupPnormDeriv(int32 dim) {
278+
BaseFloat time_in_secs = 0.025;
279+
int32 group_size = 4;
280+
CuMatrix<Real> M(dim, dim), N(dim, dim / group_size), O(dim, dim);
281+
M.SetRandn();
282+
N.GroupPnorm(M, 2.0);
283+
Timer tim;
284+
int32 iter = 0;
285+
286+
for (;tim.Elapsed() < time_in_secs; iter++)
287+
O.GroupPnormDeriv(M, N, 2.0);
288+
289+
BaseFloat fdim = dim;
290+
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
291+
KALDI_LOG << "For CuMatrix::GroupPnormDeriv" << NameOf<Real>() << ", for dim = "
292+
<< dim << ", speed was " << gflops << " gigaflops.";
293+
}
294+
295+
240296
template<typename Real> void TestCuMatrixTraceMatMat(int32 dim) {
241297
for (int32 n = 0; n < 2; n++) {
242298
MatrixTransposeType trans = (n == 0 ? kNoTrans : kTrans);
@@ -388,6 +444,10 @@ template<typename Real> void CudaMatrixSpeedTest() {
388444
int32 ns = sizes.size();
389445
for (int32 s = 0; s < ns; s++)
390446
TestCuMatrixMatMat<Real>(sizes[s]);
447+
for (int32 s = 0; s < ns; s++) {
448+
TestCuMatrixAddDiagVecMat<Real>(sizes[s], kNoTrans);
449+
TestCuMatrixAddDiagVecMat<Real>(sizes[s], kTrans);
450+
}
391451
for (int32 s = 0; s < ns; s++)
392452
TestSymInvertPosDef<Real>(sizes[s]);
393453
for (int32 s = 0; s < ns; s++)
@@ -402,6 +462,10 @@ template<typename Real> void CudaMatrixSpeedTest() {
402462
TestCuMatrixMulRowsGroupMat<Real>(sizes[s]);
403463
for (int32 s = 0; s < ns; s++)
404464
TestCuMatrixSoftmax<Real>(sizes[s]);
465+
for (int32 s = 0; s < ns; s++)
466+
TestCuMatrixGroupPnorm<Real>(sizes[s]);
467+
for (int32 s = 0; s < ns; s++)
468+
TestCuMatrixGroupPnormDeriv<Real>(sizes[s]);
405469
for (int32 s = 0; s < ns; s++)
406470
TestCuMatrixTraceMatMat<Real>(sizes[s]);
407471
for (int32 s = 0; s < ns; s++)

src/cudamatrix/cu-matrix.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -988,13 +988,13 @@ void CuMatrixBase<Real>::AddDiagVecMat(
988988

989989
Timer tim;
990990
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
991-
// Caution, this dimGrid is not the same way around as much of the other
992-
// code: going forward, I want to use the (rows, cols) order.
993-
dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(num_cols_, CU2DBLOCK));
994991

995-
MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
996-
if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
992+
dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK),
993+
n_blocks(num_rows_, CU2DBLOCK));
997994

995+
MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
996+
if (transM == kTrans)
997+
std::swap(M_row_stride, M_col_stride);
998998
cuda_add_diag_vec_mat(dimGrid, dimBlock, alpha, data_, Dim(),
999999
v.Data(), M.Data(), M_row_stride, M_col_stride, beta);
10001000
CU_SAFE_CALL(cudaGetLastError());

src/cudamatrix/cu-matrix.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,8 @@ class CuMatrixBase {
119119
const CuMatrixBase<Real> &B,
120120
MatrixTransposeType trans);
121121

122+
/// Adds "value" to the diagonal elements of the matrix. The matrix
123+
/// *this does not have to be square.
122124
void AddToDiag(Real value);
123125

124126
/// Dimensions
@@ -183,6 +185,8 @@ class CuMatrixBase {
183185

184186
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ (power)) ^ (1 / p)
185187
/// where G = x.NumCols() / y.NumCols() must be an integer.
188+
/// [note: y corresponds to *this and x to src, so
189+
/// src.NumCols() / this->NumCols() must be an integer.]
186190
void GroupPnorm(const CuMatrixBase<Real> &src, Real pow);
187191

188192
/// Calculate derivatives for the GroupPnorm function above...

src/cudamatrix/cu-sp-matrix-speed-test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ std::string NameOf() {
4141

4242
template<typename Real>
4343
static void UnitTestCuSpMatrixInvert(int32 dim) {
44-
BaseFloat time_in_secs = 0.5;
44+
BaseFloat time_in_secs = 0.01;
4545
int32 iter = 0;
4646
Timer tim;
4747
CuSpMatrix<Real> A(dim);
@@ -82,7 +82,7 @@ static void UnitTestCuSpMatrixInvert(int32 dim) {
8282

8383
template<typename Real>
8484
static void UnitTestCuSpMatrixCopyFromMat(int32 dim, SpCopyType copy_type) {
85-
BaseFloat time_in_secs = 0.1;
85+
BaseFloat time_in_secs = 0.05;
8686
int32 iter = 0;
8787
Timer tim;
8888
CuMatrix<Real> A(dim, dim);

src/cudamatrix/cu-vector-speed-test.cc

Lines changed: 46 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -96,23 +96,50 @@ template<typename Real> void TestCuVectorVecVecOne(int32 dim) {
9696

9797

9898

99-
template<typename Real> void TestCuVectorAddDiagMatMat(int32 dim) {
99+
template<typename Real> void TestCuVectorAddDiagMatMat(int32 dim,
100+
MatrixTransposeType transN,
101+
MatrixTransposeType transO) {
100102
BaseFloat time_in_secs = 0.05;
101103
CuVector<Real> v(dim);
102104
v.SetRandn();
103105
CuMatrix<Real> N(dim, dim), O(dim, dim);
104-
N.SetRandn(); O.SetRandn();
106+
N.SetRandn();
107+
O.SetRandn();
105108

106109
Timer tim;
107110
int32 iter = 0;
108111

109112
for (;tim.Elapsed() < time_in_secs; iter++) {
110-
v.AddDiagMatMat(1.0, N, kNoTrans, O, kNoTrans, 1.0);
113+
v.AddDiagMatMat(1.0, N, transN, O, transO, 1.0);
111114
}
112115

113116
BaseFloat fdim = dim;
114117
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
115-
KALDI_LOG << "For CuVector::AddDiagMatMat" << NameOf<Real>() << ", for dim = "
118+
KALDI_LOG << "For CuVector::AddDiagMatMat" << NameOf<Real>()
119+
<< (transN == kNoTrans ? "[no-trans],":"[trans],")
120+
<< (transO == kNoTrans ? "[no-trans],":"[trans],")
121+
<< " for dim = "<< dim << ", speed was " << gflops << " gigaflops.";
122+
}
123+
124+
125+
template<typename Real> void TestCuVectorAddDiagMat2(int32 dim, MatrixTransposeType trans) {
126+
BaseFloat time_in_secs = 0.05;
127+
CuVector<Real> v(dim);
128+
v.SetRandn();
129+
CuMatrix<Real> N(dim, dim);
130+
N.SetRandn();
131+
132+
Timer tim;
133+
int32 iter = 0;
134+
135+
for (;tim.Elapsed() < time_in_secs; iter++) {
136+
v.AddDiagMat2(1.0, N, trans, 0.0);
137+
}
138+
139+
BaseFloat fdim = dim;
140+
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
141+
KALDI_LOG << "For CuVector::AddDiagMat2" << NameOf<Real>()
142+
<< (trans == kTrans ? "[trans]" : "[no-trans]") << ", for dim = "
116143
<< dim << ", speed was " << gflops << " gigaflops.";
117144
}
118145

@@ -121,25 +148,27 @@ template<typename Real> void TestCuVectorAddDiagMatMat(int32 dim) {
121148
template<typename Real> void CudaVectorSpeedTest() {
122149
std::vector<int32> sizes;
123150
sizes.push_back(16);
151+
sizes.push_back(32);
152+
sizes.push_back(64);
124153
sizes.push_back(128);
125154
sizes.push_back(256);
126155
sizes.push_back(1024);
127156
int32 ns = sizes.size();
157+
for (int32 s = 0; s < ns; s++)
158+
TestCuVectorSoftmax<Real>(sizes[s]);
159+
for (int32 s = 0; s < ns; s++)
160+
TestCuVectorSum<Real>(sizes[s]);
161+
for (int32 s = 0; s < ns; s++)
162+
TestCuVectorVecVecOne<Real>(sizes[s]);
128163
for (int32 s = 0; s < ns; s++) {
129-
TestCuVectorSoftmax<Real>(sizes[s]);
130-
}
131-
132-
133-
for (int32 s = 0; s < ns; s++) {
134-
TestCuVectorSum<Real>(sizes[s]);
135-
}
136-
137-
for (int32 s = 0; s < ns; s++) {
138-
TestCuVectorVecVecOne<Real>(sizes[s]);
164+
TestCuVectorAddDiagMatMat<Real>(sizes[s], kNoTrans, kNoTrans);
165+
TestCuVectorAddDiagMatMat<Real>(sizes[s], kNoTrans, kTrans);
166+
TestCuVectorAddDiagMatMat<Real>(sizes[s], kTrans, kNoTrans);
167+
TestCuVectorAddDiagMatMat<Real>(sizes[s], kTrans, kTrans);
139168
}
140-
141-
for (int32 s = 0; s < ns; s++) {
142-
TestCuVectorAddDiagMatMat<Real>(sizes[s]);
169+
for (int32 s = 0; s < ns; s++) {
170+
TestCuVectorAddDiagMat2<Real>(sizes[s], kNoTrans);
171+
TestCuVectorAddDiagMat2<Real>(sizes[s], kTrans);
143172
}
144173

145174
}

0 commit comments

Comments
 (0)