Skip to content

Commit b1dc770

Browse files
committed
trunk: changes to Dan's neural net setup, with new preconditioning method (speed roughly doubled if you use train_pnorm_online.sh, which uses the new preconditioning method). Various bug-fixes, optimizations and cleanups in matrix code, cuda-matrix code and thread code. Still tuning this, so recipes are not checked in yet.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4077 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
1 parent b03ef02 commit b1dc770

36 files changed

+1985
-725
lines changed

src/cudamatrix/cu-kernels-ansi.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,6 @@ void cudaF_vec_min(const float* v, float* value, int dim);
9999
void cudaF_vec_max(const float* v, float* value, int dim);
100100
void cudaF_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
101101
void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
102-
void cudaF_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
103-
void cudaF_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
104102
void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M,
105103
int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride,
106104
int N_col_stride, int threads_per_element, float beta);
@@ -229,8 +227,6 @@ void cudaD_vec_min(const double* v, double* value, int dim);
229227
void cudaD_vec_max(const double* v, double* value, int dim);
230228
void cudaD_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
231229
void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
232-
void cudaD_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
233-
void cudaD_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
234230
void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
235231
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
236232
int N_col_stride, int threads_per_element, double beta);

src/cudamatrix/cu-kernels.cu

Lines changed: 5 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,8 @@ static void _add_diag_vec_mat(Real alpha, Real *mat, MatrixDim mat_dim,
178178
// Note from Dan: in this kernel, we make the x dimension correspond to the
179179
// row index and y to the column index. That was not always the case for
180180
// earlier kernels written by others.
181-
int i = blockIdx.x * blockDim.x + threadIdx.x; // row index
182-
int j = blockIdx.y * blockDim.y + threadIdx.y; // column index
181+
int i = blockIdx.y * blockDim.y + threadIdx.y; // row index
182+
int j = blockIdx.x * blockDim.x + threadIdx.x; // column index
183183

184184
int index = i * mat_dim.stride + j,
185185
index2 = i * mat2_row_stride + j * mat2_col_stride;
@@ -809,47 +809,13 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, int
809809
}
810810

811811

812-
template<typename Real>
813-
__global__
814-
static void _add_diag_mat(Real alpha, Real* v, const Real* mat, Real beta, MatrixDim dmat, int dim) {
815-
int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
816-
817-
if (i < dim) {
818-
Real sum = 0.0;
819-
for (int32_cuda j = 0; j < dmat.cols; j++) {
820-
int32_cuda index = j + i * dmat.stride;
821-
sum += mat[index] * mat[index];
822-
}
823-
v[i] = beta * v[i] + alpha * sum;
824-
}
825-
}
826-
827-
828-
template<typename Real>
829-
__global__
830-
static void _add_diag_mat_trans(Real alpha, Real* v, const Real* mat, Real beta, MatrixDim dmat, int dim) {
831-
int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
832-
// if (blockIdx.y > 0) return;
833-
834-
if (i < dim) {
835-
Real sum = 0.0;
836-
for (int32_cuda j = 0; j < dmat.rows; j++) {
837-
int32_cuda index = i + j * dmat.stride;
838-
sum += mat[index] * mat[index];
839-
}
840-
v[i] = beta * v[i] + alpha * sum;
841-
}
842-
}
843-
844812
// Adds diag(M N) to v, where M and N are matrices. We supply row_stride and
845813
// col_stride arguments for M and N, and swapping them allows us to transpose
846814
// those matrices. Note: we imagine row-major indexing here, just like Kaldi
847815
// and CBLAS (but unlike CUBLAS).
848816
// This kernel expects the blockDim to be (CU1DBLOCK, 1) and the
849-
// gridDim times CU1DBLOCK to be at least num-rows-of-v, but if the gridDim
850-
// times CU1DBLOCK is larger than that, it will make good use of the
851-
// extra threads. Note: for best efficiency, the gridDim should be approximately
852-
// (num-rows-of-v / CU1DBLOCK) times a power of 2.
817+
// gridDim times CU1DBLOCK to be at least num-rows-of-v * threads_per_element.
818+
// threads_per_element should be a power of 2.
853819
template<typename Real>
854820
__global__
855821
static void _add_diag_mat_mat(
@@ -862,7 +828,7 @@ static void _add_diag_mat_mat(
862828
__shared__ Real temp_data[CU1DBLOCK];
863829

864830
int i = blockIdx.x * blockDim.x + threadIdx.x;
865-
int v_idx = i / threads_per_element, // v_ids is the index into v that we are supposed to
831+
int v_idx = i / threads_per_element, // v_idx is the index into v that we are supposed to
866832
sub_idx = i % threads_per_element; // add to; 0 <= sub_idx < threads_per_element tells
867833
// us which block of elements we sum up.
868834
if (v_idx >= v_dim) return;
@@ -2150,13 +2116,6 @@ void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_str
21502116
_trace_mat_mat<float,2> <<<2,CU1DBLOCK>>>(A,B,dA,B_stride,value);
21512117
}
21522118

2153-
void cudaF_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) {
2154-
_add_diag_mat_trans<<<Gr,Bl>>>(alpha,v,mat,beta,dmat,dim);
2155-
}
2156-
2157-
void cudaF_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) {
2158-
_add_diag_mat<<<Gr,Bl>>>(alpha,v,mat,beta,dmat,dim);
2159-
}
21602119

21612120
void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M,
21622121
int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride,
@@ -2571,14 +2530,6 @@ void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_s
25712530
_trace_mat_mat<double,2> <<<2,CU1DBLOCK>>>(A,B,dA,B_stride,value);
25722531
}
25732532

2574-
void cudaD_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) {
2575-
_add_diag_mat_trans<<<Gr,Bl>>>(alpha,v,mat,beta,dmat,dim);
2576-
}
2577-
2578-
void cudaD_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) {
2579-
_add_diag_mat<<<Gr,Bl>>>(alpha,v,mat,beta,dmat,dim);
2580-
}
2581-
25822533
void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
25832534
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
25842535
int N_col_stride, int threads_per_element, double beta) {

src/cudamatrix/cu-kernels.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -141,14 +141,12 @@ inline void cuda_vec_min(const float* v, float* value, int dim) { cudaF_vec_min(
141141
inline void cuda_vec_max(const float* v, float* value, int dim) { cudaF_vec_max(v,value,dim); }
142142
inline void cuda_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat_trans(A,B,dA,B_stride,value); }
143143
inline void cuda_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat(A,B,dA,B_stride,value); }
144-
inline void cuda_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) { cudaF_add_diag_mat_trans(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
145144
inline void cuda_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M,
146145
int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride,
147146
int N_col_stride, int threads_per_element, float beta) {
148147
cudaF_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride,
149148
N_col_stride, threads_per_element, beta);
150149
}
151-
inline void cuda_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) { cudaF_add_diag_mat(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
152150
inline void cuda_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim) { cudaF_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); }
153151
inline void cuda_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); }
154152
inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); }
@@ -311,14 +309,12 @@ inline void cuda_vec_min(const double* v, double* value, int dim) { cudaD_vec_mi
311309
inline void cuda_vec_max(const double* v, double* value, int dim) { cudaD_vec_max(v,value,dim); }
312310
inline void cuda_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat_trans(A,B,dA,B_stride,value); }
313311
inline void cuda_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat(A,B,dA,B_stride,value); }
314-
inline void cuda_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat_trans(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
315312
inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
316313
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
317314
int N_col_stride, int threads_per_element, double beta) {
318315
cudaD_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride,
319316
N_col_stride, threads_per_element, beta);
320317
}
321-
inline void cuda_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
322318
inline void cuda_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim) { cudaD_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); }
323319
inline void cuda_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); }
324320
inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); }

src/cudamatrix/cu-matrix-speed-test.cc

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,26 @@ template<typename Real> void TestCuMatrixMatMat(int32 dim) {
5757
<< dim << ", speed was " << gflops << " gigaflops.";
5858
}
5959

60+
template<typename Real> void TestCuMatrixAddDiagVecMat(int32 dim, MatrixTransposeType trans) {
61+
BaseFloat time_in_secs = 0.015;
62+
CuMatrix<Real> M(dim, dim), N(dim, dim);
63+
CuVector<Real> v(dim);
64+
M.SetRandn();
65+
v.SetRandn();
66+
Timer tim;
67+
int32 iter = 0;
68+
for (;tim.Elapsed() < time_in_secs; iter++)
69+
N.AddDiagVecMat(1.0, v, M, trans, 0.0);
70+
71+
BaseFloat fdim = dim;
72+
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
73+
KALDI_LOG << "For CuMatrix::AddDiagVecMat" << NameOf<Real>()
74+
<< (trans == kTrans ? "[trans]" : "[no-trans]")
75+
<< ", for dim = " << dim << ", speed was "
76+
<< gflops << " gigaflops.";
77+
}
78+
79+
6080

6181
template<typename Real> void TestSymInvertPosDef(int32 dim) {
6282
BaseFloat time_in_secs = 0.025;
@@ -222,7 +242,7 @@ template<typename Real> void TestCuMatrixMulRowsGroupMat(int32 dim) {
222242

223243
template<typename Real> void TestCuMatrixSoftmax(int32 dim) {
224244
BaseFloat time_in_secs = 0.025;
225-
CuMatrix<Real> M(256, dim), N(256, dim);
245+
CuMatrix<Real> M(dim, dim), N(dim, dim);
226246
M.SetRandn();
227247
N.SetRandn();
228248
Timer tim;
@@ -237,6 +257,42 @@ template<typename Real> void TestCuMatrixSoftmax(int32 dim) {
237257
<< dim << ", speed was " << gflops << " gigaflops.";
238258
}
239259

260+
261+
template<typename Real> void TestCuMatrixGroupPnorm(int32 dim) {
262+
BaseFloat time_in_secs = 0.025;
263+
int32 group_size = 4;
264+
CuMatrix<Real> M(dim, dim), N(dim, dim / group_size);
265+
M.SetRandn();
266+
Timer tim;
267+
int32 iter = 0;
268+
for (;tim.Elapsed() < time_in_secs; iter++)
269+
N.GroupPnorm(M, 2.0);
270+
271+
BaseFloat fdim = dim;
272+
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
273+
KALDI_LOG << "For CuMatrix::GroupPnorm" << NameOf<Real>() << ", for dim = "
274+
<< dim << ", speed was " << gflops << " gigaflops.";
275+
}
276+
277+
template<typename Real> void TestCuMatrixGroupPnormDeriv(int32 dim) {
278+
BaseFloat time_in_secs = 0.025;
279+
int32 group_size = 4;
280+
CuMatrix<Real> M(dim, dim), N(dim, dim / group_size), O(dim, dim);
281+
M.SetRandn();
282+
N.GroupPnorm(M, 2.0);
283+
Timer tim;
284+
int32 iter = 0;
285+
286+
for (;tim.Elapsed() < time_in_secs; iter++)
287+
O.GroupPnormDeriv(M, N, 2.0);
288+
289+
BaseFloat fdim = dim;
290+
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
291+
KALDI_LOG << "For CuMatrix::GroupPnormDeriv" << NameOf<Real>() << ", for dim = "
292+
<< dim << ", speed was " << gflops << " gigaflops.";
293+
}
294+
295+
240296
template<typename Real> void TestCuMatrixTraceMatMat(int32 dim) {
241297
for (int32 n = 0; n < 2; n++) {
242298
MatrixTransposeType trans = (n == 0 ? kNoTrans : kTrans);
@@ -388,6 +444,10 @@ template<typename Real> void CudaMatrixSpeedTest() {
388444
int32 ns = sizes.size();
389445
for (int32 s = 0; s < ns; s++)
390446
TestCuMatrixMatMat<Real>(sizes[s]);
447+
for (int32 s = 0; s < ns; s++) {
448+
TestCuMatrixAddDiagVecMat<Real>(sizes[s], kNoTrans);
449+
TestCuMatrixAddDiagVecMat<Real>(sizes[s], kTrans);
450+
}
391451
for (int32 s = 0; s < ns; s++)
392452
TestSymInvertPosDef<Real>(sizes[s]);
393453
for (int32 s = 0; s < ns; s++)
@@ -402,6 +462,10 @@ template<typename Real> void CudaMatrixSpeedTest() {
402462
TestCuMatrixMulRowsGroupMat<Real>(sizes[s]);
403463
for (int32 s = 0; s < ns; s++)
404464
TestCuMatrixSoftmax<Real>(sizes[s]);
465+
for (int32 s = 0; s < ns; s++)
466+
TestCuMatrixGroupPnorm<Real>(sizes[s]);
467+
for (int32 s = 0; s < ns; s++)
468+
TestCuMatrixGroupPnormDeriv<Real>(sizes[s]);
405469
for (int32 s = 0; s < ns; s++)
406470
TestCuMatrixTraceMatMat<Real>(sizes[s]);
407471
for (int32 s = 0; s < ns; s++)

src/cudamatrix/cu-matrix.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -988,13 +988,13 @@ void CuMatrixBase<Real>::AddDiagVecMat(
988988

989989
Timer tim;
990990
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
991-
// Caution, this dimGrid is not the same way around as much of the other
992-
// code: going forward, I want to use the (rows, cols) order.
993-
dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(num_cols_, CU2DBLOCK));
994991

995-
MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
996-
if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
992+
dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK),
993+
n_blocks(num_rows_, CU2DBLOCK));
997994

995+
MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
996+
if (transM == kTrans)
997+
std::swap(M_row_stride, M_col_stride);
998998
cuda_add_diag_vec_mat(dimGrid, dimBlock, alpha, data_, Dim(),
999999
v.Data(), M.Data(), M_row_stride, M_col_stride, beta);
10001000
CU_SAFE_CALL(cudaGetLastError());

src/cudamatrix/cu-matrix.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,8 @@ class CuMatrixBase {
119119
const CuMatrixBase<Real> &B,
120120
MatrixTransposeType trans);
121121

122+
/// Adds "value" to the diagonal elements of the matrix. The matrix
123+
/// *this does not have to be square.
122124
void AddToDiag(Real value);
123125

124126
/// Dimensions
@@ -183,6 +185,8 @@ class CuMatrixBase {
183185

184186
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ (power)) ^ (1 / p)
185187
/// where G = x.NumCols() / y.NumCols() must be an integer.
188+
/// [note: y corresponds to *this and x to src, so
189+
/// src.NumCols() / this->NumCols() must be an integer.]
186190
void GroupPnorm(const CuMatrixBase<Real> &src, Real pow);
187191

188192
/// Calculate derivatives for the GroupPnorm function above...

src/cudamatrix/cu-sp-matrix-speed-test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ std::string NameOf() {
4141

4242
template<typename Real>
4343
static void UnitTestCuSpMatrixInvert(int32 dim) {
44-
BaseFloat time_in_secs = 0.5;
44+
BaseFloat time_in_secs = 0.01;
4545
int32 iter = 0;
4646
Timer tim;
4747
CuSpMatrix<Real> A(dim);
@@ -82,7 +82,7 @@ static void UnitTestCuSpMatrixInvert(int32 dim) {
8282

8383
template<typename Real>
8484
static void UnitTestCuSpMatrixCopyFromMat(int32 dim, SpCopyType copy_type) {
85-
BaseFloat time_in_secs = 0.1;
85+
BaseFloat time_in_secs = 0.05;
8686
int32 iter = 0;
8787
Timer tim;
8888
CuMatrix<Real> A(dim, dim);

src/cudamatrix/cu-vector-speed-test.cc

Lines changed: 46 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -96,23 +96,50 @@ template<typename Real> void TestCuVectorVecVecOne(int32 dim) {
9696

9797

9898

99-
template<typename Real> void TestCuVectorAddDiagMatMat(int32 dim) {
99+
template<typename Real> void TestCuVectorAddDiagMatMat(int32 dim,
100+
MatrixTransposeType transN,
101+
MatrixTransposeType transO) {
100102
BaseFloat time_in_secs = 0.05;
101103
CuVector<Real> v(dim);
102104
v.SetRandn();
103105
CuMatrix<Real> N(dim, dim), O(dim, dim);
104-
N.SetRandn(); O.SetRandn();
106+
N.SetRandn();
107+
O.SetRandn();
105108

106109
Timer tim;
107110
int32 iter = 0;
108111

109112
for (;tim.Elapsed() < time_in_secs; iter++) {
110-
v.AddDiagMatMat(1.0, N, kNoTrans, O, kNoTrans, 1.0);
113+
v.AddDiagMatMat(1.0, N, transN, O, transO, 1.0);
111114
}
112115

113116
BaseFloat fdim = dim;
114117
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
115-
KALDI_LOG << "For CuVector::AddDiagMatMat" << NameOf<Real>() << ", for dim = "
118+
KALDI_LOG << "For CuVector::AddDiagMatMat" << NameOf<Real>()
119+
<< (transN == kNoTrans ? "[no-trans],":"[trans],")
120+
<< (transO == kNoTrans ? "[no-trans],":"[trans],")
121+
<< " for dim = "<< dim << ", speed was " << gflops << " gigaflops.";
122+
}
123+
124+
125+
template<typename Real> void TestCuVectorAddDiagMat2(int32 dim, MatrixTransposeType trans) {
126+
BaseFloat time_in_secs = 0.05;
127+
CuVector<Real> v(dim);
128+
v.SetRandn();
129+
CuMatrix<Real> N(dim, dim);
130+
N.SetRandn();
131+
132+
Timer tim;
133+
int32 iter = 0;
134+
135+
for (;tim.Elapsed() < time_in_secs; iter++) {
136+
v.AddDiagMat2(1.0, N, trans, 0.0);
137+
}
138+
139+
BaseFloat fdim = dim;
140+
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
141+
KALDI_LOG << "For CuVector::AddDiagMat2" << NameOf<Real>()
142+
<< (trans == kTrans ? "[trans]" : "[no-trans]") << ", for dim = "
116143
<< dim << ", speed was " << gflops << " gigaflops.";
117144
}
118145

@@ -121,25 +148,27 @@ template<typename Real> void TestCuVectorAddDiagMatMat(int32 dim) {
121148
template<typename Real> void CudaVectorSpeedTest() {
122149
std::vector<int32> sizes;
123150
sizes.push_back(16);
151+
sizes.push_back(32);
152+
sizes.push_back(64);
124153
sizes.push_back(128);
125154
sizes.push_back(256);
126155
sizes.push_back(1024);
127156
int32 ns = sizes.size();
157+
for (int32 s = 0; s < ns; s++)
158+
TestCuVectorSoftmax<Real>(sizes[s]);
159+
for (int32 s = 0; s < ns; s++)
160+
TestCuVectorSum<Real>(sizes[s]);
161+
for (int32 s = 0; s < ns; s++)
162+
TestCuVectorVecVecOne<Real>(sizes[s]);
128163
for (int32 s = 0; s < ns; s++) {
129-
TestCuVectorSoftmax<Real>(sizes[s]);
130-
}
131-
132-
133-
for (int32 s = 0; s < ns; s++) {
134-
TestCuVectorSum<Real>(sizes[s]);
135-
}
136-
137-
for (int32 s = 0; s < ns; s++) {
138-
TestCuVectorVecVecOne<Real>(sizes[s]);
164+
TestCuVectorAddDiagMatMat<Real>(sizes[s], kNoTrans, kNoTrans);
165+
TestCuVectorAddDiagMatMat<Real>(sizes[s], kNoTrans, kTrans);
166+
TestCuVectorAddDiagMatMat<Real>(sizes[s], kTrans, kNoTrans);
167+
TestCuVectorAddDiagMatMat<Real>(sizes[s], kTrans, kTrans);
139168
}
140-
141-
for (int32 s = 0; s < ns; s++) {
142-
TestCuVectorAddDiagMatMat<Real>(sizes[s]);
169+
for (int32 s = 0; s < ns; s++) {
170+
TestCuVectorAddDiagMat2<Real>(sizes[s], kNoTrans);
171+
TestCuVectorAddDiagMat2<Real>(sizes[s], kTrans);
143172
}
144173

145174
}

0 commit comments

Comments
 (0)