forked from taskflow/taskflow
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtranspose.hpp
More file actions
41 lines (32 loc) · 975 Bytes
/
Copy pathtranspose.hpp
File metadata and controls
41 lines (32 loc) · 975 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#pragma once
#include "../cuda_error.hpp"
namespace tf {
// ----------------------------------------------------------------------------
// tiled transpose of a row-major matrix
// ----------------------------------------------------------------------------
//
// Transposes a row-major `rows x cols` matrix into a row-major `cols x rows`
// matrix using one 32x32 shared-memory tile per thread block.
//
// Parameters:
//   d_in  - input matrix, read as d_in[r * cols + c]; must be dereferenceable
//           from device code
//   d_out - output matrix, written as d_out[c * rows + r]; must not overlap
//           d_in, because a block may write an output tile while another
//           block has not yet read the corresponding input elements
//   rows  - number of rows of the input matrix
//   cols  - number of columns of the input matrix
//
// Launch layout:
//   Block (bx, by) processes the input tile whose top-left element is at
//   row 32 * by, column 32 * bx. To cover the whole matrix the grid needs at
//   least ceil(cols / 32) blocks in x and ceil(rows / 32) blocks in y.
//   Any 2D block shape is accepted: threads stride over the tile by
//   blockDim.x / blockDim.y. With a 32x8 block each thread handles four rows
//   of the tile, one column each.
//
// Shared memory: 32 * 33 * sizeof(T) bytes, statically allocated.
template <typename T>
__global__ void cuda_transpose(
  const T* d_in,
  T* d_out,
  size_t rows,
  size_t cols
) {

  // edge length of the square tile handled by one block
  constexpr size_t TILE = 32;

  // The inner dimension is padded by one element so that the column-wise
  // reads in the store phase do not all map to the same shared-memory bank.
  __shared__ T tile[TILE][TILE + 1];

  // Widen before multiplying: blockIdx.* is a 32-bit unsigned value and the
  // product would otherwise wrap for very wide matrices.
  const size_t tile_col0 = static_cast<size_t>(blockIdx.x) * TILE;
  const size_t tile_row0 = static_cast<size_t>(blockIdx.y) * TILE;

  // Load phase: tile[r][c] <- in[tile_row0 + r][tile_col0 + c].
  // Threads adjacent in x read adjacent addresses of d_in.
  for(size_t r = threadIdx.y; r < TILE; r += blockDim.y) {
    const size_t in_row = tile_row0 + r;
    for(size_t c = threadIdx.x; c < TILE; c += blockDim.x) {
      const size_t in_col = tile_col0 + c;
      if(in_row < rows && in_col < cols) {
        tile[r][c] = d_in[in_row * cols + in_col];
      }
    }
  }

  // Every thread must finish writing the tile before any thread reads an
  // element that was loaded by a different thread.
  __syncthreads();

  // Store phase: out[tile_col0 + r][tile_row0 + c] <- tile[c][r].
  // The output matrix has `cols` rows and `rows` columns. The bounds test
  // mirrors the one used while loading, so only elements that were actually
  // written to the tile are read back.
  for(size_t r = threadIdx.y; r < TILE; r += blockDim.y) {
    const size_t out_row = tile_col0 + r;
    for(size_t c = threadIdx.x; c < TILE; c += blockDim.x) {
      const size_t out_col = tile_row0 + c;
      if(out_row < cols && out_col < rows) {
        d_out[out_row * rows + out_col] = tile[c][r];
      }
    }
  }
}
} // end of namespace --------------------------------------------------------