This repository was archived by the owner on Aug 11, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 89
Expand file tree
/
Copy pathvptr.cpp
More file actions
120 lines (106 loc) · 4.54 KB
/
Copy pathvptr.cpp
File metadata and controls
120 lines (106 loc) · 4.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/***************************************************************************
*
* Copyright (C) 2017 Codeplay Software Limited
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* For your convenience, a copy of the License has been included in this
* repository.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Codeplay's ComputeCpp SDK
*
* example_vptr.cpp
*
* Description:
* Sample code that demonstrates the use of the virtual pointer interface in
* SYCL on matrix addition.
*
**************************************************************************/
#include <CL/sycl.hpp>
#include <iostream>
#include <vptr/virtual_ptr.hpp>
using namespace cl::sycl;
/* Kernel name tags. They are only ever used as the template argument of
 * parallel_for in main(), so a forward declaration is all that is needed. */
class init_a;
class init_b;
class matrix_add;

/* Matrix dimensions: main() walks N rows of M elements each and addresses
 * element (i, j) at the flat index i * M + j. */
constexpr size_t N = 100;
constexpr size_t M = 150;
/* This sample allocates three device-only matrices, using the virtual pointer
* and SYCLmalloc. It initialises the first two in parallel on the device. After
* that, it adds them together, storing the result in the third matrix. It then
* verifies the result on the host by using:
* - pointer arithmetic on the virtual pointer to index the matrix
* - host accessor to gain access to the data. */
int main() {
{
queue myQueue;
vptr::PointerMapper pMap;
/* Allocate the matrices using SYCLmalloc. a, b and c are virtual pointers,
* pointing to device buffers.
*/
float* a = static_cast<float*>(SYCLmalloc(N * M * sizeof(float), pMap));
float* b = static_cast<float*>(SYCLmalloc(N * M * sizeof(float), pMap));
float* c = static_cast<float*>(SYCLmalloc(N * M * sizeof(float), pMap));
/* This kernel will initialise the buffer pointed to by a. The accessor "A"
* has write access. We retrieve it directly from the PointerMapper, using
* the virtual pointer.*/
myQueue.submit([&](handler& cgh) {
auto accA = pMap.get_access<access::mode::discard_write,
access::target::global_buffer, float>(a, cgh);
cgh.parallel_for<init_a>(
range<1>{N * M}, [=](item<1> index) { accA[index] = index[0] * 2; });
});
/* Similarly, this kernel will initialise the buffer pointed to by b. */
myQueue.submit([&](handler& cgh) {
auto accB = pMap.get_access<access::mode::discard_write,
access::target::global_buffer, float>(b, cgh);
cgh.parallel_for<init_b>(range<1>{N * M}, [=](item<1> index) {
accB[index] = index[0] * 2014;
});
});
/* This kernel will perform the computation c = a + b. */
myQueue.submit([&](handler& cgh) {
auto accA = pMap.get_access<access::mode::read,
access::target::global_buffer, float>(a, cgh);
auto accB = pMap.get_access<access::mode::read,
access::target::global_buffer, float>(b, cgh);
auto accC = pMap.get_access<access::mode::discard_write,
access::target::global_buffer, float>(c, cgh);
cgh.parallel_for<matrix_add>(range<1>{N * M}, [=](item<1> index) {
accC[index] = accA[index] + accB[index];
});
});
/* On the host, the result stored in the buffer of virtual pointer "c" are
* checked. The matrix is accessed row by row, using pointer arithmetics on
* the virtual pointer. */
auto c_row = c;
for (size_t i = 0; i < N; i++) {
/* Get the number of elements by which the row is offset. */
auto row_offset = pMap.get_element_offset<float>(c_row);
/* Create a host accessor to access the data on the host. */
auto accC = pMap.get_access<access::mode::read,
access::target::host_buffer, float>(c_row);
for (size_t j = 0; j < M; j++) {
if (accC[row_offset + j] != (i * M + j) * (2 + 2014)) {
std::cout << "Wrong value " << accC[row_offset + j] << " for element "
<< i * M + j << std::endl;
return -1;
}
c_row++;
}
}
/* End scope of myQueue, this waits for any remaining operations on the
* queue to complete. */
}
std::cout << "Good computation!" << std::endl;
return 0;
}