This repository was archived by the owner on Aug 11, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 89
Expand file tree
/
Copy pathexample-sycl-application.cpp
More file actions
119 lines (107 loc) · 4.66 KB
/
Copy pathexample-sycl-application.cpp
File metadata and controls
119 lines (107 loc) · 4.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/***************************************************************************
*
* Copyright (C) 2016 Codeplay Software Limited
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* For your convenience, a copy of the License has been included in this
* repository.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Codeplay's ComputeCpp SDK
*
* example-sycl-application.cpp
*
* Description:
* Sample code that walks through the basics of executing a matrix
* add in SYCL.
*
**************************************************************************/
#include <CL/sycl.hpp>
#include <iostream>
using namespace cl::sycl;
/* Kernel name tags. Each forward-declared class is used only as the
 * template argument that names one parallel_for kernel in main(). */
class init_a;
class init_b;
class matrix_add;
/* Extents of the two dimensions of every buffer and kernel range in
 * this sample (first dimension N, second dimension M). */
constexpr size_t N = 100;
constexpr size_t M = 150;
/* This sample creates three device-only buffers and initialises two of
 * them on the device. After that, it adds those two together, storing the
 * result in the third buffer. It then verifies the result of the kernel
 * on the host by using a host accessor to gain access to the data. */
/* Entry point. Fills two N x M float buffers on the device, adds them
 * element-wise into a third buffer, then checks every element on the host.
 *
 * Returns 0 when every element matches the expected value, or -1 as soon
 * as the first mismatching element has been reported on std::cout. */
int main() {
  /* Destroying SYCL buffers blocks until all work associated with those
   * objects is completed. */
  {
    queue myQueue;

    /* Create device-only 2D buffers of floats for the matrices. */
    buffer<float, 2> a(range<2>{N, M});
    buffer<float, 2> b(range<2>{N, M});
    buffer<float, 2> c(range<2>{N, M});

    /* This kernel enqueue will initialise buffer a. The accessor "A" has
     * write access. */
    myQueue.submit([&](handler& cgh) {
      auto A = a.get_access<access::mode::write>(cgh);
      cgh.parallel_for<init_a>(range<2>{N, M}, [=](id<2> index) {
        /* The index arithmetic is integral; convert explicitly to the
         * buffer's element type rather than relying on an implicit
         * size_t -> float conversion. */
        A[index] = static_cast<float>(index[0] * 2 + index[1]);
      });
    });

    /* This kernel enqueue will likewise initialise buffer b. The only
     * accessor it specifies is a write accessor to b, so the runtime
     * can use this information to recognise that these kernels are
     * actually independent of each other. Therefore, they can be enqueued
     * to the device with no dependencies between each other. */
    myQueue.submit([&](handler& cgh) {
      auto B = b.get_access<access::mode::write>(cgh);
      cgh.parallel_for<init_b>(range<2>{N, M}, [=](id<2> index) {
        B[index] = static_cast<float>(index[0] * 2014 + index[1] * 42);
      });
    });

    /* This kernel will actually perform the computation C = A + B. Since
     * A and B are only read from, we specify read accessors for those two
     * buffers, which the SYCL runtime recognises as a dependency on the
     * previous kernels. If the data were initialised on a different device,
     * or on the host, the SYCL runtime would ensure that the data were
     * copied between contexts etc. properly. */
    myQueue.submit([&](handler& cgh) {
      auto A = a.get_access<access::mode::read>(cgh);
      auto B = b.get_access<access::mode::read>(cgh);
      auto C = c.get_access<access::mode::write>(cgh);
      cgh.parallel_for<matrix_add>(
          range<2>{N, M}, [=](id<2> index) { C[index] = A[index] + B[index]; });
    });

    /* A host accessor will copy data from the device and, under most
     * circumstances, allocate space for it for the user (it will not
     * allocate space when the map_allocator is used and an initial host
     * pointer is provided, as this instructs the runtime to map the data
     * into the host's memory). Since this code is attempting to access
     * buffer c, which had write access in the third kernel, the device is
     * assumed to have the most recent copy. Therefore, the runtime will
     * wait for the device to finish executing the third kernel before
     * copying data from the device to the host. Because it is read only,
     * were we to use buffer c on the device again, no copy would be issued
     * (and in fact, the operator[]() exposed here does not return an lvalue,
     * and cannot be assigned to). */
    auto C = c.get_access<access::mode::read>();

    std::cout << "Result:" << std::endl;
    for (size_t i = 0; i < N; i++) {
      for (size_t j = 0; j < M; j++) {
        /* Expected sum of the two initialisers:
         *   (2 * i + j) + (2014 * i + 42 * j).
         * For the N and M used by this sample every such value (and each
         * operand) is well below 2^24, so it is exactly representable as
         * a float and an exact comparison is valid. */
        const float expected =
            static_cast<float>(i * (2 + 2014) + j * (1 + 42));
        if (C[i][j] != expected) {
          std::cout << "Wrong value " << C[i][j] << " for element " << i << " "
                    << j << std::endl;
          return -1;
        }
      }
    }
  }

  std::cout << "Good computation!" << std::endl;
  return 0;
}