kfd_flat_memory.c source code [linux/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c]

1	// SPDX-License-Identifier: GPL-2.0 OR MIT
2	/*
3	* Copyright 2014-2022 Advanced Micro Devices, Inc.
4	*
5	* Permission is hereby granted, free of charge, to any person obtaining a
6	* copy of this software and associated documentation files (the "Software"),
7	* to deal in the Software without restriction, including without limitation
8	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
9	* and/or sell copies of the Software, and to permit persons to whom the
10	* Software is furnished to do so, subject to the following conditions:
11	*
12	* The above copyright notice and this permission notice shall be included in
13	* all copies or substantial portions of the Software.
14	*
15	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18	* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19	* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20	* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21	* OTHER DEALINGS IN THE SOFTWARE.
22	*
23	*/
24
25	#include <linux/device.h>
26	#include <linux/err.h>
27	#include <linux/fs.h>
28	#include <linux/sched.h>
29	#include <linux/slab.h>
30	#include <linux/uaccess.h>
31	#include <linux/compat.h>
32	#include <uapi/linux/kfd_ioctl.h>
33	#include <linux/time.h>
34	#include "kfd_priv.h"
35	#include <linux/mm.h>
36	#include <linux/mman.h>
37	#include <linux/processor.h>
38	#include "amdgpu_vm.h"
39
40	/*
41	* The primary memory I/O features being added for revisions of gfxip
42	* beyond 7.0 (Kaveri) are:
43	*
44	* Access to ATC/IOMMU mapped memory w/ associated extension of VA to 48b
45	*
46	* “Flat” shader memory access – These are new shader vector memory
47	* operations that do not reference a T#/V# so a “pointer” is what is
48	* sourced from the vector gprs for direct access to memory.
49	* This pointer space has the Shared(LDS) and Private(Scratch) memory
50	* mapped into this pointer space as apertures.
51	* The hardware then determines how to direct the memory request
52	* based on what apertures the request falls in.
53	*
54	* Unaligned support and alignment check
55	*
56	*
57	* System Unified Address - SUA
58	*
59	* The standard usage for GPU virtual addresses is that they are mapped by
60	* a set of page tables we call GPUVM and these page tables are managed by
61	* a combination of vidMM/driver software components. The current virtual
62	* address (VA) range for GPUVM is 40b.
63	*
64	* As of gfxip7.1 and beyond we’re adding the ability for compute memory
65	* clients (CP/RLC, DMA, SHADER(ifetch, scalar, and vector ops)) to access
66	* the same page tables used by host x86 processors and that are managed by
67	* the operating system. This is via a technique and hardware called ATC/IOMMU.
68	* The GPU has the capability of accessing both the GPUVM and ATC address
69	* spaces for a given VMID (process) simultaneously and we call this feature
70	* system unified address (SUA).
71	*
72	* There are three fundamental address modes of operation for a given VMID
73	* (process) on the GPU:
74	*
75	* HSA64 – 64b pointers and the default address space is ATC
76	* HSA32 – 32b pointers and the default address space is ATC
77	* GPUVM – 64b pointers and the default address space is GPUVM (driver
78	* model mode)
79	*
80	*
81	* HSA64 - ATC/IOMMU 64b
82	*
83	* A 64b pointer in the AMD64/IA64 CPU architecture is not fully utilized
84	* by the CPU so an AMD CPU can only access the high area
85	* (VA[63:47] == 0x1FFFF) and low area (VA[63:47] == 0) of the address space
86	* so the actual VA carried to translation is 48b. There is a “hole” in
87	* the middle of the 64b VA space.
88	*
89	* The GPU not only has access to all of the CPU accessible address space via
90	* ATC/IOMMU, but it also has access to the GPUVM address space. The “system
91	* unified address” feature (SUA) is the mapping of GPUVM and ATC address
92	* spaces into a unified pointer space. The method we take for 64b mode is
93	* to map the full 40b GPUVM address space into the hole of the 64b address
94	* space.
95
96	* The GPUVM_Base/GPUVM_Limit defines the aperture in the 64b space where we
97	* direct requests to be translated via GPUVM page tables instead of the
98	* IOMMU path.
99	*
100	*
101	* 64b to 49b Address conversion
102	*
103	* Note that there are still significant portions of unused regions (holes)
104	* in the 64b address space even for the GPU. There are several places in
105	* the pipeline (sw and hw) where we wish to compress the 64b virtual address
106	* to a 49b address. This 49b address is constituted of an “ATC” bit
107	* plus a 48b virtual address. This 49b address is what is passed to the
108	* translation hardware. ATC==0 means the 48b address is a GPUVM address
109	* (max of 2^40 – 1) intended to be translated via GPUVM page tables.
110	* ATC==1 means the 48b address is intended to be translated via IOMMU
111	* page tables.
112	*
113	* A 64b pointer is compared to the apertures that are defined (Base/Limit), in
114	* this case the GPUVM aperture (red) is defined and if a pointer falls in this
115	* aperture, we subtract the GPUVM_Base address and set the ATC bit to zero
116	* as part of the 64b to 49b conversion.
117	*
118	* Where this 64b to 49b conversion is done is a function of the usage.
119	* Most GPU memory access is via memory objects where the driver builds
120	* a descriptor which consists of a base address and a memory access by
121	* the GPU usually consists of some kind of an offset or Cartesian coordinate
122	* that references this memory descriptor. This is the case for shader
123	* instructions that reference the T# or V# constants, or for specified
124	* locations of assets (ex. the shader program location). In these cases
125	* the driver is what handles the 64b to 49b conversion and the base
126	* address in the descriptor (ex. V# or T# or shader program location)
127	* is defined as a 48b address w/ an ATC bit. For this usage a given
128	* memory object cannot straddle multiple apertures in the 64b address
129	* space. For example a shader program cannot jump in/out between ATC
130	* and GPUVM space.
131	*
132	* In some cases we wish to pass a 64b pointer to the GPU hardware and
133	* the GPU hw does the 64b to 49b conversion before passing memory
134	* requests to the cache/memory system. This is the case for the
135	* S_LOAD and FLAT_* shader memory instructions where we have 64b pointers
136	* in scalar and vector GPRs respectively.
137	*
138	* In all cases (no matter where the 64b -> 49b conversion is done), the gfxip
139	* hardware sends a 48b address along w/ an ATC bit, to the memory controller
140	* on the memory request interfaces.
141	*
142	* <client>_MC_rdreq_atc // read request ATC bit
143	*
144	* 0 : <client>_MC_rdreq_addr is a GPUVM VA
145	*
146	* 1 : <client>_MC_rdreq_addr is an ATC VA
147	*
148	*
149	* “Spare” aperture (APE1)
150	*
151	* We use the GPUVM aperture to differentiate ATC vs. GPUVM, but we also use
152	* apertures to set the Mtype field for S_LOAD/FLAT_* ops which is input to the
153	* config tables for setting cache policies. The “spare” (APE1) aperture is
154	* motivated by getting a different Mtype from the default.
155	* The default aperture isn’t an actual base/limit aperture; it is just the
156	* address space that doesn’t hit any defined base/limit apertures.
157	* The following diagram is a complete picture of the gfxip7.x SUA apertures.
158	* The APE1 can be placed either below or above
159	* the hole (cannot be in the hole).
160	*
161	*
162	* General Aperture definitions and rules
163	*
164	* An aperture register definition consists of a Base, Limit, Mtype, and
165	* usually an ATC bit indicating which translation tables that aperture uses.
166	* In all cases (for SUA and DUA apertures discussed later), aperture base
167	* and limit definitions are 64KB aligned.
168	*
169	* <ape>_Base[63:0] = { <ape>_Base_register[63:16], 0x0000 }
170	*
171	* <ape>_Limit[63:0] = { <ape>_Limit_register[63:16], 0xFFFF }
172	*
173	* The base and limit are considered inclusive to an aperture so being
174	* inside an aperture means (address >= Base) AND (address <= Limit).
175	*
176	* In no case is a payload that straddles multiple apertures expected to work.
177	* For example a load_dword_x4 that starts in one aperture and ends in another,
178	* does not work. For the vector FLAT_* ops we have detection capability in
179	* the shader for reporting a “memory violation” back to the
180	* SQ block for use in traps.
181	* A memory violation results when an op falls into the hole,
182	* or a payload straddles multiple apertures. The S_LOAD instruction
183	* does not have this detection.
184	*
185	* Apertures cannot overlap.
186	*
187	*
188	*
189	* HSA32 - ATC/IOMMU 32b
190	*
191	* For HSA32 mode, the pointers are interpreted as 32 bits and use a single GPR
192	* instead of two for the S_LOAD and FLAT_* ops. The entire GPUVM space of 40b
193	* will not fit so there is only partial visibility to the GPUVM
194	* space (defined by the aperture) for S_LOAD and FLAT_* ops.
195	* There is no spare (APE1) aperture for HSA32 mode.
196	*
197	*
198	* GPUVM 64b mode (driver model)
199	*
200	* This mode is related to HSA64 in that the difference really is that
201	* the default aperture is GPUVM (ATC==0) and not ATC space.
202	* We have gfxip7.x hardware that has FLAT_* and S_LOAD support for
203	* SUA GPUVM mode, but does not support HSA32/HSA64.
204	*
205	*
206	* Device Unified Address - DUA
207	*
208	* Device unified address (DUA) is the name of the feature that maps the
209	* Shared(LDS) memory and Private(Scratch) memory into the overall address
210	* space for use by the new FLAT_* vector memory ops. The Shared and
211	* Private memories are mapped as apertures into the address space,
212	* and the hardware detects when a FLAT_* memory request is to be redirected
213	* to the LDS or Scratch memory when it falls into one of these apertures.
214	* Like the SUA apertures, the Shared/Private apertures are 64KB aligned and
215	* the base/limit is “in” the aperture. For both HSA64 and GPUVM SUA modes,
216	* the Shared/Private apertures are always placed in a limited selection of
217	* options in the hole of the 64b address space. For HSA32 mode, the
218	* Shared/Private apertures can be placed anywhere in the 32b space
219	* except at 0.
220	*
221	*
222	* HSA64 Apertures for FLAT_* vector ops
223	*
224	* For HSA64 SUA mode, the Shared and Private apertures are always placed
225	* in the hole w/ a limited selection of possible locations. The requests
226	* that fall in the private aperture are expanded as a function of the
227	* work-item id (tid) and redirected to the location of the
228	* “hidden private memory”. The hidden private can be placed in either GPUVM
229	* or ATC space. The addresses that fall in the shared aperture are
230	* re-directed to the on-chip LDS memory hardware.
231	*
232	*
233	* HSA32 Apertures for FLAT_* vector ops
234	*
235	* In HSA32 mode, the Private and Shared apertures can be placed anywhere
236	* in the 32b space except at 0 (Private or Shared Base at zero disables
237	* the apertures). If the base addresses of the apertures are non-zero
238	* (i.e. the apertures exist), the size is always 64KB.
239	*
240	*
241	* GPUVM Apertures for FLAT_* vector ops
242	*
243	* In GPUVM mode, the Shared/Private apertures are specified identically
244	* to HSA64 mode where they are always in the hole at a limited selection
245	* of locations.
246	*
247	*
248	* Aperture Definitions for SUA and DUA
249	*
250	* The interpretation of the aperture register definitions for a given
251	* VMID is a function of the “SUA Mode” which is one of HSA64, HSA32, or
252	* GPUVM64 discussed in previous sections. The mode is first decoded, and
253	* then the remaining register decode is a function of the mode.
254	*
255	*
256	* SUA Mode Decode
257	*
258	* For the S_LOAD and FLAT_* shader operations, the SUA mode is decoded from
259	* the COMPUTE_DISPATCH_INITIATOR:DATA_ATC bit and
260	* the SH_MEM_CONFIG:PTR32 bits.
261	*
262	* COMPUTE_DISPATCH_INITIATOR:DATA_ATC SH_MEM_CONFIG:PTR32 Mode
263	*
264	* 1 0 HSA64
265	*
266	* 1 1 HSA32
267	*
268	* 0 X GPUVM64
269	*
270	* In general the hardware will ignore the PTR32 bit and treat
271	* as “0” whenever DATA_ATC = “0”, but sw should set PTR32=0
272	* when DATA_ATC=0.
273	*
274	* The DATA_ATC bit is only set for compute dispatches.
275	* All “Draw” dispatches are hardcoded to GPUVM64 mode
276	* for FLAT_* / S_LOAD operations.
277	*/
278
/*
 * Aperture base/limit helpers for the pre-GFXv9 (VI-style) layout.
 *
 * Every macro evaluates to a uint64_t expression.  All arguments are
 * parenthesized before being cast so that an expression argument (for
 * example a conditional) is converted as a whole.
 *
 * MAKE_GPUVM_APP_BASE_VI(gpu_num): gpu_num placed in bits [63:61], plus 2^48.
 * MAKE_GPUVM_APP_LIMIT(base, size): base with its low 40 bits cleared,
 *                                   plus size - 1.
 * MAKE_SCRATCH_APP_BASE_VI():       2^61 + 4GB.
 * MAKE_LDS_APP_BASE_VI():           2^61.
 * MAKE_{SCRATCH,LDS}_APP_LIMIT(base): last byte of the 4GB-aligned window
 *                                   that contains base.
 */
#define MAKE_GPUVM_APP_BASE_VI(gpu_num) \
	(((uint64_t)(gpu_num) << 61) + 0x1000000000000L)

#define MAKE_GPUVM_APP_LIMIT(base, size) \
	(((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1)

#define MAKE_SCRATCH_APP_BASE_VI() \
	(((uint64_t)(0x1UL) << 61) + 0x100000000L)

#define MAKE_SCRATCH_APP_LIMIT(base) \
	(((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)

#define MAKE_LDS_APP_BASE_VI() \
	(((uint64_t)(0x1UL) << 61) + 0x0)
#define MAKE_LDS_APP_LIMIT(base) \
	(((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
295
/*
 * On GFXv9 the LDS and scratch apertures are programmed independently
 * using the high 16 bits of the 64-bit virtual address. They must be
 * in the hole, which will be the case as long as the high 16 bits are
 * not 0.
 *
 * The aperture sizes are still 4GB implicitly.
 *
 * A GPUVM aperture is not applicable on GFXv9.
 *
 * Both macros evaluate to a uint64_t: the LDS base has 0x1 in bits
 * [63:48], the scratch base has 0x2 in bits [63:48].
 */
#define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48)
#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48)
307
/*
 * User mode manages most of the SVM aperture address space. The low
 * 16MB are reserved for kernel use (CWSR trap handler and kernel IB
 * for now).
 *
 * Layout implied by the definitions below, from low to high address:
 *   SVM_IB_BASE   = SVM_CWSR_BASE - PAGE_SIZE              (one page)
 *   SVM_CWSR_BASE = SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE
 *   SVM_USER_BASE = KFD_CWSR_TBA_TMA_SIZE + 2 * PAGE_SIZE
 *
 * SVM_USER_BASE is wrapped in parentheses so the cast expression stays
 * a single operand wherever the macro is expanded.
 */
#define SVM_USER_BASE ((u64)(KFD_CWSR_TBA_TMA_SIZE + 2*PAGE_SIZE))
#define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE)
#define SVM_IB_BASE   (SVM_CWSR_BASE - PAGE_SIZE)
315
316	static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id)
317	{
318	/*
319	* node id couldn't be 0 - the three MSB bits of
320	* aperture shouldn't be 0
321	*/
322	pdd->lds_base = MAKE_LDS_APP_BASE_VI();
323	pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
324
325	/ dGPUs: SVM aperture starting at 0*
326	* with small reserved space for kernel.
327	* Set them to CANONICAL addresses.
328	*/
329	pdd->gpuvm_base = max(SVM_USER_BASE, AMDGPU_VA_RESERVED_BOTTOM);
330	pdd->gpuvm_limit =
331	pdd->dev->kfd->shared_resources.gpuvm_size - `1`;
332
333	/ dGPUs: the reserved space for kernel*
334	* before SVM
335	*/
336	pdd->qpd.cwsr_base = SVM_CWSR_BASE;
337	pdd->qpd.ib_base = SVM_IB_BASE;
338
339	pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI();
340	pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
341	}
342
343	static void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id)
344	{
345	pdd->lds_base = MAKE_LDS_APP_BASE_V9();
346	pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
347
348	pdd->gpuvm_base = AMDGPU_VA_RESERVED_BOTTOM;
349	pdd->gpuvm_limit =
350	pdd->dev->kfd->shared_resources.gpuvm_size - `1`;
351
352	pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9();
353	pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
354
355	/*
356	* Place TBA/TMA on opposite side of VM hole to prevent
357	* stray faults from triggering SVM on these pages.
358	*/
359	pdd->qpd.cwsr_base = AMDGPU_VA_RESERVED_TRAP_START(pdd->dev->adev);
360	}
361
362	int kfd_init_apertures(struct kfd_process *process)
363	{
364	uint8_t id = `0`;
365	struct kfd_node *dev;
366	struct kfd_process_device *pdd;
367
368	/Iterating over all devices/
369	while (kfd_topology_enum_kfd_devices(idx: id, kdev: &dev) == `0`) {
370	if (!dev \|\| kfd_devcgroup_check_permission(node: dev)) {
371	/ Skip non GPU devices and devices to which the*
372	* current process have no access to. Access can be
373	* limited by placing the process in a specific
374	* cgroup hierarchy
375	*/
376	id++;
377	continue;
378	}
379
380	pdd = kfd_create_process_device_data(dev, p: process);
381	if (!pdd) {
382	dev_err(dev->adev->dev,
383	"Failed to create process device data\n");
384	return -ENOMEM;
385	}
386	/*
387	* For 64 bit process apertures will be statically reserved in
388	* the x86_64 non canonical process address space
389	* amdkfd doesn't currently support apertures for 32 bit process
390	*/
391	if (process->is_32bit_user_mode) {
392	pdd->lds_base = pdd->lds_limit = `0`;
393	pdd->gpuvm_base = pdd->gpuvm_limit = `0`;
394	pdd->scratch_base = pdd->scratch_limit = `0`;
395	} else {
396	switch (dev->adev->asic_type) {
397	case CHIP_KAVERI:
398	case CHIP_HAWAII:
399	case CHIP_CARRIZO:
400	case CHIP_TONGA:
401	case CHIP_FIJI:
402	case CHIP_POLARIS10:
403	case CHIP_POLARIS11:
404	case CHIP_POLARIS12:
405	case CHIP_VEGAM:
406	kfd_init_apertures_vi(pdd, id);
407	break;
408	default:
409	if (KFD_GC_VERSION(dev) >= IP_VERSION(`9`, `0`, `1`))
410	kfd_init_apertures_v9(pdd, id);
411	else {
412	WARN(`1`, "Unexpected ASIC family %u",
413	dev->adev->asic_type);
414	return -EINVAL;
415	}
416	}
417	}
418
419	dev_dbg(kfd_device, "node id %u\n", id);
420	dev_dbg(kfd_device, "gpu id %u\n", pdd->dev->id);
421	dev_dbg(kfd_device, "lds_base %llX\n", pdd->lds_base);
422	dev_dbg(kfd_device, "lds_limit %llX\n", pdd->lds_limit);
423	dev_dbg(kfd_device, "gpuvm_base %llX\n", pdd->gpuvm_base);
424	dev_dbg(kfd_device, "gpuvm_limit %llX\n", pdd->gpuvm_limit);
425	dev_dbg(kfd_device, "scratch_base %llX\n", pdd->scratch_base);
426	dev_dbg(kfd_device, "scratch_limit %llX\n", pdd->scratch_limit);
427
428	id++;
429	}
430
431	return `0`;
432	}
433

source code of linux/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c