// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "i915_drv.h"
#include "intel_context.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gtt.h"
#include "intel_migrate.h"
#include "intel_ring.h"
#include "gem/i915_gem_lmem.h"

struct insert_pte_data {
	u64 offset;
};

#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
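
/*
 * Sanity check on that bound: copying one 8 MiB chunk at an assumed
 * sustained blit rate of 8 GiB/s takes 8M / 8G s ~= 1 ms, which is the
 * longest a single non-preemptible PTE-update + blit window should stall
 * a higher-priority request.
 */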

#define GET_CCS_BYTES(i915, size)	(HAS_FLAT_CCS(i915) ? \
					 DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE) : 0)
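
/*
 * With NUM_BYTES_PER_CCS_BYTE being 256, the CCS overhead of a Flat-CCS
 * surface is size / 256, e.g. a 64M lmem object carries 256K of CCS state.
 */
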
static bool engine_supports_migration(struct intel_engine_cs *engine)
{
	if (!engine)
		return false;

	/*
	 * We need the ability to prevent arbitration (MI_ARB_ON_OFF),
	 * the ability to write PTEs using inline data (MI_STORE_DATA_IMM)
	 * and of course the ability to do the block transfer (blits).
	 */
	GEM_BUG_ON(engine->class != COPY_ENGINE_CLASS);

	return true;
}

static void xehp_toggle_pdes(struct i915_address_space *vm,
			     struct i915_page_table *pt,
			     void *data)
{
	struct insert_pte_data *d = data;

	/*
	 * Insert a dummy PTE into every PT that will map to LMEM to ensure
	 * we have a correctly setup PDE structure for later use.
	 */
	vm->insert_page(vm, 0, d->offset,
			i915_gem_get_pat_index(vm->i915, I915_CACHE_NONE),
			PTE_LM);
	GEM_BUG_ON(!pt->is_compact);
	d->offset += SZ_2M;
}

static void xehp_insert_pte(struct i915_address_space *vm,
			    struct i915_page_table *pt,
			    void *data)
{
	struct insert_pte_data *d = data;

	/*
	 * We are playing tricks here, since the actual pt, from the hw
	 * pov, is only 256 bytes with 32 entries, or 4096 bytes with 512
	 * entries, but we are still guaranteed that the physical
	 * alignment is 64K underneath for the pt, and we are careful
	 * not to access the space in the void.
	 */
	vm->insert_page(vm, px_dma(pt), d->offset,
			i915_gem_get_pat_index(vm->i915, I915_CACHE_NONE),
			PTE_LM);
	d->offset += SZ_64K;
}

static void insert_pte(struct i915_address_space *vm,
		       struct i915_page_table *pt,
		       void *data)
{
	struct insert_pte_data *d = data;

	vm->insert_page(vm, px_dma(pt), d->offset,
			i915_gem_get_pat_index(vm->i915, I915_CACHE_NONE),
			i915_gem_object_is_lmem(pt->base) ? PTE_LM : 0);
	d->offset += PAGE_SIZE;
}

static struct i915_address_space *migrate_vm(struct intel_gt *gt)
{
	struct i915_vm_pt_stash stash = {};
	struct i915_ppgtt *vm;
	int err;
	int i;

	/*
	 * We construct a very special VM for use by all migration contexts,
	 * it is kept pinned so that it can be used at any time. As we need
	 * to pre-allocate the page directories for the migration VM, this
	 * limits us to only using a small number of prepared vma.
	 *
	 * To be able to pipeline and reschedule migration operations while
	 * avoiding unnecessary contention on the vm itself, the PTE updates
	 * are inline with the blits. All the blits use the same fixed
	 * addresses, with the backing store redirection being updated on the
	 * fly. Only 2 implicit vma are used for all migration operations.
	 *
	 * We lay the ppGTT out as:
	 *
	 * [0, CHUNK_SZ) -> first object
	 * [CHUNK_SZ, 2 * CHUNK_SZ) -> second object
	 * [2 * CHUNK_SZ, 2 * CHUNK_SZ + (2 * CHUNK_SZ >> 9)) -> PTE
	 *
	 * By exposing the dma addresses of the page directories themselves
	 * within the ppGTT, we are then able to rewrite the PTE prior to use.
	 * But the PTE update and subsequent migration operation must be atomic,
	 * i.e. within the same non-preemptible window so that we do not switch
	 * to another migration context that overwrites the PTE.
	 *
	 * This changes quite a bit on platforms with HAS_64K_PAGES support,
	 * where we instead have three windows, each CHUNK_SZ in size. The
	 * first is reserved for mapping system-memory, and that just uses the
	 * 512 entry layout using 4K GTT pages. The other two windows just map
	 * lmem pages and must use the new compact 32 entry layout using 64K GTT
	 * pages, which ensures we can address any lmem object that the user
	 * throws at us. We then also use xehp_toggle_pdes as a way of
	 * just toggling the PDE bit (GEN12_PDE_64K) for us, to enable the
	 * compact layout for each of these page-tables, that fall within the
	 * [CHUNK_SZ, 3 * CHUNK_SZ) range.
	 *
	 * We lay the ppGTT out as:
	 *
	 * [0, CHUNK_SZ) -> first window/object, maps smem
	 * [CHUNK_SZ, 2 * CHUNK_SZ) -> second window/object, maps lmem src
	 * [2 * CHUNK_SZ, 3 * CHUNK_SZ) -> third window/object, maps lmem dst
	 *
	 * For the PTE window it's also quite different, since each PTE must
	 * point to some 64K page, one for each PT (since it's in lmem), and yet
	 * each is only <= 4096 bytes, but since the unused space within that PTE
	 * range is never touched, this should be fine.
	 *
	 * So basically each PT now needs 64K of virtual memory, instead of 4K,
	 * which looks like:
	 *
	 * [3 * CHUNK_SZ, 3 * CHUNK_SZ + ((3 * CHUNK_SZ / SZ_2M) * SZ_64K)) -> PTE
	 */
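	/*
	 * Concretely, with CHUNK_SZ = 8M: without 64K pages the two 8M
	 * windows need 16M / 4K = 4096 PTEs of 8 bytes each, i.e. a 32K
	 * (2 * CHUNK_SZ >> 9) PTE window; with 64K pages the three 8M
	 * windows cover 24M / 2M = 12 page tables, each granted a 64K
	 * slot, i.e. a 768K PTE window starting at 3 * CHUNK_SZ.
	 */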

	vm = i915_ppgtt_create(gt, I915_BO_ALLOC_PM_EARLY);
	if (IS_ERR(vm))
		return ERR_CAST(vm);

	if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
		err = -ENODEV;
		goto err_vm;
	}

	if (HAS_64K_PAGES(gt->i915))
		stash.pt_sz = I915_GTT_PAGE_SIZE_64K;

	/*
	 * Each engine instance is assigned its own chunk in the VM, so
	 * that we can run multiple instances concurrently.
	 */
	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
		struct intel_engine_cs *engine;
		u64 base = (u64)i << 32;
		struct insert_pte_data d = {};
		struct i915_gem_ww_ctx ww;
		u64 sz;

		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
		if (!engine_supports_migration(engine))
			continue;

		/*
		 * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
		 * 4x2 page directories for source/destination.
		 */
		if (HAS_64K_PAGES(gt->i915))
			sz = 3 * CHUNK_SZ;
		else
			sz = 2 * CHUNK_SZ;
		d.offset = base + sz;

		/*
		 * We need another page directory setup so that we can write
		 * the 8x512 PTE in each chunk.
		 */
		if (HAS_64K_PAGES(gt->i915))
			sz += (sz / SZ_2M) * SZ_64K;
		else
			sz += (sz >> 12) * sizeof(u64);

		err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
		if (err)
			goto err_vm;

		for_i915_gem_ww(&ww, err, true) {
			err = i915_vm_lock_objects(&vm->vm, &ww);
			if (err)
				continue;
			err = i915_vm_map_pt_stash(&vm->vm, &stash);
			if (err)
				continue;

			vm->vm.allocate_va_range(&vm->vm, &stash, base, sz);
		}
		i915_vm_free_pt_stash(&vm->vm, &stash);
		if (err)
			goto err_vm;

		/* Now allow the GPU to rewrite the PTE via its own ppGTT */
		if (HAS_64K_PAGES(gt->i915)) {
			vm->vm.foreach(&vm->vm, base, d.offset - base,
				       xehp_insert_pte, &d);
			d.offset = base + CHUNK_SZ;
			vm->vm.foreach(&vm->vm,
				       d.offset,
				       2 * CHUNK_SZ,
				       xehp_toggle_pdes, &d);
		} else {
			vm->vm.foreach(&vm->vm, base, d.offset - base,
				       insert_pte, &d);
		}
	}

	return &vm->vm;

err_vm:
	i915_vm_put(&vm->vm);
	return ERR_PTR(err);
}

static struct intel_engine_cs *first_copy_engine(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	int i;

	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
		if (engine_supports_migration(engine))
			return engine;
	}

	return NULL;
}

static struct intel_context *pinned_context(struct intel_gt *gt)
{
	static struct lock_class_key key;
	struct intel_engine_cs *engine;
	struct i915_address_space *vm;
	struct intel_context *ce;

	engine = first_copy_engine(gt);
	if (!engine)
		return ERR_PTR(-ENODEV);

	vm = migrate_vm(gt);
	if (IS_ERR(vm))
		return ERR_CAST(vm);

	ce = intel_engine_create_pinned_context(engine, vm, SZ_512K,
						I915_GEM_HWS_MIGRATE,
						&key, "migrate");
	i915_vm_put(vm);
	return ce;
}

int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt)
{
	struct intel_context *ce;

	memset(m, 0, sizeof(*m));

	ce = pinned_context(gt);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	m->context = ce;
	return 0;
}

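/*
 * Map a uniform 32-bit random value into [0, max) with a fixed-point
 * multiply-shift (Lemire's reduction), avoiding the slight bias of a
 * plain modulo.
 */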
static int random_index(unsigned int max)
{
	return upper_32_bits(mul_u32_u32(get_random_u32(), max));
}

static struct intel_context *__migrate_engines(struct intel_gt *gt)
{
	struct intel_engine_cs *engines[MAX_ENGINE_INSTANCE];
	struct intel_engine_cs *engine;
	unsigned int count, i;

	count = 0;
	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
		if (engine_supports_migration(engine))
			engines[count++] = engine;
	}

	return intel_context_create(engines[random_index(count)]);
}

struct intel_context *intel_migrate_create_context(struct intel_migrate *m)
{
	struct intel_context *ce;

	/*
	 * We randomly distribute contexts across the engines upon construction,
	 * as they all share the same pinned vm, and so in order to allow
	 * multiple blits to run in parallel, we must construct each blit
	 * to use a different range of the vm for its GTT. This has to be
	 * known at construction, so we can not use the late greedy load
	 * balancing of the virtual-engine.
	 */
	ce = __migrate_engines(m->context->engine->gt);
	if (IS_ERR(ce))
		return ce;

	ce->ring = NULL;
	ce->ring_size = SZ_256K;

	i915_vm_put(ce->vm);
	ce->vm = i915_vm_get(m->context->vm);

	return ce;
}

static inline struct sgt_dma sg_sgt(struct scatterlist *sg)
{
	dma_addr_t addr = sg_dma_address(sg);

	return (struct sgt_dma){ sg, addr, addr + sg_dma_len(sg) };
}

static int emit_no_arbitration(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Explicitly disable preemption for this request. */
	*cs++ = MI_ARB_ON_OFF;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

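/*
 * Bound the dword count of one PTE packet by the space available in the
 * ring, so that emit_pte() can split a large update across several
 * MI_STORE_DATA_IMM packets rather than overflowing the ring.
 */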
static int max_pte_pkt_size(struct i915_request *rq, int pkt)
{
	struct intel_ring *ring = rq->ring;

	pkt = min_t(int, pkt, (ring->space - rq->reserved_space) / sizeof(u32) + 5);
	pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);

	return pkt;
}

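/*
 * The smallest self-contained PTE packet: a 3-dword MI_STORE_DATA_IMM
 * header (command plus 64b GTT offset), one qword PTE payload (2 dwords)
 * and a closing MI_NOOP.
 */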
#define I915_EMIT_PTE_NUM_DWORDS 6

static int emit_pte(struct i915_request *rq,
		    struct sgt_dma *it,
		    unsigned int pat_index,
		    bool is_lmem,
		    u64 offset,
		    int length)
{
	bool has_64K_pages = HAS_64K_PAGES(rq->i915);
	const u64 encode = rq->context->vm->pte_encode(0, pat_index,
						       is_lmem ? PTE_LM : 0);
	struct intel_ring *ring = rq->ring;
	int pkt, dword_length;
	u32 total = 0;
	u32 page_size;
	u32 *hdr, *cs;

	GEM_BUG_ON(GRAPHICS_VER(rq->i915) < 8);

	page_size = I915_GTT_PAGE_SIZE;
	dword_length = 0x400;

	/* Compute the page directory offset for the target address range */
	if (has_64K_pages) {
		GEM_BUG_ON(!IS_ALIGNED(offset, SZ_2M));

		offset /= SZ_2M;
		offset *= SZ_64K;
		offset += 3 * CHUNK_SZ;

		if (is_lmem) {
			page_size = I915_GTT_PAGE_SIZE_64K;
			dword_length = 0x40;
		}
	} else {
		offset >>= 12;
		offset *= sizeof(u64);
		offset += 2 * CHUNK_SZ;
	}
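	/*
	 * e.g. on a 4K-page platform the PTE slot for VA CHUNK_SZ (8M) sits
	 * at 2 * CHUNK_SZ + (8M >> 12) * 8 = 2 * CHUNK_SZ + 16K within this
	 * engine's window.
	 */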

	offset += (u64)rq->engine->instance << 32;

	cs = intel_ring_begin(rq, I915_EMIT_PTE_NUM_DWORDS);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Pack as many PTE updates as possible into a single MI command */
	pkt = max_pte_pkt_size(rq, dword_length);

	hdr = cs;
	*cs++ = MI_STORE_DATA_IMM | REG_BIT(21); /* as qword elements */
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);

	do {
		if (cs - hdr >= pkt) {
			int dword_rem;

			*hdr += cs - hdr - 2;
			*cs++ = MI_NOOP;

			ring->emit = (void *)cs - ring->vaddr;
			intel_ring_advance(rq, cs);
			intel_ring_update_space(ring);

			cs = intel_ring_begin(rq, I915_EMIT_PTE_NUM_DWORDS);
			if (IS_ERR(cs))
				return PTR_ERR(cs);

			dword_rem = dword_length;
			if (has_64K_pages) {
				if (IS_ALIGNED(total, SZ_2M)) {
					offset = round_up(offset, SZ_64K);
				} else {
					dword_rem = SZ_2M - (total & (SZ_2M - 1));
					dword_rem /= page_size;
					dword_rem *= 2;
				}
			}

			pkt = max_pte_pkt_size(rq, dword_rem);

			hdr = cs;
			*cs++ = MI_STORE_DATA_IMM | REG_BIT(21);
			*cs++ = lower_32_bits(offset);
			*cs++ = upper_32_bits(offset);
		}

		GEM_BUG_ON(!IS_ALIGNED(it->dma, page_size));

		*cs++ = lower_32_bits(encode | it->dma);
		*cs++ = upper_32_bits(encode | it->dma);

		offset += 8;
		total += page_size;

		it->dma += page_size;
		if (it->dma >= it->max) {
			it->sg = __sg_next(it->sg);
			if (!it->sg || sg_dma_len(it->sg) == 0)
				break;

			it->dma = sg_dma_address(it->sg);
			it->max = it->dma + sg_dma_len(it->sg);
		}
	} while (total < length);

	*hdr += cs - hdr - 2;
	*cs++ = MI_NOOP;

	ring->emit = (void *)cs - ring->vaddr;
	intel_ring_advance(rq, cs);
	intel_ring_update_space(ring);

	return total;
}

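/*
 * Wa_1209644611: on graphics version 11 the fast-copy blit is avoided for
 * transfer heights of 3 or 7 pages (height % 4 == 3 && height <= 8), and
 * the legacy XY_SRC_COPY path in emit_copy() is used instead.
 */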
static bool wa_1209644611_applies(int ver, u32 size)
{
	u32 height = size >> PAGE_SHIFT;

	if (ver != 11)
		return false;

	return height % 4 == 3 && height <= 8;
}

/**
 * DOC: Flat-CCS - Memory compression for Local memory
 *
 * On Xe-HP and later devices, we use dedicated compression control state (CCS)
 * stored in local memory for each surface, to support the 3D and media
 * compression formats.
 *
 * The memory required for the CCS of the entire local memory is 1/256 of the
 * local memory size. So before the kernel boots, the required memory is
 * reserved for the CCS data and a secure register is programmed with the CCS
 * base address.
 *
 * Flat CCS data needs to be cleared when a lmem object is allocated.
 * And CCS data can be copied in and out of the CCS region through
 * XY_CTRL_SURF_COPY_BLT. The CPU can't access the CCS data directly.
 *
 * I915 supports Flat-CCS on lmem-only objects. When an object has smem in
 * its preference list, on memory pressure, i915 needs to migrate the lmem
 * content into smem. If the lmem object is Flat-CCS compressed by userspace,
 * then i915 needs to decompress it. But i915 lacks the required information
 * for such decompression. Hence i915 supports Flat-CCS only on lmem-only
 * objects.
 *
 * When we exhaust the lmem, Flat-CCS capable objects' lmem backing memory can
 * be temporarily evicted to smem, along with the auxiliary CCS state, where
 * it can be potentially swapped-out at a later point, if required.
 * If userspace later touches the evicted pages, then we always move
 * the backing memory back to lmem, which includes restoring the saved CCS
 * state, and potentially performing any required swap-in.
 *
 * For the migration of lmem objects with smem in the placement list, such as
 * {lmem, smem}, the objects are treated as non Flat-CCS capable objects.
 */

static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
{
	*cmd++ = MI_FLUSH_DW | flags;
	*cmd++ = 0;
	*cmd++ = 0;

	return cmd;
}

static int emit_copy_ccs(struct i915_request *rq,
			 u32 dst_offset, u8 dst_access,
			 u32 src_offset, u8 src_access, int size)
{
	struct drm_i915_private *i915 = rq->i915;
	int mocs = rq->engine->gt->mocs.uc_index << 1;
	u32 num_ccs_blks;
	u32 *cs;

	cs = intel_ring_begin(rq, 12);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	num_ccs_blks = DIV_ROUND_UP(GET_CCS_BYTES(i915, size),
				    NUM_CCS_BYTES_PER_BLOCK);
	GEM_BUG_ON(num_ccs_blks > NUM_CCS_BLKS_PER_XFER);
	cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);

	/*
	 * The XY_CTRL_SURF_COPY_BLT instruction is used to copy the CCS
	 * data in and out of the CCS region.
	 *
	 * We can copy at most 1024 blocks of 256 bytes using one
	 * XY_CTRL_SURF_COPY_BLT instruction.
	 *
	 * In case we need to copy more than 1024 blocks, we need to add
	 * another instruction to the same batch buffer.
	 *
	 * 1024 blocks of 256 bytes of CCS represent a total of 256 KB of CCS.
	 *
	 * 256 KB of CCS represents 256 * 256 KB = 64 MB of LMEM.
	 */
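	/*
	 * For example, one CHUNK_SZ (8M) chunk of lmem carries
	 * 8M / 256 = 32K of CCS, i.e. 128 blocks, comfortably within a
	 * single XY_CTRL_SURF_COPY_BLT.
	 */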
	*cs++ = XY_CTRL_SURF_COPY_BLT |
		src_access << SRC_ACCESS_TYPE_SHIFT |
		dst_access << DST_ACCESS_TYPE_SHIFT |
		((num_ccs_blks - 1) & CCS_SIZE_MASK) << CCS_SIZE_SHIFT;
	*cs++ = src_offset;
	*cs++ = rq->engine->instance |
		FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
	*cs++ = dst_offset;
	*cs++ = rq->engine->instance |
		FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);

	cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

static int emit_copy(struct i915_request *rq,
		     u32 dst_offset, u32 src_offset, int size)
{
	const int ver = GRAPHICS_VER(rq->i915);
	u32 instance = rq->engine->instance;
	u32 *cs;

	cs = intel_ring_begin(rq, ver >= 8 ? 10 : 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (ver >= 9 && !wa_1209644611_applies(ver, size)) {
		*cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
		*cs++ = BLT_DEPTH_32 | PAGE_SIZE;
		*cs++ = 0;
		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
		*cs++ = dst_offset;
		*cs++ = instance;
		*cs++ = 0;
		*cs++ = PAGE_SIZE;
		*cs++ = src_offset;
		*cs++ = instance;
	} else if (ver >= 8) {
		*cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
		*cs++ = 0;
		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
		*cs++ = dst_offset;
		*cs++ = instance;
		*cs++ = 0;
		*cs++ = PAGE_SIZE;
		*cs++ = src_offset;
		*cs++ = instance;
	} else {
		GEM_BUG_ON(instance);
		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
		*cs++ = dst_offset;
		*cs++ = PAGE_SIZE;
		*cs++ = src_offset;
	}

	intel_ring_advance(rq, cs);
	return 0;
}

static u64 scatter_list_length(struct scatterlist *sg)
{
	u64 len = 0;

	while (sg && sg_dma_len(sg)) {
		len += sg_dma_len(sg);
		sg = sg_next(sg);
	}

	return len;
}

static int
calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
		   u64 bytes_to_cpy, u64 ccs_bytes_to_cpy)
{
	if (ccs_bytes_to_cpy && !src_is_lmem)
		/*
		 * When CHUNK_SZ is passed, all the pages up to CHUNK_SZ
		 * will be taken for the blt. On Flat-CCS capable platforms
		 * the smem object will have more pages than required for
		 * the main memory, hence limit it to the size required for
		 * the main memory.
		 */
		return min_t(u64, bytes_to_cpy, CHUNK_SZ);
	else
		return CHUNK_SZ;
}

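/*
 * Wind the iterator forward past the main-body payload, so that it_ccs
 * ends up pointing at the trailing CCS pages of the smem scatterlist.
 */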
static void get_ccs_sg_sgt(struct sgt_dma *it, u64 bytes_to_cpy)
{
	u64 len;

	do {
		GEM_BUG_ON(!it->sg || !sg_dma_len(it->sg));
		len = it->max - it->dma;
		if (len > bytes_to_cpy) {
			it->dma += bytes_to_cpy;
			break;
		}

		bytes_to_cpy -= len;

		it->sg = __sg_next(it->sg);
		it->dma = sg_dma_address(it->sg);
		it->max = it->dma + sg_dma_len(it->sg);
	} while (bytes_to_cpy);
}

int
intel_context_migrate_copy(struct intel_context *ce,
			   const struct i915_deps *deps,
			   struct scatterlist *src,
			   unsigned int src_pat_index,
			   bool src_is_lmem,
			   struct scatterlist *dst,
			   unsigned int dst_pat_index,
			   bool dst_is_lmem,
			   struct i915_request **out)
{
	struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst), it_ccs;
	struct drm_i915_private *i915 = ce->engine->i915;
	u64 ccs_bytes_to_cpy = 0, bytes_to_cpy;
	unsigned int ccs_pat_index;
	u32 src_offset, dst_offset;
	u8 src_access, dst_access;
	struct i915_request *rq;
	u64 src_sz, dst_sz;
	bool ccs_is_src, overwrite_ccs;
	int err;

	GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
	GEM_BUG_ON(IS_DGFX(ce->engine->i915) && (!src_is_lmem && !dst_is_lmem));
	*out = NULL;

	GEM_BUG_ON(ce->ring->size < SZ_64K);

	src_sz = scatter_list_length(src);
	bytes_to_cpy = src_sz;

	if (HAS_FLAT_CCS(i915) && src_is_lmem ^ dst_is_lmem) {
		src_access = !src_is_lmem && dst_is_lmem;
		dst_access = !src_access;

		dst_sz = scatter_list_length(dst);
		if (src_is_lmem) {
			it_ccs = it_dst;
			ccs_pat_index = dst_pat_index;
			ccs_is_src = false;
		} else if (dst_is_lmem) {
			bytes_to_cpy = dst_sz;
			it_ccs = it_src;
			ccs_pat_index = src_pat_index;
			ccs_is_src = true;
		}

		/*
		 * When an eviction of the CCS data is needed, the smem
		 * scatterlist will contain the extra pages holding that
		 * CCS data.
		 *
		 * TO-DO: Want to move the size mismatch check to a WARN_ON,
		 * but still we have some requests of smem->lmem with same size.
		 * Need to fix it.
		 */
		ccs_bytes_to_cpy = src_sz != dst_sz ? GET_CCS_BYTES(i915, bytes_to_cpy) : 0;
		if (ccs_bytes_to_cpy)
			get_ccs_sg_sgt(&it_ccs, bytes_to_cpy);
	}

	overwrite_ccs = HAS_FLAT_CCS(i915) && !ccs_bytes_to_cpy && dst_is_lmem;

	src_offset = 0;
	dst_offset = CHUNK_SZ;
	if (HAS_64K_PAGES(ce->engine->i915)) {
		src_offset = 0;
		dst_offset = 0;
		if (src_is_lmem)
			src_offset = CHUNK_SZ;
		if (dst_is_lmem)
			dst_offset = 2 * CHUNK_SZ;
	}
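	/*
	 * i.e. on a HAS_64K_PAGES platform an lmem -> smem eviction blits
	 * from window [CHUNK_SZ, 2 * CHUNK_SZ) into window [0, CHUNK_SZ),
	 * matching the layout established in migrate_vm().
	 */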

	do {
		int len;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto out_ce;
		}

		if (deps) {
			err = i915_request_await_deps(rq, deps);
			if (err)
				goto out_rq;

			if (rq->engine->emit_init_breadcrumb) {
				err = rq->engine->emit_init_breadcrumb(rq);
				if (err)
					goto out_rq;
			}

			deps = NULL;
		}

		/* The PTE updates + copy must not be interrupted. */
		err = emit_no_arbitration(rq);
		if (err)
			goto out_rq;

		src_sz = calculate_chunk_sz(i915, src_is_lmem,
					    bytes_to_cpy, ccs_bytes_to_cpy);

		len = emit_pte(rq, &it_src, src_pat_index, src_is_lmem,
			       src_offset, src_sz);
		if (!len) {
			err = -EINVAL;
			goto out_rq;
		}
		if (len < 0) {
			err = len;
			goto out_rq;
		}

		err = emit_pte(rq, &it_dst, dst_pat_index, dst_is_lmem,
			       dst_offset, len);
		if (err < 0)
			goto out_rq;
		if (err < len) {
			err = -EINVAL;
			goto out_rq;
		}

		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
		if (err)
			goto out_rq;

		err = emit_copy(rq, dst_offset, src_offset, len);
		if (err)
			goto out_rq;

		bytes_to_cpy -= len;

		if (ccs_bytes_to_cpy) {
			int ccs_sz;

			err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
			if (err)
				goto out_rq;

			ccs_sz = GET_CCS_BYTES(i915, len);
			err = emit_pte(rq, &it_ccs, ccs_pat_index, false,
				       ccs_is_src ? src_offset : dst_offset,
				       ccs_sz);
			if (err < 0)
				goto out_rq;
			if (err < ccs_sz) {
				err = -EINVAL;
				goto out_rq;
			}

			err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
			if (err)
				goto out_rq;

			err = emit_copy_ccs(rq, dst_offset, dst_access,
					    src_offset, src_access, len);
			if (err)
				goto out_rq;

			err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
			if (err)
				goto out_rq;
			ccs_bytes_to_cpy -= ccs_sz;
		} else if (overwrite_ccs) {
			err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
			if (err)
				goto out_rq;

			if (src_is_lmem) {
				/*
				 * If the src is already in lmem, then we must
				 * be doing an lmem -> lmem transfer, and so
				 * should be safe to directly copy the CCS
				 * state. In this case we have either
				 * initialised the CCS aux state when first
				 * clearing the pages (since it is already
				 * allocated in lmem), or the user has
				 * potentially populated it, in which case we
				 * need to copy the CCS state as-is.
				 */
				err = emit_copy_ccs(rq,
						    dst_offset, INDIRECT_ACCESS,
						    src_offset, INDIRECT_ACCESS,
						    len);
			} else {
				/*
				 * While we can't always restore/manage the CCS
				 * state, we still need to ensure we don't leak
				 * the CCS state from the previous user, so make
				 * sure we overwrite it with something.
				 */
				err = emit_copy_ccs(rq,
						    dst_offset, INDIRECT_ACCESS,
						    dst_offset, DIRECT_ACCESS,
						    len);
			}

			if (err)
				goto out_rq;

			err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
			if (err)
				goto out_rq;
		}

		/* Arbitration is re-enabled between requests. */
out_rq:
		if (*out)
			i915_request_put(*out);
		*out = i915_request_get(rq);
		i915_request_add(rq);

		if (err)
			break;

		if (!bytes_to_cpy && !ccs_bytes_to_cpy) {
			if (src_is_lmem)
				WARN_ON(it_src.sg && sg_dma_len(it_src.sg));
			else
				WARN_ON(it_dst.sg && sg_dma_len(it_dst.sg));
			break;
		}

		if (WARN_ON(!it_src.sg || !sg_dma_len(it_src.sg) ||
			    !it_dst.sg || !sg_dma_len(it_dst.sg) ||
			    (ccs_bytes_to_cpy && (!it_ccs.sg ||
						  !sg_dma_len(it_ccs.sg))))) {
			err = -EINVAL;
			break;
		}

		cond_resched();
	} while (1);

out_ce:
	return err;
}

static int emit_clear(struct i915_request *rq, u32 offset, int size,
		      u32 value, bool is_lmem)
{
	struct drm_i915_private *i915 = rq->i915;
	int mocs = rq->engine->gt->mocs.uc_index << 1;
	const int ver = GRAPHICS_VER(i915);
	int ring_sz;
	u32 *cs;

	GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);

	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 55))
		ring_sz = XY_FAST_COLOR_BLT_DW;
	else if (ver >= 8)
		ring_sz = 8;
	else
		ring_sz = 6;

	cs = intel_ring_begin(rq, ring_sz);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 55)) {
		*cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
			(XY_FAST_COLOR_BLT_DW - 2);
		*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) |
			(PAGE_SIZE - 1);
		*cs++ = 0;
		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
		*cs++ = offset;
		*cs++ = rq->engine->instance;
		*cs++ = !is_lmem << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
		/* BG7 */
		*cs++ = value;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		/* BG11 */
		*cs++ = 0;
		*cs++ = 0;
		/* BG13 */
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
	} else if (ver >= 8) {
		*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
		*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
		*cs++ = 0;
		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
		*cs++ = offset;
		*cs++ = rq->engine->instance;
		*cs++ = value;
		*cs++ = MI_NOOP;
	} else {
		*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
		*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
		*cs++ = 0;
		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
		*cs++ = offset;
		*cs++ = value;
	}

	intel_ring_advance(rq, cs);
	return 0;
}

int
intel_context_migrate_clear(struct intel_context *ce,
			    const struct i915_deps *deps,
			    struct scatterlist *sg,
			    unsigned int pat_index,
			    bool is_lmem,
			    u32 value,
			    struct i915_request **out)
{
	struct drm_i915_private *i915 = ce->engine->i915;
	struct sgt_dma it = sg_sgt(sg);
	struct i915_request *rq;
	u32 offset;
	int err;

	GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
	*out = NULL;

	GEM_BUG_ON(ce->ring->size < SZ_64K);

	offset = 0;
	if (HAS_64K_PAGES(i915) && is_lmem)
		offset = CHUNK_SZ;

	do {
		int len;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto out_ce;
		}

		if (deps) {
			err = i915_request_await_deps(rq, deps);
			if (err)
				goto out_rq;

			if (rq->engine->emit_init_breadcrumb) {
				err = rq->engine->emit_init_breadcrumb(rq);
				if (err)
					goto out_rq;
			}

			deps = NULL;
		}

		/* The PTE updates + clear must not be interrupted. */
		err = emit_no_arbitration(rq);
		if (err)
			goto out_rq;

		len = emit_pte(rq, &it, pat_index, is_lmem, offset, CHUNK_SZ);
		if (len <= 0) {
			err = len;
			goto out_rq;
		}

		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
		if (err)
			goto out_rq;

		err = emit_clear(rq, offset, len, value, is_lmem);
		if (err)
			goto out_rq;

		if (HAS_FLAT_CCS(i915) && is_lmem && !value) {
			/*
			 * Copy the content of memory into the corresponding
			 * CCS surface.
			 */
			err = emit_copy_ccs(rq, offset, INDIRECT_ACCESS, offset,
					    DIRECT_ACCESS, len);
			if (err)
				goto out_rq;
		}

		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);

		/* Arbitration is re-enabled between requests. */
out_rq:
		if (*out)
			i915_request_put(*out);
		*out = i915_request_get(rq);
		i915_request_add(rq);
		if (err || !it.sg || !sg_dma_len(it.sg))
			break;

		cond_resched();
	} while (1);

out_ce:
	return err;
}

int intel_migrate_copy(struct intel_migrate *m,
		       struct i915_gem_ww_ctx *ww,
		       const struct i915_deps *deps,
		       struct scatterlist *src,
		       unsigned int src_pat_index,
		       bool src_is_lmem,
		       struct scatterlist *dst,
		       unsigned int dst_pat_index,
		       bool dst_is_lmem,
		       struct i915_request **out)
{
	struct intel_context *ce;
	int err;

	*out = NULL;
	if (!m->context)
		return -ENODEV;

	ce = intel_migrate_create_context(m);
	if (IS_ERR(ce))
		ce = intel_context_get(m->context);
	GEM_BUG_ON(IS_ERR(ce));

	err = intel_context_pin_ww(ce, ww);
	if (err)
		goto out;

	err = intel_context_migrate_copy(ce, deps,
					 src, src_pat_index, src_is_lmem,
					 dst, dst_pat_index, dst_is_lmem,
					 out);

	intel_context_unpin(ce);
out:
	intel_context_put(ce);
	return err;
}

int
intel_migrate_clear(struct intel_migrate *m,
		    struct i915_gem_ww_ctx *ww,
		    const struct i915_deps *deps,
		    struct scatterlist *sg,
		    unsigned int pat_index,
		    bool is_lmem,
		    u32 value,
		    struct i915_request **out)
{
	struct intel_context *ce;
	int err;

	*out = NULL;
	if (!m->context)
		return -ENODEV;

	ce = intel_migrate_create_context(m);
	if (IS_ERR(ce))
		ce = intel_context_get(m->context);
	GEM_BUG_ON(IS_ERR(ce));

	err = intel_context_pin_ww(ce, ww);
	if (err)
		goto out;

	err = intel_context_migrate_clear(ce, deps, sg, pat_index,
					  is_lmem, value, out);

	intel_context_unpin(ce);
out:
	intel_context_put(ce);
	return err;
}

void intel_migrate_fini(struct intel_migrate *m)
{
	struct intel_context *ce;

	ce = fetch_and_zero(&m->context);
	if (!ce)
		return;

	intel_engine_destroy_pinned_context(ce);
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_migrate.c"
#endif