// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include <drm/drm_print.h>

#include "gen8_engine_cs.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_lrc.h"
#include "intel_ring.h"

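/*
 * Emit a PIPE_CONTROL based flush/invalidate sequence for the gen8 render
 * engine. EMIT_FLUSH writes back the render caches; EMIT_INVALIDATE drops
 * the read-only caches and TLBs, with a post-sync QW write into the
 * per-context scratch. Two workarounds apply: gen9 needs a NULL
 * PIPE_CONTROL before VF_CACHE_INVALIDATE, and early KBL steppings bracket
 * the main flush with DC-flush/CS-stall PIPE_CONTROLs (WaForGAMHang:kbl).
 */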
int gen8_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	bool vf_flush_wa = false, dc_flush_wa = false;
	u32 *cs, flags = 0;
	int len;

	flags |= PIPE_CONTROL_CS_STALL;

	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}

	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_STORE_DATA_INDEX;

		/*
		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
		 * pipe control.
		 */
		if (GRAPHICS_VER(rq->i915) == 9)
			vf_flush_wa = true;

		/* WaForGAMHang:kbl */
		if (IS_KABYLAKE(rq->i915) && IS_GRAPHICS_STEP(rq->i915, 0, STEP_C0))
			dc_flush_wa = true;
	}

	len = 6;

	if (vf_flush_wa)
		len += 6;

	if (dc_flush_wa)
		len += 12;

	cs = intel_ring_begin(rq, len);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (vf_flush_wa)
		cs = gen8_emit_pipe_control(cs, 0, 0);

	if (dc_flush_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
					    0);

	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

	if (dc_flush_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);

	intel_ring_advance(rq, cs);

	return 0;
}

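/*
 * Emit an MI_FLUSH_DW based flush for the non-render (xcs) engines. The
 * post-sync dword write doubles as the command barrier described below,
 * and the video decode engines additionally invalidate their BSD caches
 * on EMIT_INVALIDATE.
 */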
int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW + 1;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_INVALIDATE_TLB;
		if (rq->engine->class == VIDEO_DECODE_CLASS)
			cmd |= MI_INVALIDATE_BSD;
	}

	*cs++ = cmd;
	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
	*cs++ = 0; /* upper addr */
	*cs++ = 0; /* value */
	intel_ring_advance(rq, cs);

	return 0;
}

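/*
 * On gen11 the flush and the invalidation are emitted as two separate
 * PIPE_CONTROLs: first write back the dirty caches (now including the
 * tile cache), then invalidate the read-only caches and TLBs.
 */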
int gen11_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	if (mode & EMIT_FLUSH) {
		u32 *cs;
		u32 flags = 0;

		flags |= PIPE_CONTROL_CS_STALL;

		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_STORE_DATA_INDEX;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	if (mode & EMIT_INVALIDATE) {
		u32 *cs;
		u32 flags = 0;

		flags |= PIPE_CONTROL_CS_STALL;

		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_STORE_DATA_INDEX;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	return 0;
}

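/*
 * On gen12+, MI_ARB_CHECK doubles as the pre-parser control: our reading
 * of the encoding is that bit 8 acts as a write enable for bit 0, which
 * carries the new pre-fetch-disable state. A typical emission brackets a
 * TLB invalidation so the pre-parser cannot run ahead of it, e.g.:
 *
 *	*cs++ = preparser_disable(true);
 *	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
 *	*cs++ = preparser_disable(false);
 */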
static u32 preparser_disable(bool state)
{
	return MI_ARB_CHECK | 1 << 8 | state;
}

static i915_reg_t gen12_get_aux_inv_reg(struct intel_engine_cs *engine)
{
	switch (engine->id) {
	case RCS0:
		return GEN12_CCS_AUX_INV;
	case BCS0:
		return GEN12_BCS0_AUX_INV;
	case VCS0:
		return GEN12_VD0_AUX_INV;
	case VCS2:
		return GEN12_VD2_AUX_INV;
	case VECS0:
		return GEN12_VE0_AUX_INV;
	case CCS0:
		return GEN12_CCS0_AUX_INV;
	default:
		return INVALID_MMIO_REG;
	}
}

static bool gen12_needs_ccs_aux_inv(struct intel_engine_cs *engine)
{
	i915_reg_t reg = gen12_get_aux_inv_reg(engine);

	/*
	 * So far the platforms supported by i915 that have flat CCS do not
	 * require AUX invalidation. Also check whether the engine needs it
	 * at all, i.e. has an AUX invalidation register.
	 */
	return i915_mmio_reg_valid(reg) && !HAS_FLAT_CCS(engine->i915);
}

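/*
 * Invalidate the AUX (compression control) table for the engine: an LRI
 * writes AUX_INV into the engine's invalidation register, then a
 * register-poll semaphore wait spins until the register reads back as 0,
 * which we take to signal that the hardware has completed the
 * invalidation.
 */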
u32 *gen12_emit_aux_table_inv(struct intel_engine_cs *engine, u32 *cs)
{
	i915_reg_t inv_reg = gen12_get_aux_inv_reg(engine);
	u32 gsi_offset = engine->gt->uncore->gsi_offset;

	if (!gen12_needs_ccs_aux_inv(engine))
		return cs;

	*cs++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN;
	*cs++ = i915_mmio_reg_offset(inv_reg) + gsi_offset;
	*cs++ = AUX_INV;

	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
		MI_SEMAPHORE_REGISTER_POLL |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = i915_mmio_reg_offset(inv_reg) + gsi_offset;
	*cs++ = 0;
	*cs++ = 0;

	return cs;
}

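/*
 * Wa_14016712196: on DG2 and the MTL-era GTs, a real flush must be
 * preceded by a dummy PIPE_CONTROL carrying only a depth-cache flush;
 * everywhere else this is a no-op.
 */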
static int mtl_dummy_pipe_control(struct i915_request *rq)
{
	/* Wa_14016712196 */
	if (IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) ||
	    IS_DG2(rq->i915)) {
		u32 *cs;

		/* dummy PIPE_CONTROL + depth flush */
		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);
		cs = gen12_emit_pipe_control(cs,
					     0,
					     PIPE_CONTROL_DEPTH_CACHE_FLUSH,
					     LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	return 0;
}

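/*
 * Gen12 render/compute flush. Note that the flush PIPE_CONTROL is emitted
 * not only for EMIT_FLUSH but also whenever an AUX CCS invalidation will
 * follow, since the invalidation requires memory traffic to be quiesced
 * first.
 */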
int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	struct intel_engine_cs *engine = rq->engine;

	/*
	 * On Aux CCS platforms the invalidation of the Aux
	 * table requires quiescing memory traffic beforehand.
	 */
	if (mode & EMIT_FLUSH || gen12_needs_ccs_aux_inv(engine)) {
		u32 bit_group_0 = 0;
		u32 bit_group_1 = 0;
		int err;
		u32 *cs;

		err = mtl_dummy_pipe_control(rq);
		if (err)
			return err;

		bit_group_0 |= PIPE_CONTROL0_HDC_PIPELINE_FLUSH;

		/*
		 * When required, on MTL and beyond we also need to set the
		 * CCS_FLUSH bit in the pipe control.
		 */
		if (GRAPHICS_VER_FULL(rq->i915) >= IP_VER(12, 70))
			bit_group_0 |= PIPE_CONTROL_CCS_FLUSH;

		/*
		 * The L3 fabric flush needed for AUX CCS invalidation already
		 * happens as part of the pipe-control, so we can omit
		 * PIPE_CONTROL_FLUSH_L3 there. PIPE_CONTROL_FLUSH_L3 also
		 * deals with Protected Memory, which is not needed for AUX
		 * CCS invalidation and leads to unwanted side effects.
		 */
		if ((mode & EMIT_FLUSH) &&
		    GRAPHICS_VER_FULL(rq->i915) < IP_VER(12, 70))
			bit_group_1 |= PIPE_CONTROL_FLUSH_L3;

		bit_group_1 |= PIPE_CONTROL_TILE_CACHE_FLUSH;
		bit_group_1 |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		bit_group_1 |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/* Wa_1409600907:tgl,adl-p */
		bit_group_1 |= PIPE_CONTROL_DEPTH_STALL;
		bit_group_1 |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		bit_group_1 |= PIPE_CONTROL_FLUSH_ENABLE;

		bit_group_1 |= PIPE_CONTROL_STORE_DATA_INDEX;
		bit_group_1 |= PIPE_CONTROL_QW_WRITE;

		bit_group_1 |= PIPE_CONTROL_CS_STALL;

		if (!HAS_3D_PIPELINE(engine->i915))
			bit_group_1 &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
		else if (engine->class == COMPUTE_CLASS)
			bit_group_1 &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen12_emit_pipe_control(cs, bit_group_0, bit_group_1,
					     LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	if (mode & EMIT_INVALIDATE) {
		u32 flags = 0;
		u32 *cs, count;
		int err;

		err = mtl_dummy_pipe_control(rq);
		if (err)
			return err;

		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;

		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
		flags |= PIPE_CONTROL_QW_WRITE;

		flags |= PIPE_CONTROL_CS_STALL;

		if (!HAS_3D_PIPELINE(engine->i915))
			flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
		else if (engine->class == COMPUTE_CLASS)
			flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

		count = 8;
		if (gen12_needs_ccs_aux_inv(rq->engine))
			count += 8;

		cs = intel_ring_begin(rq, count);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		/*
		 * Prevent the pre-parser from skipping past the TLB
		 * invalidate and loading a stale page for the batch
		 * buffer / request payload.
		 */
		*cs++ = preparser_disable(true);

		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

		cs = gen12_emit_aux_table_inv(engine, cs);

		*cs++ = preparser_disable(false);
		intel_ring_advance(rq, cs);
	}

	return 0;
}

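/*
 * Gen12 MI_FLUSH_DW based flush for the non-render engines. Note the
 * double duty of the local 'cmd' below: it first holds the dword count
 * passed to intel_ring_begin() and is then reused for the MI_FLUSH_DW
 * command word itself.
 */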
int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	u32 cmd = 4;
	u32 *cs;

	if (mode & EMIT_INVALIDATE) {
		cmd += 2;

		if (gen12_needs_ccs_aux_inv(rq->engine))
			cmd += 8;
	}

	cs = intel_ring_begin(rq, cmd);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (mode & EMIT_INVALIDATE)
		*cs++ = preparser_disable(true);

	cmd = MI_FLUSH_DW + 1;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_INVALIDATE_TLB;
		if (rq->engine->class == VIDEO_DECODE_CLASS)
			cmd |= MI_INVALIDATE_BSD;

		if (gen12_needs_ccs_aux_inv(rq->engine) &&
		    rq->engine->class == COPY_ENGINE_CLASS)
			cmd |= MI_FLUSH_DW_CCS;
	}

	*cs++ = cmd;
	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
	*cs++ = 0; /* upper addr */
	*cs++ = 0; /* value */

	cs = gen12_emit_aux_table_inv(rq->engine, cs);

	if (mode & EMIT_INVALIDATE)
		*cs++ = preparser_disable(false);

	intel_ring_advance(rq, cs);

	return 0;
}

static u32 preempt_address(struct intel_engine_cs *engine)
{
	return (i915_ggtt_offset(engine->status_page.vma) +
		I915_GEM_HWS_PREEMPT_ADDR);
}

static u32 hwsp_offset(const struct i915_request *rq)
{
	const struct intel_timeline *tl;

	/* Before the request is executed, the timeline is fixed */
	tl = rcu_dereference_protected(rq->timeline,
				       !i915_request_signaled(rq));

	/* See the comment in i915_request_active_seqno(). */
	return page_mask_bits(tl->hwsp_offset) + offset_in_page(rq->hwsp_seqno);
}

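/*
 * Emit the initial breadcrumb at the head of the request: a GGTT write of
 * seqno - 1 into the timeline HWSP marks the request as started, and the
 * MI_ARB_CHECK that follows opens the last preemption point before the
 * user payload (see the comment below for the consequences).
 */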
int gen8_emit_init_breadcrumb(struct i915_request *rq)
{
	u32 *cs;

	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
		return 0;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = hwsp_offset(rq);
	*cs++ = 0;
	*cs++ = rq->fence.seqno - 1;

	/*
	 * Check if we have been preempted before we even get started.
	 *
	 * After this point i915_request_started() reports true, even if
	 * we get preempted and so are no longer running.
	 *
	 * i915_request_started() is used during preemption processing
	 * to decide if the request is currently inside the user payload
	 * or spinning on a kernel semaphore (or earlier). For no-preemption
	 * requests, we do allow preemption on the semaphore before the user
	 * payload, but do not allow preemption once the request is started.
	 *
	 * i915_request_started() is similarly used during GPU hangs to
	 * determine if the user's payload was guilty, and if so, the
	 * request is banned. Before the request is started, it is assumed
	 * to be unharmed and an innocent victim of another's hang.
	 */
	*cs++ = MI_NOOP;
	*cs++ = MI_ARB_CHECK;

	intel_ring_advance(rq, cs);

	/* Record the updated position of the request's payload */
	rq->infix = intel_ring_offset(rq, cs);

	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);

	return 0;
}

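/*
 * XeHP batch buffer start. Our reading of the sequence below: restore
 * RING_PREDICATE_RESULT from the per-context indirect workaround page
 * before jumping to the batch, then jump into a small fixup batch that
 * clears any stray MI_SET_PREDICATE a batch may have left enabled, as
 * that would prevent us from executing the ring (DG2 predication
 * workaround).
 */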
static int __xehp_emit_bb_start(struct i915_request *rq,
				u64 offset, u32 len,
				const unsigned int flags,
				u32 arb)
{
	struct intel_context *ce = rq->context;
	u32 wa_offset = lrc_indirect_bb(ce);
	u32 *cs;

	GEM_BUG_ON(!ce->wa_bb_page);

	cs = intel_ring_begin(rq, 12);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_ARB_ON_OFF | arb;

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(RING_PREDICATE_RESULT(0));
	*cs++ = wa_offset + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;

	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);

	/* Fixup stray MI_SET_PREDICATE as it prevents us executing the ring */
	*cs++ = MI_BATCH_BUFFER_START_GEN8;
	*cs++ = wa_offset + DG2_PREDICATE_RESULT_BB;
	*cs++ = 0;

	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	intel_ring_advance(rq, cs);

	return 0;
}

int xehp_emit_bb_start_noarb(struct i915_request *rq,
			     u64 offset, u32 len,
			     const unsigned int flags)
{
	return __xehp_emit_bb_start(rq, offset, len, flags, MI_ARB_DISABLE);
}

int xehp_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       const unsigned int flags)
{
	return __xehp_emit_bb_start(rq, offset, len, flags, MI_ARB_ENABLE);
}

int gen8_emit_bb_start_noarb(struct i915_request *rq,
			     u64 offset, u32 len,
			     const unsigned int flags)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * WaDisableCtxRestoreArbitration:bdw,chv
	 *
	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
	 * particular on all the gens that do not need the w/a at all!); if
	 * we took care to make sure that on every switch into this context
	 * (both ordinary and for preemption) arbitration was enabled, we
	 * would be fine. However, for gen8 there is another w/a that
	 * requires us to not preempt inside GPGPU execution, so we keep
	 * arbitration disabled for gen8 batches. Arbitration will be
	 * re-enabled before we close the request
	 * (engine->emit_fini_breadcrumb).
	 */
	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* FIXME(BDW+): Address space and security selectors. */
	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);

	intel_ring_advance(rq, cs);

	return 0;
}

int gen8_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       const unsigned int flags)
{
	u32 *cs;

	if (unlikely(i915_request_has_nopreempt(rq)))
		return gen8_emit_bb_start_noarb(rq, offset, len, flags);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);

	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

static void assert_request_valid(struct i915_request *rq)
{
	struct intel_ring *ring __maybe_unused = rq->ring;

	/* Can we unwind this request without appearing to go forwards? */
	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
}

/*
 * Reserve space for 2 NOOPs at the end of each request to be
 * used as a workaround for not being allowed to do lite
 * restore with HEAD==TAIL (WaIdleLiteRestore).
 */
static u32 *gen8_emit_wa_tail(struct i915_request *rq, u32 *cs)
{
	/* Ensure there's always at least one preemption point per-request. */
	*cs++ = MI_ARB_CHECK;
	*cs++ = MI_NOOP;
	rq->wa_tail = intel_ring_offset(rq, cs);

	/* Check that the entire request is less than half the ring */
	assert_request_valid(rq);

	return cs;
}

static u32 *emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = preempt_address(rq->engine);
	*cs++ = 0;
	*cs++ = MI_NOOP;

	return cs;
}

static __always_inline u32 *
gen8_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_USER_INTERRUPT;

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	if (intel_engine_has_semaphores(rq->engine) &&
	    !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
		cs = emit_preempt_busywait(rq, cs);

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return gen8_emit_wa_tail(rq, cs);
}

static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
{
	return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
}

u32 *gen8_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
}

u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	cs = gen8_emit_pipe_control(cs,
				    PIPE_CONTROL_CS_STALL |
				    PIPE_CONTROL_TLB_INVALIDATE |
				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
				    PIPE_CONTROL_DC_FLUSH_ENABLE,
				    0);

	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
	cs = gen8_emit_ggtt_write_rcs(cs,
				      rq->fence.seqno,
				      hwsp_offset(rq),
				      PIPE_CONTROL_FLUSH_ENABLE |
				      PIPE_CONTROL_CS_STALL);

	return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	cs = gen8_emit_pipe_control(cs,
				    PIPE_CONTROL_CS_STALL |
				    PIPE_CONTROL_TLB_INVALIDATE |
				    PIPE_CONTROL_TILE_CACHE_FLUSH |
				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
				    PIPE_CONTROL_DC_FLUSH_ENABLE,
				    0);

	/* XXX: Look at gen8_emit_fini_breadcrumb_rcs */
	cs = gen8_emit_ggtt_write_rcs(cs,
				      rq->fence.seqno,
				      hwsp_offset(rq),
				      PIPE_CONTROL_FLUSH_ENABLE |
				      PIPE_CONTROL_CS_STALL);

	return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

/*
 * Note that the CS instruction pre-parser will not stall on the breadcrumb
 * flush and will continue pre-fetching the instructions after it before the
 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
 * BB_START/END instructions, so, even though we might pre-fetch the preamble
 * of the next request before the memory has been flushed, we're guaranteed
 * that we won't access the batch itself too early.
 * However, on gen12+ the parser can pre-fetch across the BB_START/END
 * commands, so, if the current request is modifying an instruction in the
 * next request on the same intel_context, we might pre-fetch and then execute
 * the pre-update instruction. To avoid this, the users of self-modifying code
 * should either disable the parser around the code emitting the memory
 * writes, via a new flag added to MI_ARB_CHECK, or emit the writes from a
 * different intel_context. For the in-kernel use-cases we've opted to use a
 * separate context, see reloc_gpu() as an example.
 * All the above applies only to the instructions themselves. Non-inline data
 * used by the instructions is not pre-fetched.
 */

static u32 *gen12_emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = preempt_address(rq->engine);
	*cs++ = 0;
	*cs++ = 0;

	return cs;
}

/* Wa_14014475959:dg2 */
/* Wa_16019325821 */
/* Wa_14019159160 */
#define HOLD_SWITCHOUT_SEMAPHORE_PPHWSP_OFFSET	0x540
static u32 hold_switchout_semaphore_offset(struct i915_request *rq)
{
	return i915_ggtt_offset(rq->context->state) +
		(LRC_PPHWSP_PN * PAGE_SIZE) + HOLD_SWITCHOUT_SEMAPHORE_PPHWSP_OFFSET;
}

/* Wa_14014475959:dg2 */
/* Wa_16019325821 */
/* Wa_14019159160 */
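/*
 * A sketch of how we understand the hold-switchout busywait: the inline
 * atomic MOVE arms a semaphore in the context's PPHWSP by writing 1 to it,
 * and the MI_SEMAPHORE_WAIT then spins until the value reads back as 0
 * again. Clearing the semaphore is left to an external agent (the GuC, as
 * part of the workarounds named above), which thereby controls when the
 * context is allowed to switch out.
 */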
static u32 *hold_switchout_emit_wa_busywait(struct i915_request *rq, u32 *cs)
{
	int i;

	*cs++ = MI_ATOMIC_INLINE | MI_ATOMIC_GLOBAL_GTT | MI_ATOMIC_CS_STALL |
		MI_ATOMIC_MOVE;
	*cs++ = hold_switchout_semaphore_offset(rq);
	*cs++ = 0;
	*cs++ = 1;

	/*
	 * When MI_ATOMIC_INLINE_DATA is set, this command must be 11 DW
	 * + (1 NOP) to align: 4 DWs above + 8 filler DWs here.
	 */
	for (i = 0; i < 8; ++i)
		*cs++ = 0;

	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = hold_switchout_semaphore_offset(rq);
	*cs++ = 0;

	return cs;
}

static __always_inline u32 *
gen12_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_USER_INTERRUPT;

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	if (intel_engine_has_semaphores(rq->engine) &&
	    !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
		cs = gen12_emit_preempt_busywait(rq, cs);

	/* Wa_14014475959:dg2 */
	/* Wa_16019325821 */
	/* Wa_14019159160 */
	if (intel_engine_uses_wa_hold_switchout(rq->engine))
		cs = hold_switchout_emit_wa_busywait(rq, cs);

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return gen8_emit_wa_tail(rq, cs);
}

u32 *gen12_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	/* XXX Stalling flush before seqno write; post-sync not */
	cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
	return gen12_emit_fini_breadcrumb_tail(rq, cs);
}

u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	struct drm_i915_private *i915 = rq->i915;
	struct intel_gt *gt = rq->engine->gt;
	u32 flags = (PIPE_CONTROL_CS_STALL |
		     PIPE_CONTROL_TLB_INVALIDATE |
		     PIPE_CONTROL_TILE_CACHE_FLUSH |
		     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		     PIPE_CONTROL_DC_FLUSH_ENABLE |
		     PIPE_CONTROL_FLUSH_ENABLE);

	if (GRAPHICS_VER_FULL(rq->i915) < IP_VER(12, 70))
		flags |= PIPE_CONTROL_FLUSH_L3;

	/* Wa_14016712196 */
	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915))
		/* dummy PIPE_CONTROL + depth flush */
		cs = gen12_emit_pipe_control(cs, 0,
					     PIPE_CONTROL_DEPTH_CACHE_FLUSH, 0);

	if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 55))
		/* Wa_1409600907 */
		flags |= PIPE_CONTROL_DEPTH_STALL;

	if (!HAS_3D_PIPELINE(rq->i915))
		flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
	else if (rq->engine->class == COMPUTE_CLASS)
		flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

	cs = gen12_emit_pipe_control(cs, PIPE_CONTROL0_HDC_PIPELINE_FLUSH, flags, 0);

	/* XXX: Look at gen8_emit_fini_breadcrumb_rcs */
	cs = gen12_emit_ggtt_write_rcs(cs,
				       rq->fence.seqno,
				       hwsp_offset(rq),
				       0,
				       PIPE_CONTROL_FLUSH_ENABLE |
				       PIPE_CONTROL_CS_STALL);

	return gen12_emit_fini_breadcrumb_tail(rq, cs);
}
