gen7_renderclear.c source code [linux/drivers/gpu/drm/i915/gt/gen7_renderclear.c]

1	// SPDX-License-Identifier: MIT
2	/*
3	* Copyright © 2019 Intel Corporation
4	*/
5
6	#include "gen7_renderclear.h"
7	#include "i915_drv.h"
8	#include "intel_gpu_commands.h"
9	#include "intel_gt_regs.h"
10
/* Value written as the final inline-data dword of each MEDIA_OBJECT packet. */
#define GT3_INLINE_DATA_DELAYS 0x1E00

/*
 * Consistency check run after filling an allocation: trips GEM_BUG_ON
 * unless the write cursor CS has reached the chunk's recorded end pointer.
 */
#define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS))
13
14	struct cb_kernel {
15	const void *data;
16	u32 size;
17	};
18
19	#define CB_KERNEL(name) { .data = (name), .size = sizeof(name) }
20
21	#include "ivb_clear_kernel.c"
22	static const struct cb_kernel cb_kernel_ivb = CB_KERNEL(ivb_clear_kernel);
23
24	#include "hsw_clear_kernel.c"
25	static const struct cb_kernel cb_kernel_hsw = CB_KERNEL(hsw_clear_kernel);
26
27	struct batch_chunk {
28	struct i915_vma *vma;
29	u32 offset;
30	u32 *start;
31	u32 *end;
32	u32 max_items;
33	};
34
35	struct batch_vals {
36	u32 max_threads;
37	u32 state_start;
38	u32 surface_start;
39	u32 surface_height;
40	u32 surface_width;
41	u32 size;
42	};
43
44	static int num_primitives(const struct batch_vals *bv)
45	{
46	/*
47	* We need to saturate the GPU with work in order to dispatch
48	* a shader on every HW thread, and clear the thread-local registers.
49	* In short, we have to dispatch work faster than the shaders can
50	* run in order to fill the EU and occupy each HW thread.
51	*/
52	return bv->max_threads;
53	}
54
55	static void
56	batch_get_defaults(struct drm_i915_private i915, struct* batch_vals *bv)
57	{
58	if (IS_HASWELL(i915)) {
59	switch (INTEL_INFO(i915)->gt) {
60	default:
61	case `1`:
62	bv->max_threads = `70`;
63	break;
64	case `2`:
65	bv->max_threads = `140`;
66	break;
67	case `3`:
68	bv->max_threads = `280`;
69	break;
70	}
71	bv->surface_height = `16` * `16`;
72	bv->surface_width = `32` * `2` * `16`;
73	} else {
74	switch (INTEL_INFO(i915)->gt) {
75	default:
76	case `1`: / including vlv /
77	bv->max_threads = `36`;
78	break;
79	case `2`:
80	bv->max_threads = `128`;
81	break;
82	}
83	bv->surface_height = `16` * `8`;
84	bv->surface_width = `32` * `16`;
85	}
86	bv->state_start = round_up(SZ_1K + num_primitives(bv) * `64`, SZ_4K);
87	bv->surface_start = bv->state_start + SZ_4K;
88	bv->size = bv->surface_start + bv->surface_height * bv->surface_width;
89	}
90
91	static void batch_init(struct batch_chunk *bc,
92	struct i915_vma *vma,
93	u32 *start, u32 offset, u32 max_bytes)
94	{
95	bc->vma = vma;
96	bc->offset = offset;
97	bc->start = start + bc->offset / sizeof(*bc->start);
98	bc->end = bc->start;
99	bc->max_items = max_bytes / sizeof(*bc->start);
100	}
101
102	static u32 batch_offset(const struct batch_chunk bc, u32 cs)
103	{
104	return (cs - bc->start) * sizeof(*bc->start) + bc->offset;
105	}
106
107	static u32 batch_addr(const struct batch_chunk *bc)
108	{
109	return i915_vma_offset(vma: bc->vma);
110	}
111
112	static void batch_add(struct batch_chunk bc, const* u32 d)
113	{
114	GEM_BUG_ON((bc->end - bc->start) >= bc->max_items);
115	*bc->end++ = d;
116	}
117
118	static u32 batch_alloc_items(struct* batch_chunk *bc, u32 align, u32 items)
119	{
120	u32 *map;
121
122	if (align) {
123	u32 *end = PTR_ALIGN(bc->end, align);
124
125	memset32(s: bc->end, v: `0`, n: end - bc->end);
126	bc->end = end;
127	}
128
129	map = bc->end;
130	bc->end += items;
131
132	return map;
133	}
134
135	static u32 batch_alloc_bytes(struct* batch_chunk *bc, u32 align, u32 bytes)
136	{
137	GEM_BUG_ON(!IS_ALIGNED(bytes, sizeof(*bc->start)));
138	return batch_alloc_items(bc, align, items: bytes / sizeof(*bc->start));
139	}
140
141	static u32
142	gen7_fill_surface_state(struct batch_chunk *state,
143	const u32 dst_offset,
144	const struct batch_vals *bv)
145	{
146	u32 surface_h = bv->surface_height;
147	u32 surface_w = bv->surface_width;
148	u32 *cs = batch_alloc_items(bc: state, align: `32`, items: `8`);
149	u32 offset = batch_offset(bc: state, cs);
150
151	#define SURFACE_2D 1
152	#define SURFACEFORMAT_B8G8R8A8_UNORM 0x0C0
153	#define RENDER_CACHE_READ_WRITE 1
154
155	*cs++ = SURFACE_2D << `29` \|
156	(SURFACEFORMAT_B8G8R8A8_UNORM << `18`) \|
157	(RENDER_CACHE_READ_WRITE << `8`);
158
159	*cs++ = batch_addr(bc: state) + dst_offset;
160
161	*cs++ = ((surface_h / `4` - `1`) << `16`) \| (surface_w / `4` - `1`);
162	*cs++ = surface_w;
163	*cs++ = `0`;
164	*cs++ = `0`;
165	*cs++ = `0`;
166	#define SHADER_CHANNELS(r, g, b, a) \
167	(((r) << 25) \| ((g) << 22) \| ((b) << 19) \| ((a) << 16))
168	*cs++ = SHADER_CHANNELS(`4`, `5`, `6`, `7`);
169	batch_advance(state, cs);
170
171	return offset;
172	}
173
174	static u32
175	gen7_fill_binding_table(struct batch_chunk *state,
176	const struct batch_vals *bv)
177	{
178	u32 surface_start =
179	gen7_fill_surface_state(state, dst_offset: bv->surface_start, bv);
180	u32 *cs = batch_alloc_items(bc: state, align: `32`, items: `8`);
181	u32 offset = batch_offset(bc: state, cs);
182
183	*cs++ = surface_start - state->offset;
184	*cs++ = `0`;
185	*cs++ = `0`;
186	*cs++ = `0`;
187	*cs++ = `0`;
188	*cs++ = `0`;
189	*cs++ = `0`;
190	*cs++ = `0`;
191	batch_advance(state, cs);
192
193	return offset;
194	}
195
196	static u32
197	gen7_fill_kernel_data(struct batch_chunk *state,
198	const u32 *data,
199	const u32 size)
200	{
201	return batch_offset(bc: state,
202	memcpy(batch_alloc_bytes(state, `64`, size),
203	data, size));
204	}
205
206	static u32
207	gen7_fill_interface_descriptor(struct batch_chunk *state,
208	const struct batch_vals *bv,
209	const struct cb_kernel *kernel,
210	unsigned int count)
211	{
212	u32 kernel_offset =
213	gen7_fill_kernel_data(state, data: kernel->data, size: kernel->size);
214	u32 binding_table = gen7_fill_binding_table(state, bv);
215	u32 cs = batch_alloc_items(bc: state, align: `32`, items: `8` count);
216	u32 offset = batch_offset(bc: state, cs);
217
218	*cs++ = kernel_offset;
219	*cs++ = (`1` << `7`) \| (`1` << `13`);
220	*cs++ = `0`;
221	*cs++ = (binding_table - state->offset) \| `1`;
222	*cs++ = `0`;
223	*cs++ = `0`;
224	*cs++ = `0`;
225	*cs++ = `0`;
226
227	/ 1 - 63dummy idds /
228	memset32(s: cs, v: `0x00`, n: (count - `1`) * `8`);
229	batch_advance(state, cs + (count - `1`) * `8`);
230
231	return offset;
232	}
233
234	static void
235	gen7_emit_state_base_address(struct batch_chunk *batch,
236	u32 surface_state_base)
237	{
238	u32 *cs = batch_alloc_items(bc: batch, align: `0`, items: `10`);
239
240	*cs++ = STATE_BASE_ADDRESS \| (`10` - `2`);
241	/ general /
242	*cs++ = batch_addr(bc: batch) \| BASE_ADDRESS_MODIFY;
243	/ surface /
244	*cs++ = (batch_addr(bc: batch) + surface_state_base) \| BASE_ADDRESS_MODIFY;
245	/ dynamic /
246	*cs++ = batch_addr(bc: batch) \| BASE_ADDRESS_MODIFY;
247	/ indirect /
248	*cs++ = batch_addr(bc: batch) \| BASE_ADDRESS_MODIFY;
249	/ instruction /
250	*cs++ = batch_addr(bc: batch) \| BASE_ADDRESS_MODIFY;
251
252	/ general/dynamic/indirect/instruction access Bound /
253	*cs++ = `0`;
254	*cs++ = BASE_ADDRESS_MODIFY;
255	*cs++ = `0`;
256	*cs++ = BASE_ADDRESS_MODIFY;
257	batch_advance(batch, cs);
258	}
259
260	static void
261	gen7_emit_vfe_state(struct batch_chunk *batch,
262	const struct batch_vals *bv,
263	u32 urb_size, u32 curbe_size,
264	u32 mode)
265	{
266	u32 threads = bv->max_threads - `1`;
267	u32 *cs = batch_alloc_items(bc: batch, align: `32`, items: `8`);
268
269	*cs++ = MEDIA_VFE_STATE \| (`8` - `2`);
270
271	/ scratch buffer /
272	*cs++ = `0`;
273
274	/ number of threads & urb entries for GPGPU vs Media Mode /
275	*cs++ = threads << `16` \| `1` << `8` \| mode << `2`;
276
277	*cs++ = `0`;
278
279	/ urb entry size & curbe size in 256 bits unit /
280	*cs++ = urb_size << `16` \| curbe_size;
281
282	/ scoreboard /
283	*cs++ = `0`;
284	*cs++ = `0`;
285	*cs++ = `0`;
286	batch_advance(batch, cs);
287	}
288
289	static void
290	gen7_emit_interface_descriptor_load(struct batch_chunk *batch,
291	const u32 interface_descriptor,
292	unsigned int count)
293	{
294	u32 *cs = batch_alloc_items(bc: batch, align: `8`, items: `4`);
295
296	*cs++ = MEDIA_INTERFACE_DESCRIPTOR_LOAD \| (`4` - `2`);
297	*cs++ = `0`;
298	cs++ = count `8` * sizeof(*cs);
299
300	/*
301	* interface descriptor address - it is relative to the dynamics base
302	* address
303	*/
304	*cs++ = interface_descriptor;
305	batch_advance(batch, cs);
306	}
307
308	static void
309	gen7_emit_media_object(struct batch_chunk *batch,
310	unsigned int media_object_index)
311	{
312	unsigned int x_offset = (media_object_index % `16`) * `64`;
313	unsigned int y_offset = (media_object_index / `16`) * `16`;
314	unsigned int pkt = `6` + `3`;
315	u32 *cs;
316
317	cs = batch_alloc_items(bc: batch, align: `8`, items: pkt);
318
319	*cs++ = MEDIA_OBJECT \| (pkt - `2`);
320
321	/ interface descriptor offset /
322	*cs++ = `0`;
323
324	/ without indirect data /
325	*cs++ = `0`;
326	*cs++ = `0`;
327
328	/ scoreboard /
329	*cs++ = `0`;
330	*cs++ = `0`;
331
332	/ inline /
333	*cs++ = y_offset << `16` \| x_offset;
334	*cs++ = `0`;
335	*cs++ = GT3_INLINE_DATA_DELAYS;
336
337	batch_advance(batch, cs);
338	}
339
340	static void gen7_emit_pipeline_flush(struct batch_chunk *batch)
341	{
342	u32 *cs = batch_alloc_items(bc: batch, align: `0`, items: `4`);
343
344	*cs++ = GFX_OP_PIPE_CONTROL(`4`);
345	*cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH \|
346	PIPE_CONTROL_DEPTH_CACHE_FLUSH \|
347	PIPE_CONTROL_DC_FLUSH_ENABLE \|
348	PIPE_CONTROL_CS_STALL;
349	*cs++ = `0`;
350	*cs++ = `0`;
351
352	batch_advance(batch, cs);
353	}
354
355	static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch)
356	{
357	u32 *cs = batch_alloc_items(bc: batch, align: `0`, items: `10`);
358
359	/ ivb: Stall before STATE_CACHE_INVALIDATE /
360	*cs++ = GFX_OP_PIPE_CONTROL(`5`);
361	*cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD \|
362	PIPE_CONTROL_CS_STALL;
363	*cs++ = `0`;
364	*cs++ = `0`;
365	*cs++ = `0`;
366
367	*cs++ = GFX_OP_PIPE_CONTROL(`5`);
368	*cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE;
369	*cs++ = `0`;
370	*cs++ = `0`;
371	*cs++ = `0`;
372
373	batch_advance(batch, cs);
374	}
375
376	static void emit_batch(struct i915_vma * const vma,
377	u32 *start,
378	const struct batch_vals *bv)
379	{
380	struct drm_i915_private *i915 = vma->vm->i915;
381	const unsigned int desc_count = `1`;
382	const unsigned int urb_size = `1`;
383	struct batch_chunk cmds, state;
384	u32 descriptors;
385	unsigned int i;
386
387	batch_init(bc: &cmds, vma, start, offset: `0`, max_bytes: bv->state_start);
388	batch_init(bc: &state, vma, start, offset: bv->state_start, SZ_4K);
389
390	descriptors = gen7_fill_interface_descriptor(state: &state, bv,
391	IS_HASWELL(i915) ?
392	&cb_kernel_hsw :
393	&cb_kernel_ivb,
394	count: desc_count);
395
396	/ Reset inherited context registers /
397	gen7_emit_pipeline_flush(batch: &cmds);
398	gen7_emit_pipeline_invalidate(batch: &cmds);
399	batch_add(bc: &cmds, MI_LOAD_REGISTER_IMM(`2`));
400	batch_add(bc: &cmds, i915_mmio_reg_offset(CACHE_MODE_0_GEN7));
401	batch_add(bc: &cmds, d: `0xffff0000` \|
402	(((IS_IVYBRIDGE(i915) && INTEL_INFO(i915)->gt == `1`) \|\|
403	IS_VALLEYVIEW(i915)) ?
404	HIZ_RAW_STALL_OPT_DISABLE :
405	`0`));
406	batch_add(bc: &cmds, i915_mmio_reg_offset(CACHE_MODE_1));
407	batch_add(bc: &cmds, d: `0xffff0000` \| PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
408	gen7_emit_pipeline_invalidate(batch: &cmds);
409	gen7_emit_pipeline_flush(batch: &cmds);
410
411	/ Switch to the media pipeline and our base address /
412	gen7_emit_pipeline_invalidate(batch: &cmds);
413	batch_add(bc: &cmds, PIPELINE_SELECT \| PIPELINE_SELECT_MEDIA);
414	batch_add(bc: &cmds, MI_NOOP);
415	gen7_emit_pipeline_invalidate(batch: &cmds);
416
417	gen7_emit_pipeline_flush(batch: &cmds);
418	gen7_emit_state_base_address(batch: &cmds, surface_state_base: descriptors);
419	gen7_emit_pipeline_invalidate(batch: &cmds);
420
421	/ Set the clear-residual kernel state /
422	gen7_emit_vfe_state(batch: &cmds, bv, urb_size: urb_size - `1`, curbe_size: `0`, mode: `0`);
423	gen7_emit_interface_descriptor_load(batch: &cmds, interface_descriptor: descriptors, count: desc_count);
424
425	/ Execute the kernel on all HW threads /
426	for (i = `0`; i < num_primitives(bv); i++)
427	gen7_emit_media_object(batch: &cmds, media_object_index: i);
428
429	batch_add(bc: &cmds, MI_BATCH_BUFFER_END);
430	}
431
432	int gen7_setup_clear_gpr_bb(struct intel_engine_cs * const engine,
433	struct i915_vma * const vma)
434	{
435	struct batch_vals bv;
436	u32 *batch;
437
438	batch_get_defaults(i915: engine->i915, bv: &bv);
439	if (!vma)
440	return bv.size;
441
442	GEM_BUG_ON(vma->obj->base.size < bv.size);
443
444	batch = i915_gem_object_pin_map(obj: vma->obj, type: I915_MAP_WC);
445	if (IS_ERR(ptr: batch))
446	return PTR_ERR(ptr: batch);
447
448	emit_batch(vma, memset(batch, `0`, bv.size), bv: &bv);
449
450	i915_gem_object_flush_map(obj: vma->obj);
451	__i915_gem_object_release_map(obj: vma->obj);
452
453	return `0`;
454	}
455

source code of linux/drivers/gpu/drm/i915/gt/gen7_renderclear.c