// SPDX-License-Identifier: MIT
/*
 * Copyright © 2008-2015 Intel Corporation
 */

#include <linux/highmem.h>

#include <drm/drm_print.h>

#include "display/intel_display.h"
#include "i915_drv.h"
#include "i915_pvinfo.h"
#include "i915_reg.h"
#include "i915_scatterlist.h"
#include "i915_vgpu.h"
#include "intel_gt_regs.h"
#include "intel_mchbar_regs.h"
/**
 * DOC: fence register handling
 *
 * Important to avoid confusion: "fences" in the i915 driver are not execution
 * fences used to track command completion but hardware detiler objects which
 * wrap a given range of the global GTT. Each platform has only a fairly limited
 * set of these objects.
 *
 * Fences are used to detile GTT memory mappings. They're also connected to the
 * hardware frontbuffer render tracking and hence interact with frontbuffer
 * compression. Furthermore on older platforms fences are required for tiled
 * objects used by the display engine. They can also be used by the render
 * engine - they're required for blitter commands and are optional for render
 * commands. But on gen4+ both display (with the exception of fbc) and rendering
 * have their own tiling state bits and don't need fences.
 *
 * Also note that fences only support X and Y tiling and hence can't be used for
 * the fancier new tiling formats like W, Ys and Yf.
 *
 * Finally note that because fences are such a restricted resource they're
 * dynamically associated with objects. Furthermore fence state is committed to
 * the hardware lazily to avoid unnecessary stalls on gen2/3. Therefore code must
 * explicitly call i915_vma_pin_fence() to synchronize fencing status
 * for cpu access. Also note that some code wants an unfenced view, for those
 * cases the fence can be removed forcefully with i915_vma_revoke_fence().
 *
 * Internally these functions will synchronize with userspace access by removing
 * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed.
 */
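
/*
 * Illustrative usage sketch (not part of the driver): a minimal sequence for
 * fenced CPU access through a GGTT mmap, assuming the caller already holds a
 * runtime PM wakeref and has @vma pinned in the mappable aperture, with the
 * unpin side provided by i915_vma_unpin_fence() from i915_vma.h:
 *
 *	err = i915_vma_pin_fence(vma);
 *	if (err)
 *		return err;
 *	if (vma->fence) {
 *		... access the object through the fenced GTT mapping ...
 *	}
 *	i915_vma_unpin_fence(vma);
 */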

#define pipelined 0

static struct drm_i915_private *fence_to_i915(struct i915_fence_reg *fence)
{
	return fence->ggtt->vm.i915;
}

static struct intel_uncore *fence_to_uncore(struct i915_fence_reg *fence)
{
	return fence->ggtt->vm.gt->uncore;
}

static void i965_write_fence_reg(struct i915_fence_reg *fence)
{
	i915_reg_t fence_reg_lo, fence_reg_hi;
	int fence_pitch_shift;
	u64 val;

	if (GRAPHICS_VER(fence_to_i915(fence)) >= 6) {
		fence_reg_lo = FENCE_REG_GEN6_LO(fence->id);
		fence_reg_hi = FENCE_REG_GEN6_HI(fence->id);
		fence_pitch_shift = GEN6_FENCE_PITCH_SHIFT;
	} else {
		fence_reg_lo = FENCE_REG_965_LO(fence->id);
		fence_reg_hi = FENCE_REG_965_HI(fence->id);
		fence_pitch_shift = I965_FENCE_PITCH_SHIFT;
	}

	val = 0;
	if (fence->tiling) {
		unsigned int stride = fence->stride;

		GEM_BUG_ON(!IS_ALIGNED(stride, 128));

		val = fence->start + fence->size - I965_FENCE_PAGE;
		val <<= 32;
		val |= fence->start;
		val |= (u64)((stride / 128) - 1) << fence_pitch_shift;
		if (fence->tiling == I915_TILING_Y)
			val |= BIT(I965_FENCE_TILING_Y_SHIFT);
		val |= I965_FENCE_REG_VALID;
	}

	if (!pipelined) {
		struct intel_uncore *uncore = fence_to_uncore(fence);

		/*
		 * To w/a incoherency with non-atomic 64-bit register updates,
		 * we split the 64-bit update into two 32-bit writes. In order
		 * for a partial fence not to be evaluated between writes, we
		 * precede the update with write to turn off the fence register,
		 * and only enable the fence as the last step.
		 *
		 * For extra levels of paranoia, we make sure each step lands
		 * before applying the next step.
		 */
		intel_uncore_write_fw(uncore, fence_reg_lo, 0);
		intel_uncore_posting_read_fw(uncore, fence_reg_lo);

		intel_uncore_write_fw(uncore, fence_reg_hi, upper_32_bits(val));
		intel_uncore_write_fw(uncore, fence_reg_lo, lower_32_bits(val));
		intel_uncore_posting_read_fw(uncore, fence_reg_lo);
	}
}
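
/*
 * Worked example (illustrative only, assuming I965_FENCE_PAGE is one 4KiB
 * page): a Y-tiled fence with start = 0x00100000, size = 0x40000 and
 * stride = 4096 yields
 *
 *	end field   = 0x00100000 + 0x40000 - 0x1000 = 0x0013f000 (upper dword)
 *	start field = 0x00100000 (lower dword)
 *	pitch field = (4096 / 128) - 1 = 31, shifted by fence_pitch_shift
 *
 * plus the Y-tiling bit and I965_FENCE_REG_VALID.
 */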

static void i915_write_fence_reg(struct i915_fence_reg *fence)
{
	u32 val;

	val = 0;
	if (fence->tiling) {
		unsigned int stride = fence->stride;
		unsigned int tiling = fence->tiling;
		bool is_y_tiled = tiling == I915_TILING_Y;

		if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence_to_i915(fence)))
			stride /= 128;
		else
			stride /= 512;
		GEM_BUG_ON(!is_power_of_2(stride));

		val = fence->start;
		if (is_y_tiled)
			val |= BIT(I830_FENCE_TILING_Y_SHIFT);
		val |= I915_FENCE_SIZE_BITS(fence->size);
		val |= ilog2(stride) << I830_FENCE_PITCH_SHIFT;

		val |= I830_FENCE_REG_VALID;
	}

	if (!pipelined) {
		struct intel_uncore *uncore = fence_to_uncore(fence);
		i915_reg_t reg = FENCE_REG(fence->id);

		intel_uncore_write_fw(uncore, reg, val);
		intel_uncore_posting_read_fw(uncore, reg);
	}
}

static void i830_write_fence_reg(struct i915_fence_reg *fence)
{
	u32 val;

	val = 0;
	if (fence->tiling) {
		unsigned int stride = fence->stride;

		val = fence->start;
		if (fence->tiling == I915_TILING_Y)
			val |= BIT(I830_FENCE_TILING_Y_SHIFT);
		val |= I830_FENCE_SIZE_BITS(fence->size);
		val |= ilog2(stride / 128) << I830_FENCE_PITCH_SHIFT;
		val |= I830_FENCE_REG_VALID;
	}

	if (!pipelined) {
		struct intel_uncore *uncore = fence_to_uncore(fence);
		i915_reg_t reg = FENCE_REG(fence->id);

		intel_uncore_write_fw(uncore, reg, val);
		intel_uncore_posting_read_fw(uncore, reg);
	}
}

static void fence_write(struct i915_fence_reg *fence)
{
	struct drm_i915_private *i915 = fence_to_i915(fence);

	/*
	 * Previous access through the fence register is marshalled by
	 * the mb() inside the fault handlers (i915_gem_release_mmaps)
	 * and explicitly managed for internal users.
	 */

	if (GRAPHICS_VER(i915) == 2)
		i830_write_fence_reg(fence);
	else if (GRAPHICS_VER(i915) == 3)
		i915_write_fence_reg(fence);
	else
		i965_write_fence_reg(fence);

	/*
	 * Access through the fenced region afterwards is
	 * ordered by the posting reads whilst writing the registers.
	 */
}

static bool gpu_uses_fence_registers(struct i915_fence_reg *fence)
{
	return GRAPHICS_VER(fence_to_i915(fence)) < 4;
}

static int fence_update(struct i915_fence_reg *fence,
			struct i915_vma *vma)
{
	struct i915_ggtt *ggtt = fence->ggtt;
	struct intel_uncore *uncore = fence_to_uncore(fence);
	intel_wakeref_t wakeref;
	struct i915_vma *old;
	int ret;

	fence->tiling = 0;
	if (vma) {
		GEM_BUG_ON(!i915_gem_object_get_stride(vma->obj) ||
			   !i915_gem_object_get_tiling(vma->obj));

		if (!i915_vma_is_map_and_fenceable(vma))
			return -EINVAL;

		if (gpu_uses_fence_registers(fence)) {
			/* implicit 'unfenced' GPU blits */
			ret = i915_vma_sync(vma);
			if (ret)
				return ret;
		}

		GEM_BUG_ON(vma->fence_size > i915_vma_size(vma));
		fence->start = i915_ggtt_offset(vma);
		fence->size = vma->fence_size;
		fence->stride = i915_gem_object_get_stride(vma->obj);
		fence->tiling = i915_gem_object_get_tiling(vma->obj);
	}
	WRITE_ONCE(fence->dirty, false);

	old = xchg(&fence->vma, NULL);
	if (old) {
		/* XXX Ideally we would move the waiting to outside the mutex */
		ret = i915_active_wait(&fence->active);
		if (ret) {
			fence->vma = old;
			return ret;
		}

		i915_vma_flush_writes(old);

		/*
		 * Ensure that all userspace CPU access is completed before
		 * stealing the fence.
		 */
		if (old != vma) {
			GEM_BUG_ON(old->fence != fence);
			i915_vma_revoke_mmap(old);
			old->fence = NULL;
		}

		list_move(&fence->link, &ggtt->fence_list);
	}

	/*
	 * We only need to update the register itself if the device is awake.
	 * If the device is currently powered down, we will defer the write
	 * to the runtime resume, see intel_ggtt_restore_fences().
	 *
	 * This only works for removing the fence register, on acquisition
	 * the caller must hold the rpm wakeref. The fence register must
	 * be cleared before we can use any other fences to ensure that
	 * the new fences do not overlap the elided clears, confusing HW.
	 */
	wakeref = intel_runtime_pm_get_if_in_use(uncore->rpm);
	if (!wakeref) {
		GEM_BUG_ON(vma);
		return 0;
	}

	WRITE_ONCE(fence->vma, vma);
	fence_write(fence);

	if (vma) {
		vma->fence = fence;
		list_move_tail(&fence->link, &ggtt->fence_list);
	}

	intel_runtime_pm_put(uncore->rpm, wakeref);
	return 0;
}

/**
 * i915_vma_revoke_fence - force-remove fence for a VMA
 * @vma: vma to map linearly (not through a fence reg)
 *
 * This function force-removes any fence from the given object, which is useful
 * if the kernel wants to do untiled GTT access.
 */
void i915_vma_revoke_fence(struct i915_vma *vma)
{
	struct i915_fence_reg *fence = vma->fence;
	intel_wakeref_t wakeref;

	lockdep_assert_held(&vma->vm->mutex);
	if (!fence)
		return;

	GEM_BUG_ON(fence->vma != vma);
	i915_active_wait(&fence->active);
	GEM_BUG_ON(!i915_active_is_idle(&fence->active));
	GEM_BUG_ON(atomic_read(&fence->pin_count));

	fence->tiling = 0;
	WRITE_ONCE(fence->vma, NULL);
	vma->fence = NULL;

	/*
	 * Skip the write to HW if and only if the device is currently
	 * suspended.
	 *
	 * If the driver does not currently hold a wakeref (if_in_use == 0),
	 * the device may currently be runtime suspended, or it may be woken
	 * up before the suspend takes place. If the device is not suspended
	 * (powered down) and we skip clearing the fence register, the HW is
	 * left in an undefined state where we may end up with multiple
	 * registers overlapping.
	 */
	with_intel_runtime_pm_if_active(fence_to_uncore(fence)->rpm, wakeref)
		fence_write(fence);
}

static bool fence_is_active(const struct i915_fence_reg *fence)
{
	return fence->vma && i915_vma_is_active(fence->vma);
}

static struct i915_fence_reg *fence_find(struct i915_ggtt *ggtt)
{
	struct intel_display *display = ggtt->vm.i915->display;
	struct i915_fence_reg *active = NULL;
	struct i915_fence_reg *fence, *fn;

	list_for_each_entry_safe(fence, fn, &ggtt->fence_list, link) {
		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);

		if (fence == active) /* now seen this fence twice */
			active = ERR_PTR(-EAGAIN);

		/* Prefer idle fences so we do not have to wait on the GPU */
		if (active != ERR_PTR(-EAGAIN) && fence_is_active(fence)) {
			if (!active)
				active = fence;

			list_move_tail(&fence->link, &ggtt->fence_list);
			continue;
		}

		if (atomic_read(&fence->pin_count))
			continue;

		return fence;
	}

	/* Wait for completion of pending flips which consume fences */
	if (intel_has_pending_fb_unpin(display))
		return ERR_PTR(-EAGAIN);

	return ERR_PTR(-ENOBUFS);
}

int __i915_vma_pin_fence(struct i915_vma *vma)
{
	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vma->vm);
	struct i915_fence_reg *fence;
	struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL;
	int err;

	lockdep_assert_held(&vma->vm->mutex);

	/* Just update our place in the LRU if our fence is getting reused. */
	if (vma->fence) {
		fence = vma->fence;
		GEM_BUG_ON(fence->vma != vma);
		atomic_inc(&fence->pin_count);
		if (!fence->dirty) {
			list_move_tail(&fence->link, &ggtt->fence_list);
			return 0;
		}
	} else if (set) {
		fence = fence_find(ggtt);
		if (IS_ERR(fence))
			return PTR_ERR(fence);

		GEM_BUG_ON(atomic_read(&fence->pin_count));
		atomic_inc(&fence->pin_count);
	} else {
		return 0;
	}

	err = fence_update(fence, set);
	if (err)
		goto out_unpin;

	GEM_BUG_ON(fence->vma != set);
	GEM_BUG_ON(vma->fence != (set ? fence : NULL));

	if (set)
		return 0;

out_unpin:
	atomic_dec(&fence->pin_count);
	return err;
}

/**
 * i915_vma_pin_fence - set up fencing for a vma
 * @vma: vma to map through a fence reg
 *
 * When mapping objects through the GTT, userspace wants to be able to write
 * to them without having to worry about swizzling if the object is tiled.
 * This function walks the fence regs looking for a free one for @vma,
 * stealing one if it can't find any.
 *
 * It then sets up the reg based on the object's properties: address, pitch
 * and tiling format.
 *
 * For an untiled surface, this removes any existing fence.
 *
 * Returns:
 * 0 on success, negative error code on failure.
 */
int i915_vma_pin_fence(struct i915_vma *vma)
{
	int err;

	if (!vma->fence && !i915_gem_object_is_tiled(vma->obj))
		return 0;

	/*
	 * Note that we revoke fences on runtime suspend. Therefore the user
	 * must keep the device awake whilst using the fence.
	 */
	assert_rpm_wakelock_held(vma->vm->gt->uncore->rpm);
	GEM_BUG_ON(!i915_vma_is_ggtt(vma));

	err = mutex_lock_interruptible(&vma->vm->mutex);
	if (err)
		return err;

	err = __i915_vma_pin_fence(vma);
	mutex_unlock(&vma->vm->mutex);

	return err;
}

/**
 * i915_reserve_fence - Reserve a fence for vGPU
 * @ggtt: Global GTT
 *
 * This function walks the fence regs looking for a free one and removes
 * it from the fence_list. It is used to reserve a fence for vGPU to use.
 */
struct i915_fence_reg *i915_reserve_fence(struct i915_ggtt *ggtt)
{
	struct i915_fence_reg *fence;
	int count;
	int ret;

	lockdep_assert_held(&ggtt->vm.mutex);

	/* Keep at least one fence available for the display engine. */
	count = 0;
	list_for_each_entry(fence, &ggtt->fence_list, link)
		count += !atomic_read(&fence->pin_count);
	if (count <= 1)
		return ERR_PTR(-ENOSPC);

	fence = fence_find(ggtt);
	if (IS_ERR(fence))
		return fence;

	if (fence->vma) {
		/* Force-remove fence from VMA */
		ret = fence_update(fence, NULL);
		if (ret)
			return ERR_PTR(ret);
	}

	list_del(&fence->link);

	return fence;
}

/**
 * i915_unreserve_fence - Reclaim a reserved fence
 * @fence: the fence reg
 *
 * This function adds a fence register previously reserved for vGPU back to
 * the fence_list.
 */
void i915_unreserve_fence(struct i915_fence_reg *fence)
{
	struct i915_ggtt *ggtt = fence->ggtt;

	lockdep_assert_held(&ggtt->vm.mutex);

	list_add(&fence->link, &ggtt->fence_list);
}

/**
 * intel_ggtt_restore_fences - restore fence state
 * @ggtt: Global GTT
 *
 * Restore the hw fence state to match the software tracking again, to be called
 * after a gpu reset and on resume. Note that on runtime suspend we only cancel
 * the fences, to be reacquired by the user later.
 */
void intel_ggtt_restore_fences(struct i915_ggtt *ggtt)
{
	int i;

	for (i = 0; i < ggtt->num_fences; i++)
		fence_write(&ggtt->fence_regs[i]);
}

/**
 * DOC: tiling swizzling details
 *
 * The idea behind tiling is to increase cache hit rates by rearranging
 * pixel data so that a group of pixel accesses are in the same cacheline.
 * Performance improvements from doing this on the back/depth buffer are on
 * the order of 30%.
 *
 * Intel architectures make this somewhat more complicated, though, by
 * adjustments made to addressing of data when the memory is in interleaved
 * mode (matched pairs of DIMMS) to improve memory bandwidth.
 * For interleaved memory, the CPU sends every sequential 64 bytes
 * to an alternate memory channel so it can get the bandwidth from both.
 *
 * The GPU also rearranges its accesses for increased bandwidth to interleaved
 * memory, and it matches what the CPU does for non-tiled. However, when tiled
 * it does it a little differently, since one walks addresses not just in the
 * X direction but also Y. So, along with alternating channels when bit
 * 6 of the address flips, it also alternates when other bits flip -- Bits 9
 * (every 512 bytes, an X tile scanline) and 10 (every two X tile scanlines)
 * are common to both the 915 and 965-class hardware.
 *
 * The CPU also sometimes XORs in higher bits as well, to improve
 * bandwidth doing strided access like we do so frequently in graphics. This
 * is called "Channel XOR Randomization" in the MCH documentation. The result
 * is that the CPU is XORing in either bit 11 or bit 17 to bit 6 of its address
 * decode.
 *
 * All of this bit 6 XORing has an effect on our memory management,
 * as we need to make sure that the 3d driver can correctly address object
 * contents.
 *
 * If we don't have interleaved memory, all tiling is safe and no swizzling is
 * required.
 *
 * When bit 17 is XORed in, we simply refuse to tile at all. Bit
 * 17 is not just a page offset, so as we page an object out and back in,
 * individual pages in it will have different bit 17 addresses, resulting in
 * each 64 bytes being swapped with its neighbor!
 *
 * Otherwise, if interleaved, we have to tell the 3d driver what the address
 * swizzling it needs to do is, since it's writing with the CPU to the pages
 * (bit 6 and potentially bit 11 XORed in), and the GPU is reading from the
 * pages (bit 6, 9, and 10 XORed in), resulting in a cumulative bit swizzling
 * required by the CPU of XORing in bit 6, 9, 10, and potentially 11, in order
 * to match what the GPU expects.
 */
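
/*
 * Illustrative sketch only: with I915_BIT_6_SWIZZLE_9_10, the channel-select
 * bit that CPU accesses must use to match the GPU's view of a tiled page is
 *
 *	bit6' = bit6 ^ bit9 ^ bit10
 *
 * i.e. for a byte offset addr within the object,
 *
 *	swizzled = addr ^ ((((addr >> 9) ^ (addr >> 10)) & 1) << 6);
 *
 * The other modes fold in the bits named in their suffix, e.g.
 * I915_BIT_6_SWIZZLE_9_10_17 additionally uses bit 17 of the physical address.
 */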

/**
 * detect_bit_6_swizzle - detect bit 6 swizzling pattern
 * @ggtt: Global GGTT
 *
 * Detects bit 6 swizzling of address lookup between IGD access and CPU
 * access through main memory.
 */
static void detect_bit_6_swizzle(struct i915_ggtt *ggtt)
{
	struct intel_uncore *uncore = ggtt->vm.gt->uncore;
	struct drm_i915_private *i915 = ggtt->vm.i915;
	u32 swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
	u32 swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;

	if (GRAPHICS_VER(i915) >= 8 || IS_VALLEYVIEW(i915)) {
		/*
		 * On BDW+, swizzling is not used. We leave the CPU memory
		 * controller in charge of optimizing memory accesses without
		 * the extra address manipulation GPU side.
		 *
		 * VLV and CHV don't have GPU swizzling.
		 */
		swizzle_x = I915_BIT_6_SWIZZLE_NONE;
		swizzle_y = I915_BIT_6_SWIZZLE_NONE;
	} else if (GRAPHICS_VER(i915) >= 6) {
		if (i915->preserve_bios_swizzle) {
			if (intel_uncore_read(uncore, DISP_ARB_CTL) &
			    DISP_TILE_SURFACE_SWIZZLING) {
				swizzle_x = I915_BIT_6_SWIZZLE_9_10;
				swizzle_y = I915_BIT_6_SWIZZLE_9;
			} else {
				swizzle_x = I915_BIT_6_SWIZZLE_NONE;
				swizzle_y = I915_BIT_6_SWIZZLE_NONE;
			}
		} else {
			u32 dimm_c0, dimm_c1;

			dimm_c0 = intel_uncore_read(uncore, MAD_DIMM_C0);
			dimm_c1 = intel_uncore_read(uncore, MAD_DIMM_C1);
			dimm_c0 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
			dimm_c1 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
			/*
			 * Enable swizzling when the channels are populated
			 * with identically sized dimms. We don't need to check
			 * the 3rd channel because no cpu with gpu attached
			 * ships in that configuration. Also, swizzling only
			 * makes sense for 2 channels anyway.
			 */
			if (dimm_c0 == dimm_c1) {
				swizzle_x = I915_BIT_6_SWIZZLE_9_10;
				swizzle_y = I915_BIT_6_SWIZZLE_9;
			} else {
				swizzle_x = I915_BIT_6_SWIZZLE_NONE;
				swizzle_y = I915_BIT_6_SWIZZLE_NONE;
			}
		}
	} else if (GRAPHICS_VER(i915) == 5) {
		/*
		 * On Ironlake, whatever the DRAM config, the GPU always uses
		 * the same swizzling setup.
		 */
		swizzle_x = I915_BIT_6_SWIZZLE_9_10;
		swizzle_y = I915_BIT_6_SWIZZLE_9;
	} else if (GRAPHICS_VER(i915) == 2) {
		/*
		 * As far as we know, the 865 doesn't have these bit 6
		 * swizzling issues.
		 */
		swizzle_x = I915_BIT_6_SWIZZLE_NONE;
		swizzle_y = I915_BIT_6_SWIZZLE_NONE;
	} else if (IS_G45(i915) || IS_I965G(i915) || IS_G33(i915)) {
		/*
		 * The 965, G33, and newer, have a very flexible memory
		 * configuration. It will enable dual-channel mode
		 * (interleaving) on as much memory as it can, and the GPU
		 * will additionally sometimes enable different bit 6
		 * swizzling for tiled objects from the CPU.
		 *
		 * Here's what I found on the G965:
		 *    slot fill         memory size  swizzling
		 * 0A   0B   1A   1B    1-ch   2-ch
		 * 512  0    0    0     512    0     O
		 * 512  0    512  0     16     1008  X
		 * 512  0    0    512   16     1008  X
		 * 0    512  0    512   16     1008  X
		 * 1024 1024 1024 0     2048   1024  O
		 *
		 * We could probably detect this based on either the DRB
		 * matching, which was the case for the swizzling required in
		 * the table above, or from the 1-ch value being less than
		 * the minimum size of a rank.
		 *
		 * Reports indicate that the swizzling actually
		 * varies depending upon page placement inside the
		 * channels, i.e. we see swizzled pages where the
		 * banks of memory are paired and unswizzled on the
		 * uneven portion, so leave that as unknown.
		 */
		if (intel_uncore_read16(uncore, C0DRB3_BW) ==
		    intel_uncore_read16(uncore, C1DRB3_BW)) {
			swizzle_x = I915_BIT_6_SWIZZLE_9_10;
			swizzle_y = I915_BIT_6_SWIZZLE_9;
		}
	} else {
		u32 dcc = intel_uncore_read(uncore, DCC);

		/*
		 * On 9xx chipsets, channel interleave by the CPU is
		 * determined by DCC. For single-channel, neither the CPU
		 * nor the GPU do swizzling. For dual channel interleaved,
		 * the GPU's interleave is bit 9 and 10 for X tiled, and bit
		 * 9 for Y tiled. The CPU's interleave is independent, and
		 * can be based on either bit 11 (haven't seen this yet) or
		 * bit 17 (common).
		 */
		switch (dcc & DCC_ADDRESSING_MODE_MASK) {
		case DCC_ADDRESSING_MODE_SINGLE_CHANNEL:
		case DCC_ADDRESSING_MODE_DUAL_CHANNEL_ASYMMETRIC:
			swizzle_x = I915_BIT_6_SWIZZLE_NONE;
			swizzle_y = I915_BIT_6_SWIZZLE_NONE;
			break;
		case DCC_ADDRESSING_MODE_DUAL_CHANNEL_INTERLEAVED:
			if (dcc & DCC_CHANNEL_XOR_DISABLE) {
				/*
				 * This is the base swizzling by the GPU for
				 * tiled buffers.
				 */
				swizzle_x = I915_BIT_6_SWIZZLE_9_10;
				swizzle_y = I915_BIT_6_SWIZZLE_9;
			} else if ((dcc & DCC_CHANNEL_XOR_BIT_17) == 0) {
				/* Bit 11 swizzling by the CPU in addition. */
				swizzle_x = I915_BIT_6_SWIZZLE_9_10_11;
				swizzle_y = I915_BIT_6_SWIZZLE_9_11;
			} else {
				/* Bit 17 swizzling by the CPU in addition. */
				swizzle_x = I915_BIT_6_SWIZZLE_9_10_17;
				swizzle_y = I915_BIT_6_SWIZZLE_9_17;
			}
			break;
		}

		/* check for L-shaped memory aka modified enhanced addressing */
		if (GRAPHICS_VER(i915) == 4 &&
		    !(intel_uncore_read(uncore, DCC2) & DCC2_MODIFIED_ENHANCED_DISABLE)) {
			swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
			swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
		}

		if (dcc == 0xffffffff) {
			drm_err(&i915->drm,
				"Couldn't read from MCHBAR. Disabling tiling.\n");
			swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
			swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
		}
	}

	if (swizzle_x == I915_BIT_6_SWIZZLE_UNKNOWN ||
	    swizzle_y == I915_BIT_6_SWIZZLE_UNKNOWN) {
		/*
		 * Userspace likes to explode if it sees unknown swizzling,
		 * so lie. We will finish the lie when reporting through
		 * the get-tiling-ioctl by reporting the physical swizzle
		 * mode as unknown instead.
		 *
		 * As we don't strictly know what the swizzling is, it may be
		 * bit17 dependent, and so we need to also prevent the pages
		 * from being moved.
		 */
		i915->gem_quirks |= GEM_QUIRK_PIN_SWIZZLED_PAGES;
		swizzle_x = I915_BIT_6_SWIZZLE_NONE;
		swizzle_y = I915_BIT_6_SWIZZLE_NONE;
	}

	to_gt(i915)->ggtt->bit_6_swizzle_x = swizzle_x;
	to_gt(i915)->ggtt->bit_6_swizzle_y = swizzle_y;
}

/*
 * Swap every 64 bytes of this page around, to account for it having a new
 * bit 17 of its physical address and therefore being interpreted differently
 * by the GPU.
 */
static void swizzle_page(struct page *page)
{
	char temp[64];
	char *vaddr;
	int i;

	vaddr = kmap_local_page(page);

	for (i = 0; i < PAGE_SIZE; i += 128) {
		memcpy(temp, &vaddr[i], 64);
		memcpy(&vaddr[i], &vaddr[i + 64], 64);
		memcpy(&vaddr[i + 64], temp, 64);
	}

	kunmap_local(vaddr);
}

/**
 * i915_gem_object_do_bit_17_swizzle - fixup bit 17 swizzling
 * @obj: i915 GEM buffer object
 * @pages: the scattergather list of physical pages
 *
 * This function fixes up the swizzling in case any page frame number for this
 * object has changed in bit 17 since that state has been saved with
 * i915_gem_object_save_bit_17_swizzle().
 *
 * This is called when pinning backing storage again, since the kernel is free
 * to move unpinned backing storage around (either by directly moving pages or
 * by swapping them out and back in again).
 */
void
i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj,
				  struct sg_table *pages)
{
	struct sgt_iter sgt_iter;
	struct page *page;
	int i;

	if (obj->bit_17 == NULL)
		return;

	i = 0;
	for_each_sgt_page(page, sgt_iter, pages) {
		char new_bit_17 = page_to_phys(page) >> 17;

		if ((new_bit_17 & 0x1) != (test_bit(i, obj->bit_17) != 0)) {
			swizzle_page(page);
			set_page_dirty(page);
		}

		i++;
	}
}

/**
 * i915_gem_object_save_bit_17_swizzle - save bit 17 swizzling
 * @obj: i915 GEM buffer object
 * @pages: the scattergather list of physical pages
 *
 * This function saves the bit 17 of each page frame number so that swizzling
 * can be fixed up later on with i915_gem_object_do_bit_17_swizzle(). This must
 * be called before the backing storage can be unpinned.
 */
void
i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj,
				    struct sg_table *pages)
{
	const unsigned int page_count = obj->base.size >> PAGE_SHIFT;
	struct sgt_iter sgt_iter;
	struct page *page;
	int i;

	if (obj->bit_17 == NULL) {
		obj->bit_17 = bitmap_zalloc(page_count, GFP_KERNEL);
		if (obj->bit_17 == NULL) {
			drm_err(obj->base.dev,
				"Failed to allocate memory for bit 17 record\n");
			return;
		}
	}

	i = 0;

	for_each_sgt_page(page, sgt_iter, pages) {
		if (page_to_phys(page) & (1 << 17))
			__set_bit(i, obj->bit_17);
		else
			__clear_bit(i, obj->bit_17);
		i++;
	}
}

void intel_ggtt_init_fences(struct i915_ggtt *ggtt)
{
	struct drm_i915_private *i915 = ggtt->vm.i915;
	struct intel_uncore *uncore = ggtt->vm.gt->uncore;
	int num_fences;
	int i;

	INIT_LIST_HEAD(&ggtt->fence_list);
	INIT_LIST_HEAD(&ggtt->userfault_list);

	detect_bit_6_swizzle(ggtt);

	if (!i915_ggtt_has_aperture(ggtt))
		num_fences = 0;
	else if (GRAPHICS_VER(i915) >= 7 &&
		 !(IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)))
		num_fences = 32;
	else if (GRAPHICS_VER(i915) >= 4 ||
		 IS_I945G(i915) || IS_I945GM(i915) ||
		 IS_G33(i915) || IS_PINEVIEW(i915))
		num_fences = 16;
	else
		num_fences = 8;

	if (intel_vgpu_active(i915))
		num_fences = intel_uncore_read(uncore,
					       vgtif_reg(avail_rs.fence_num));
	ggtt->fence_regs = kcalloc(num_fences,
				   sizeof(*ggtt->fence_regs),
				   GFP_KERNEL);
	if (!ggtt->fence_regs)
		num_fences = 0;

	/* Initialize fence registers to zero */
	for (i = 0; i < num_fences; i++) {
		struct i915_fence_reg *fence = &ggtt->fence_regs[i];

		i915_active_init(&fence->active, NULL, NULL, 0);
		fence->ggtt = ggtt;
		fence->id = i;
		list_add_tail(&fence->link, &ggtt->fence_list);
	}
	ggtt->num_fences = num_fences;

	intel_ggtt_restore_fences(ggtt);
}

void intel_ggtt_fini_fences(struct i915_ggtt *ggtt)
{
	int i;

	for (i = 0; i < ggtt->num_fences; i++) {
		struct i915_fence_reg *fence = &ggtt->fence_regs[i];

		i915_active_fini(&fence->active);
	}

	kfree(ggtt->fence_regs);
}

void intel_gt_init_swizzling(struct intel_gt *gt)
{
	struct drm_i915_private *i915 = gt->i915;
	struct intel_uncore *uncore = gt->uncore;

	if (GRAPHICS_VER(i915) < 5 ||
	    to_gt(i915)->ggtt->bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
		return;

	intel_uncore_rmw(uncore, DISP_ARB_CTL, 0, DISP_TILE_SURFACE_SWIZZLING);

	if (GRAPHICS_VER(i915) == 5)
		return;

	intel_uncore_rmw(uncore, TILECTL, 0, TILECTL_SWZCTL);

	if (GRAPHICS_VER(i915) == 6)
		intel_uncore_write(uncore,
				   ARB_MODE,
				   _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
	else if (GRAPHICS_VER(i915) == 7)
		intel_uncore_write(uncore,
				   ARB_MODE,
				   _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
	else if (GRAPHICS_VER(i915) == 8)
		intel_uncore_write(uncore,
				   GAMTARBMODE,
				   _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
	else
		MISSING_CASE(GRAPHICS_VER(i915));
}