// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen2_engine_cs.h"
#include "i915_drv.h"
#include "i915_reg.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_ring.h"

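/*
 * MI_FLUSH on gen2/3 only kicks off the flush; the dozen dummy stores to
 * the HWSP scratch slot, interleaved with further (no-write) flushes,
 * presumably act as a delay so that the flush and any invalidation have
 * landed before the next command executes (compare the gen4 comment below).
 */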
int gen2_emit_flush(struct i915_request *rq, u32 mode)
{
	unsigned int num_store_dw = 12;
	u32 cmd, *cs;

	cmd = MI_FLUSH;
	if (mode & EMIT_INVALIDATE)
		cmd |= MI_READ_FLUSH;

	cs = intel_ring_begin(rq, 2 + 4 * num_store_dw);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = cmd;
	while (num_store_dw--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
		*cs++ = 0;
		*cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
	}
	*cs++ = cmd;

	intel_ring_advance(rq, cs);

	return 0;
}

int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 cmd, *cs;
	int i;

	/*
	 * read/write caches:
	 *
	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
	 * only flushed if MI_NO_WRITE_FLUSH is unset. On 965, it is
	 * also flushed at 2d versus 3d pipeline switches.
	 *
	 * read-only caches:
	 *
	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
	 * MI_READ_FLUSH is set, and is always flushed on 965.
	 *
	 * I915_GEM_DOMAIN_COMMAND may not exist?
	 *
	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
	 * invalidated when MI_EXE_FLUSH is set.
	 *
	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
	 * invalidated with every MI_FLUSH.
	 *
	 * TLBs:
	 *
	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
	 * and I915_GEM_DOMAIN_CPU are invalidated at PTE write and
	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
	 * are flushed at any MI_FLUSH.
	 */

	cmd = MI_FLUSH;
	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_EXE_FLUSH;
		if (IS_G4X(rq->i915) || GRAPHICS_VER(rq->i915) == 5)
			cmd |= MI_INVALIDATE_ISP;
	}

	i = 2;
	if (mode & EMIT_INVALIDATE)
		i += 20;

	cs = intel_ring_begin(rq, i);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = cmd;

	/*
	 * A random delay to let the CS invalidate take effect? Without this
	 * delay, the GPU relocation path fails as the CS does not see
	 * the updated contents. Just as important, if we apply the flushes
	 * to the EMIT_FLUSH branch (i.e. immediately after the relocation
	 * write and before the invalidate on the next batch), the relocations
	 * still fail. This implies that there is a delay following invalidation
	 * that is required to reset the caches as opposed to a delay to
	 * ensure the memory is written.
	 */
	if (mode & EMIT_INVALIDATE) {
		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
			PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0;
		*cs++ = 0;

		for (i = 0; i < 12; i++)
			*cs++ = MI_FLUSH;

		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
			PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0;
		*cs++ = 0;
	}

	*cs++ = cmd;

	intel_ring_advance(rq, cs);

	return 0;
}

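/*
 * The BSD ring here appears to need only a bare MI_FLUSH; the trailing
 * MI_NOOP keeps the emitted sequence an even number of dwords, matching
 * the qword alignment the ring tail requires.
 */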
int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_FLUSH;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

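/*
 * Write the breadcrumb: 'flush' throwaway stores to the scratch slot
 * (presumably serving as both flush and delay, as in the flush paths
 * above), then 'post' copies of the seqno into the status page, and
 * finally the user interrupt to signal the CPU.
 */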
static u32 *__gen2_emit_breadcrumb(struct i915_request *rq, u32 *cs,
				   int flush, int post)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH;

	while (flush--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
		*cs++ = rq->fence.seqno;
	}

	while (post--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

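/*
 * The flush/post counts look empirically chosen; gen5 seems to get away
 * with fewer scratch writes than gen2/3 before the seqno write becomes
 * reliably visible.
 */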
u32 *gen2_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
	return __gen2_emit_breadcrumb(rq, cs, 16, 8);
}

u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
	return __gen2_emit_breadcrumb(rq, cs, 8, 8);
}

/* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
#define I830_BATCH_LIMIT SZ_256K
#define I830_TLB_ENTRIES (2)
#define I830_WA_SIZE max(I830_TLB_ENTRIES * SZ_4K, I830_BATCH_LIMIT)
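/*
 * i830 has a hardware bug whereby the CS may sample stale TLB entries for
 * the batch. As a workaround, first touch I830_TLB_ENTRIES pages of the
 * scratch area with a blit to evict the stale entries, then (unless the
 * caller pinned the batch) blit the batch itself into the scratch area
 * and execute that stable copy instead.
 */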
int i830_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 *cs, cs_offset =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT);

	GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Evict the invalid PTE TLBs */
	*cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
	*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
	*cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
	*cs++ = cs_offset;
	*cs++ = 0xdeadbeef;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
		if (len > I830_BATCH_LIMIT)
			return -ENOSPC;

		cs = intel_ring_begin(rq, 6 + 2);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		/*
		 * Blit the batch (which now has all relocs applied) to the
		 * stable batch scratch bo area (so that the CS never
		 * stumbles over its tlb invalidation bug) ...
		 */
		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
		*cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
		*cs++ = cs_offset;
		*cs++ = 4096;
		*cs++ = offset;

		*cs++ = MI_FLUSH;
		*cs++ = MI_NOOP;
		intel_ring_advance(rq, cs);

		/* ... and execute it. */
		offset = cs_offset;
	}

	if (!(dispatch_flags & I915_DISPATCH_SECURE))
		offset |= MI_BATCH_NON_SECURE;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
	*cs++ = offset;
	intel_ring_advance(rq, cs);

	return 0;
}

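/*
 * On gen2/3 the non-secure bit lives in the batch address dword itself
 * rather than in the MI_BATCH_BUFFER_START command dword.
 */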
int gen2_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 *cs;

	if (!(dispatch_flags & I915_DISPATCH_SECURE))
		offset |= MI_BATCH_NON_SECURE;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
	*cs++ = offset;
	intel_ring_advance(rq, cs);

	return 0;
}

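/*
 * From gen4 onwards the security bit moved into the MI_BATCH_BUFFER_START
 * command dword (MI_BATCH_NON_SECURE_I965), so the batch address is used
 * unmodified.
 */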
int gen4_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 length,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | security;
	*cs++ = offset;
	intel_ring_advance(rq, cs);

	return 0;
}

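/*
 * gen2 exposes a single interrupt mask register shared by all engines;
 * these helpers update the cached mask (presumably serialised by the
 * caller's irq lock) and, on enable, do a posting read to flush the
 * write before any subsequent wait.
 */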
void gen2_irq_enable(struct intel_engine_cs *engine)
{
	engine->i915->gen2_imr_mask &= ~engine->irq_enable_mask;
	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->gen2_imr_mask);
	intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
}

void gen2_irq_disable(struct intel_engine_cs *engine)
{
	engine->i915->gen2_imr_mask |= engine->irq_enable_mask;
	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->gen2_imr_mask);
}

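/*
 * gen5 (Ironlake) routes engine interrupts through the GT interrupt
 * registers, so simply defer to the shared GT helpers.
 */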
void gen5_irq_enable(struct intel_engine_cs *engine)
{
	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen5_irq_disable(struct intel_engine_cs *engine)
{
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}