// SPDX-License-Identifier: MIT
/*
 * Copyright © 2022 Intel Corporation
 */

#include "xe_exec.h"

#include <drm/drm_device.h>
#include <drm/drm_exec.h>
#include <drm/drm_file.h>
#include <uapi/drm/xe_drm.h>

#include "xe_bo.h"
#include "xe_device.h"
#include "xe_exec_queue.h"
#include "xe_hw_engine_group.h"
#include "xe_macros.h"
#include "xe_pm.h"
#include "xe_ring_ops_types.h"
#include "xe_sched_job.h"
#include "xe_svm.h"
#include "xe_sync.h"
#include "xe_trace.h"
#include "xe_vm.h"

/**
 * DOC: Execbuf (User GPU command submission)
 *
 * Execs have historically been rather complicated in DRM drivers (at least in
 * the i915) because of a few things:
 *
 * - Passing in a list of BOs which are read / written to, creating implicit
 *   syncs
 * - Binding at exec time
 * - Flow controlling the ring at exec time
 *
 * In Xe we avoid all of this complication by not allowing a BO list to be
 * passed into an exec, by using the dma-buf implicit sync uAPI, by having
 * binds as separate operations, and by using the DRM scheduler to flow
 * control the ring. Let's deep dive on each of these.
 *
 * We can get away from a BO list by forcing the user to use in / out fences on
 * every exec rather than the kernel tracking dependencies of BOs (e.g. if the
 * user knows an exec writes to a BO and reads from that BO in the next exec,
 * it is the user's responsibility to pass an in / out fence between the two
 * execs).
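 *
 * As a minimal userspace sketch (the handles, the batch address and the
 * drmIoctl() wrapper are hypothetical), two dependent execs can be chained
 * through a single syncobj, used first as an out-fence and then as an
 * in-fence:
 *
 * .. code-block:: c
 *
 *	struct drm_xe_sync sync = {
 *		.type = DRM_XE_SYNC_TYPE_SYNCOBJ,
 *		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
 *		.handle = syncobj_handle,
 *	};
 *	struct drm_xe_exec exec = {
 *		.exec_queue_id = exec_queue_id,
 *		.num_syncs = 1,
 *		.syncs = (uintptr_t)&sync,
 *		.num_batch_buffer = 1,
 *		.address = batch_addr,
 *	};
 *
 *	drmIoctl(fd, DRM_IOCTL_XE_EXEC, &exec);
 *
 *	sync.flags = 0;
 *	drmIoctl(fd, DRM_IOCTL_XE_EXEC, &exec);
 *
 * The first exec signals the syncobj on completion; clearing
 * DRM_XE_SYNC_FLAG_SIGNAL turns the same entry into an in-fence for the
 * second exec.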
 *
 * We do not allow a user to trigger a bind at exec time; rather, we have a VM
 * bind IOCTL which uses the same in / out fence interface as exec. In that
 * sense, a VM bind is basically the same operation as an exec from the user
 * perspective, e.g. if an exec depends on a VM bind, use the in / out fence
 * interface (struct drm_xe_sync) to synchronize, just like syncing between
 * two dependent execs.
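 *
 * Continuing the sketch above (again with hypothetical handles, and assuming
 * a struct drm_xe_vm_bind that is otherwise filled in), an exec that depends
 * on an async VM bind reuses the same syncobj pattern:
 *
 * .. code-block:: c
 *
 *	sync.flags = DRM_XE_SYNC_FLAG_SIGNAL;
 *	bind.num_syncs = 1;
 *	bind.syncs = (uintptr_t)&sync;
 *	drmIoctl(fd, DRM_IOCTL_XE_VM_BIND, &bind);
 *
 *	sync.flags = 0;
 *	exec.num_syncs = 1;
 *	exec.syncs = (uintptr_t)&sync;
 *	drmIoctl(fd, DRM_IOCTL_XE_EXEC, &exec);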
 *
 * Although a user cannot trigger a bind, we still have to rebind userptrs in
 * the VM that have been invalidated since the last exec; likewise, we also
 * have to rebind BOs that have been evicted by the kernel. We schedule these
 * rebinds behind any pending kernel operations on any external BOs in the VM
 * or any BOs private to the VM. This is accomplished by the rebinds waiting
 * on the BOs' DMA_RESV_USAGE_KERNEL slot (kernel ops) and kernel ops waiting
 * on all BOs' slots (in-flight execs are in the DMA_RESV_USAGE_BOOKKEEP slot
 * for both private and external BOs).
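 *
 * In terms of the calls made in xe_exec_ioctl() below, this ordering reduces
 * to:
 *
 * .. code-block:: c
 *
 *	err = xe_sched_job_add_deps(job, xe_vm_resv(vm),
 *				    DMA_RESV_USAGE_KERNEL);
 *
 *	drm_gpuvm_resv_add_fence(&vm->gpuvm, exec,
 *				 &job->drm.s_fence->finished,
 *				 DMA_RESV_USAGE_BOOKKEEP,
 *				 DMA_RESV_USAGE_BOOKKEEP);
 *
 * The first call makes the exec job wait behind pending kernel ops on the
 * VM's reservation object; the second publishes the job's finished fence in
 * the bookkeep slots so future kernel ops in turn wait on it.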
 *
 * Rebinds / dma-resv usage applies to non-compute mode VMs only, as for
 * compute mode VMs we use preempt fences and a rebind worker (TODO: add
 * link).
 *
 * There is no need to flow control the ring in the exec, as we write the ring
 * at submission time and set the DRM scheduler max job limit to SIZE_OF_RING /
 * MAX_JOB_SIZE. The DRM scheduler will then hold all jobs until space in the
 * ring is available.
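 *
 * Expressed as code, using the placeholder names from above:
 *
 * .. code-block:: c
 *
 *	u32 job_limit = SIZE_OF_RING / MAX_JOB_SIZE;
 *
 * With at most job_limit jobs in flight per ring, a queued job always has
 * ring space by the time the scheduler runs it.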
 *
 * All of this results in a rather simple exec implementation.
 *
 * Flow
 * ~~~~
 *
 * .. code-block::
 *
 *	Parse input arguments
 *	Wait for any async VM bind passed as in-fences to start
 *	<----------------------------------------------------------------------|
 *	Lock global VM lock in read mode                                       |
 *	Pin userptrs (also finds userptr invalidated since last exec)          |
 *	Lock exec (VM dma-resv lock, external BOs dma-resv locks)              |
 *	Validate BOs that have been evicted                                    |
 *	Create job                                                             |
 *	Rebind invalidated userptrs + evicted BOs (non-compute-mode)           |
 *	Add rebind fence dependency to job                                     |
 *	Add job VM dma-resv bookkeeping slot (non-compute mode)                |
 *	Add job to external BOs dma-resv write slots (non-compute mode)        |
 *	Check if any userptrs invalidated since pin ------ Drop locks ---------|
 *	Install in / out fences for job
 *	Submit job
 *	Unlock all
 */

/*
 * Add validation and rebinding to the drm_exec locking loop, since both can
 * trigger eviction which may require sleeping dma_resv locks.
 */
static int xe_exec_fn(struct drm_gpuvm_exec *vm_exec)
{
	struct xe_vm *vm = container_of(vm_exec->vm, struct xe_vm, gpuvm);
	int ret;

	/* The fence slot added here is intended for the exec sched job. */
	xe_vm_set_validation_exec(vm, &vm_exec->exec);
	ret = xe_vm_validate_rebind(vm, &vm_exec->exec, 1);
	xe_vm_set_validation_exec(vm, NULL);
	return ret;
}

int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_exec *args = data;
	struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs);
	u64 __user *addresses_user = u64_to_user_ptr(args->address);
	struct xe_exec_queue *q;
	struct xe_sync_entry *syncs = NULL;
	u64 addresses[XE_HW_ENGINE_MAX_INSTANCE];
	struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn};
	struct drm_exec *exec = &vm_exec.exec;
	u32 i, num_syncs, num_ufence = 0;
	struct xe_validation_ctx ctx;
	struct xe_sched_job *job;
	struct xe_vm *vm;
	bool write_locked;
	int err = 0;
	struct xe_hw_engine_group *group;
	enum xe_hw_engine_group_execution_mode mode, previous_mode;

	if (XE_IOCTL_DBG(xe, args->extensions) ||
	    XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]) ||
	    XE_IOCTL_DBG(xe, args->num_syncs > DRM_XE_MAX_SYNCS))
		return -EINVAL;

	q = xe_exec_queue_lookup(xef, args->exec_queue_id);
	if (XE_IOCTL_DBG(xe, !q))
		return -ENOENT;

	if (XE_IOCTL_DBG(xe, q->flags & EXEC_QUEUE_FLAG_VM)) {
		err = -EINVAL;
		goto err_exec_queue;
	}

	if (XE_IOCTL_DBG(xe, args->num_batch_buffer &&
			 q->width != args->num_batch_buffer)) {
		err = -EINVAL;
		goto err_exec_queue;
	}

	if (XE_IOCTL_DBG(xe, q->ops->reset_status(q))) {
		err = -ECANCELED;
		goto err_exec_queue;
	}

	if (atomic_read(&q->job_cnt) >= XE_MAX_JOB_COUNT_PER_EXEC_QUEUE) {
		trace_xe_exec_queue_reach_max_job_count(q, XE_MAX_JOB_COUNT_PER_EXEC_QUEUE);
		err = -EAGAIN;
		goto err_exec_queue;
	}

	if (args->num_syncs) {
		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
		if (!syncs) {
			err = -ENOMEM;
			goto err_exec_queue;
		}
	}

	vm = q->vm;

	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
					  &syncs_user[num_syncs], NULL, 0,
					  SYNC_PARSE_FLAG_EXEC |
					  (xe_vm_in_lr_mode(vm) ?
					   SYNC_PARSE_FLAG_LR_MODE : 0));
		if (err)
			goto err_syncs;

		if (xe_sync_is_ufence(&syncs[num_syncs]))
			num_ufence++;
	}

	if (XE_IOCTL_DBG(xe, num_ufence > 1)) {
		err = -EINVAL;
		goto err_syncs;
	}

	if (args->num_batch_buffer && xe_exec_queue_is_parallel(q)) {
		err = copy_from_user(addresses, addresses_user,
				     sizeof(u64) * q->width);
		if (err) {
			err = -EFAULT;
			goto err_syncs;
		}
	}

	group = q->hwe->hw_engine_group;
	mode = xe_hw_engine_group_find_exec_mode(q);

	if (mode == EXEC_MODE_DMA_FENCE) {
		err = xe_hw_engine_group_get_mode(group, mode, &previous_mode);
		if (err)
			goto err_syncs;
	}

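	/*
	 * The VM lock is taken in write mode only when userptrs need to be
	 * repinned; a userptr invalidation detected later in the path raises
	 * -EAGAIN, which restarts the submission from here.
	 */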
retry:
	if (!xe_vm_in_lr_mode(vm) && xe_vm_userptr_check_repin(vm)) {
		err = down_write_killable(&vm->lock);
		write_locked = true;
	} else {
		/* We don't allow execs while the VM is in error state */
		err = down_read_interruptible(&vm->lock);
		write_locked = false;
	}
	if (err)
		goto err_hw_exec_mode;

	if (write_locked) {
		err = xe_vm_userptr_pin(vm);
		downgrade_write(&vm->lock);
		write_locked = false;
		if (err)
			goto err_unlock_list;
	}

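	/*
	 * An exec with no batch buffers submits no job: in dma-fence mode it
	 * only composes the in-fences into a single fence, signals the
	 * out-syncs with it and records it as the queue's last fence.
	 */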
	if (!args->num_batch_buffer) {
		err = xe_vm_lock(vm, true);
		if (err)
			goto err_unlock_list;

		if (!xe_vm_in_lr_mode(vm)) {
			struct dma_fence *fence;

			fence = xe_sync_in_fence_get(syncs, num_syncs, q, vm);
			if (IS_ERR(fence)) {
				err = PTR_ERR(fence);
				xe_vm_unlock(vm);
				goto err_unlock_list;
			}
			for (i = 0; i < num_syncs; i++)
				xe_sync_entry_signal(&syncs[i], fence);
			xe_exec_queue_last_fence_set(q, vm, fence);
			dma_fence_put(fence);
		}

		xe_vm_unlock(vm);
		goto err_unlock_list;
	}

	/*
	 * It's OK to block interruptible here with the vm lock held, since
	 * on task freezing during suspend / hibernate, the call will
	 * return -ERESTARTSYS and the IOCTL will be rerun.
	 */
	err = xe_pm_block_on_suspend(xe);
	if (err)
		goto err_unlock_list;

	if (!xe_vm_in_lr_mode(vm)) {
		vm_exec.vm = &vm->gpuvm;
		vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
		err = xe_validation_exec_lock(&ctx, &vm_exec, &xe->val);
		if (err)
			goto err_unlock_list;
	}

	if (xe_vm_is_closed_or_banned(q->vm)) {
		drm_warn(&xe->drm, "Trying to schedule after vm is closed or banned\n");
		err = -ECANCELED;
		goto err_exec;
	}

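	/* Execs using PXP must additionally validate the VM's protected BOs */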
	if (xe_exec_queue_uses_pxp(q)) {
		err = xe_vm_validate_protected(q->vm);
		if (err)
			goto err_exec;
	}

	job = xe_sched_job_create(q, xe_exec_queue_is_parallel(q) ?
				  addresses : &args->address);
	if (IS_ERR(job)) {
		err = PTR_ERR(job);
		goto err_exec;
	}

	/* Wait behind rebinds */
	if (!xe_vm_in_lr_mode(vm)) {
		err = xe_sched_job_add_deps(job,
					    xe_vm_resv(vm),
					    DMA_RESV_USAGE_KERNEL);
		if (err)
			goto err_put_job;
	}

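	/* Wait behind all in-fences */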
	for (i = 0; i < num_syncs && !err; i++)
		err = xe_sync_entry_add_deps(&syncs[i], job);
	if (err)
		goto err_put_job;

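	/*
	 * With the notifier lock held, recheck that no userptr was
	 * invalidated after the pin above; if one was, back off and retry.
	 */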
	if (!xe_vm_in_lr_mode(vm)) {
		err = xe_svm_notifier_lock_interruptible(vm);
		if (err)
			goto err_put_job;

		err = __xe_vm_userptr_needs_repin(vm);
		if (err)
			goto err_repin;
	}

	/*
	 * Point of no return: if we error after this point, just set an error
	 * on the job and let the DRM scheduler / backend clean up the job.
	 */
	xe_sched_job_arm(job);
	if (!xe_vm_in_lr_mode(vm))
		drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, &job->drm.s_fence->finished,
					 DMA_RESV_USAGE_BOOKKEEP,
					 DMA_RESV_USAGE_BOOKKEEP);

	for (i = 0; i < num_syncs; i++) {
		xe_sync_entry_signal(&syncs[i], &job->drm.s_fence->finished);
		xe_sched_job_init_user_fence(job, &syncs[i]);
	}

	if (!xe_vm_in_lr_mode(vm))
		xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
	xe_sched_job_push(job);
	xe_vm_reactivate_rebind(vm);

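	/* Keep the VM's BOs grouped at the hot end of the LRU */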
	if (!err && !xe_vm_in_lr_mode(vm)) {
		spin_lock(&xe->ttm.lru_lock);
		ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
		spin_unlock(&xe->ttm.lru_lock);
	}

	if (mode == EXEC_MODE_LR)
		xe_hw_engine_group_resume_faulting_lr_jobs(group);

err_repin:
	if (!xe_vm_in_lr_mode(vm))
		xe_svm_notifier_unlock(vm);
err_put_job:
	if (err)
		xe_sched_job_put(job);
err_exec:
	if (!xe_vm_in_lr_mode(vm))
		xe_validation_ctx_fini(&ctx);
err_unlock_list:
	up_read(&vm->lock);
	if (err == -EAGAIN)
		goto retry;
err_hw_exec_mode:
	if (mode == EXEC_MODE_DMA_FENCE)
		xe_hw_engine_group_put(group);
err_syncs:
	while (num_syncs--)
		xe_sync_entry_cleanup(&syncs[num_syncs]);
	kfree(syncs);
err_exec_queue:
	xe_exec_queue_put(q);

	return err;
}