// SPDX-License-Identifier: MIT
/*
 * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "amdgpu.h"
#include "amdgpu_reset.h"
#include "amdgpu_xgmi.h"
#include "ras_sys.h"
#include "amdgpu_ras_mgr.h"
#include "amdgpu_ras_process.h"

/* Interval (ms) between deferred bad-page retirement passes */
#define RAS_MGR_RETIRE_PAGE_INTERVAL 100
/* Max time (ms) to wait for in-flight RAS event processing before a reset */
#define RAS_EVENT_PROCESS_TIMEOUT 1200

| 34 | static void ras_process_retire_page_dwork(struct work_struct *work) |
| 35 | { |
| 36 | struct amdgpu_ras_mgr *ras_mgr = |
| 37 | container_of(work, struct amdgpu_ras_mgr, retire_page_dwork.work); |
| 38 | struct amdgpu_device *adev = ras_mgr->adev; |
| 39 | int ret; |
| 40 | |
| 41 | if (amdgpu_ras_is_rma(adev)) |
| 42 | return; |
| 43 | |
| 44 | /* If gpu reset is ongoing, delay retiring the bad pages */ |
| 45 | if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) { |
| 46 | schedule_delayed_work(dwork: &ras_mgr->retire_page_dwork, |
| 47 | delay: msecs_to_jiffies(RAS_MGR_RETIRE_PAGE_INTERVAL * 3)); |
| 48 | return; |
| 49 | } |
| 50 | |
| 51 | ret = ras_umc_handle_bad_pages(ras_core: ras_mgr->ras_core, NULL); |
| 52 | if (!ret) |
| 53 | schedule_delayed_work(dwork: &ras_mgr->retire_page_dwork, |
| 54 | delay: msecs_to_jiffies(RAS_MGR_RETIRE_PAGE_INTERVAL)); |
| 55 | } |
| 56 | |
| 57 | int amdgpu_ras_process_init(struct amdgpu_device *adev) |
| 58 | { |
| 59 | struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); |
| 60 | |
| 61 | ras_mgr->is_paused = false; |
| 62 | init_completion(x: &ras_mgr->ras_event_done); |
| 63 | |
| 64 | INIT_DELAYED_WORK(&ras_mgr->retire_page_dwork, ras_process_retire_page_dwork); |
| 65 | |
| 66 | return 0; |
| 67 | } |
| 68 | |
| 69 | int amdgpu_ras_process_fini(struct amdgpu_device *adev) |
| 70 | { |
| 71 | struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); |
| 72 | |
| 73 | ras_mgr->is_paused = false; |
| 74 | /* Save all cached bad pages to eeprom */ |
| 75 | flush_delayed_work(dwork: &ras_mgr->retire_page_dwork); |
| 76 | cancel_delayed_work_sync(dwork: &ras_mgr->retire_page_dwork); |
| 77 | return 0; |
| 78 | } |
| 79 | |
| 80 | int amdgpu_ras_process_handle_umc_interrupt(struct amdgpu_device *adev, void *data) |
| 81 | { |
| 82 | struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); |
| 83 | |
| 84 | if (!ras_mgr->ras_core) |
| 85 | return -EINVAL; |
| 86 | |
| 87 | return ras_process_add_interrupt_req(ras_core: ras_mgr->ras_core, NULL, is_umc: true); |
| 88 | } |
| 89 | |
| 90 | int amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device *adev, void *data) |
| 91 | { |
| 92 | amdgpu_ras_set_fed(adev, status: true); |
| 93 | return amdgpu_ras_mgr_reset_gpu(adev, AMDGPU_RAS_GPU_RESET_MODE1_RESET); |
| 94 | } |
| 95 | |
| 96 | int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev, void *data) |
| 97 | { |
| 98 | struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); |
| 99 | struct ras_ih_info *ih_info = (struct ras_ih_info *)data; |
| 100 | struct ras_event_req req; |
| 101 | uint64_t seqno; |
| 102 | |
| 103 | if (!ih_info) |
| 104 | return -EINVAL; |
| 105 | |
| 106 | memset(&req, 0, sizeof(req)); |
| 107 | req.block = ih_info->block; |
| 108 | req.data = ih_info->data; |
| 109 | req.pasid = ih_info->pasid; |
| 110 | req.pasid_fn = ih_info->pasid_fn; |
| 111 | req.reset = ih_info->reset; |
| 112 | |
| 113 | seqno = ras_core_get_seqno(ras_core: ras_mgr->ras_core, |
| 114 | seqno_type: RAS_SEQNO_TYPE_POISON_CONSUMPTION, pop: false); |
| 115 | |
| 116 | /* When the ACA register cannot be read from FW, the poison |
| 117 | * consumption seqno in the fifo will not pop up, so it is |
| 118 | * necessary to check whether the seqno is the previous seqno. |
| 119 | */ |
| 120 | if (seqno == ras_mgr->last_poison_consumption_seqno) { |
| 121 | /* Pop and discard the previous seqno */ |
| 122 | ras_core_get_seqno(ras_core: ras_mgr->ras_core, |
| 123 | seqno_type: RAS_SEQNO_TYPE_POISON_CONSUMPTION, pop: true); |
| 124 | seqno = ras_core_get_seqno(ras_core: ras_mgr->ras_core, |
| 125 | seqno_type: RAS_SEQNO_TYPE_POISON_CONSUMPTION, pop: false); |
| 126 | } |
| 127 | ras_mgr->last_poison_consumption_seqno = seqno; |
| 128 | req.seqno = seqno; |
| 129 | |
| 130 | return ras_process_add_interrupt_req(ras_core: ras_mgr->ras_core, req: &req, is_umc: false); |
| 131 | } |
| 132 | |
| 133 | int amdgpu_ras_process_begin(struct amdgpu_device *adev) |
| 134 | { |
| 135 | struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); |
| 136 | |
| 137 | if (ras_mgr->is_paused) |
| 138 | return -EAGAIN; |
| 139 | |
| 140 | reinit_completion(x: &ras_mgr->ras_event_done); |
| 141 | return 0; |
| 142 | } |
| 143 | |
| 144 | int amdgpu_ras_process_end(struct amdgpu_device *adev) |
| 145 | { |
| 146 | struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); |
| 147 | |
| 148 | complete(&ras_mgr->ras_event_done); |
| 149 | return 0; |
| 150 | } |
| 151 | |
| 152 | int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev) |
| 153 | { |
| 154 | struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); |
| 155 | long rc; |
| 156 | |
| 157 | if (!ras_mgr || !ras_mgr->ras_core) |
| 158 | return -EINVAL; |
| 159 | |
| 160 | if (!ras_mgr->ras_core->is_initialized) |
| 161 | return -EPERM; |
| 162 | |
| 163 | ras_mgr->is_paused = true; |
| 164 | |
| 165 | /* Wait for RAS event processing to complete */ |
| 166 | rc = wait_for_completion_interruptible_timeout(x: &ras_mgr->ras_event_done, |
| 167 | timeout: msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT)); |
| 168 | if (rc <= 0) |
| 169 | RAS_DEV_WARN(adev, "Waiting for ras process to complete %s\n" , |
| 170 | rc ? "interrupted" : "timeout" ); |
| 171 | |
| 172 | flush_delayed_work(dwork: &ras_mgr->retire_page_dwork); |
| 173 | return 0; |
| 174 | } |
| 175 | |
| 176 | int amdgpu_ras_process_post_reset(struct amdgpu_device *adev) |
| 177 | { |
| 178 | struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); |
| 179 | |
| 180 | if (!ras_mgr || !ras_mgr->ras_core) |
| 181 | return -EINVAL; |
| 182 | |
| 183 | if (!ras_mgr->ras_core->is_initialized) |
| 184 | return -EPERM; |
| 185 | |
| 186 | ras_mgr->is_paused = false; |
| 187 | |
| 188 | schedule_delayed_work(dwork: &ras_mgr->retire_page_dwork, delay: 0); |
| 189 | return 0; |
| 190 | } |
| 191 | |