// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/err.h>

#include "ras.h"
#include "ras_process.h"
26
27#define RAS_EVENT_FIFO_SIZE (128 * sizeof(struct ras_event_req))
28
29#define RAS_POLLING_ECC_TIMEOUT 300
30
31static int ras_process_put_event(struct ras_core_context *ras_core,
32 struct ras_event_req *req)
33{
34 struct ras_process *ras_proc = &ras_core->ras_proc;
35 int ret;
36
37 ret = kfifo_in_spinlocked(&ras_proc->event_fifo,
38 req, sizeof(*req), &ras_proc->fifo_spinlock);
39 if (!ret) {
40 RAS_DEV_ERR(ras_core->dev, "Poison message fifo is full!\n");
41 return -ENOSPC;
42 }
43
44 return 0;
45}
46
47static int ras_process_add_reset_gpu_event(struct ras_core_context *ras_core,
48 uint32_t reset_cause)
49{
50 struct ras_event_req req = {0};
51
52 req.reset = reset_cause;
53
54 return ras_process_put_event(ras_core, req: &req);
55}
56
57static int ras_process_get_event(struct ras_core_context *ras_core,
58 struct ras_event_req *req)
59{
60 struct ras_process *ras_proc = &ras_core->ras_proc;
61
62 return kfifo_out_spinlocked(&ras_proc->event_fifo,
63 req, sizeof(*req), &ras_proc->fifo_spinlock);
64}
65
66static void ras_process_clear_event_fifo(struct ras_core_context *ras_core)
67{
68 struct ras_event_req req;
69 int ret;
70
71 do {
72 ret = ras_process_get_event(ras_core, req: &req);
73 } while (ret);
74}
75
76#define AMDGPU_RAS_WAITING_DATA_READY 200
77static int ras_process_umc_event(struct ras_core_context *ras_core,
78 uint32_t event_count)
79{
80 struct ras_ecc_count ecc_data;
81 int ret = 0;
82 uint32_t timeout = 0;
83 uint32_t detected_de_count = 0;
84
85 do {
86 memset(&ecc_data, 0, sizeof(ecc_data));
87 ret = ras_core_update_ecc_info(ras_core);
88 if (ret)
89 return ret;
90
91 ret = ras_core_query_block_ecc_data(ras_core, block: RAS_BLOCK_ID__UMC, ecc_count: &ecc_data);
92 if (ret)
93 return ret;
94
95 if (ecc_data.new_de_count) {
96 detected_de_count += ecc_data.new_de_count;
97 timeout = 0;
98 } else {
99 if (!timeout && event_count)
100 timeout = AMDGPU_RAS_WAITING_DATA_READY;
101
102 if (timeout) {
103 if (!--timeout)
104 break;
105
106 msleep(msecs: 1);
107 }
108 }
109 } while (detected_de_count < event_count);
110
111 if (detected_de_count && ras_core_gpu_is_rma(ras_core))
112 ras_process_add_reset_gpu_event(ras_core, GPU_RESET_CAUSE_RMA);
113
114 return 0;
115}
116
117static int ras_process_non_umc_event(struct ras_core_context *ras_core)
118{
119 struct ras_process *ras_proc = &ras_core->ras_proc;
120 struct ras_event_req req;
121 uint32_t event_count = kfifo_len(&ras_proc->event_fifo);
122 uint32_t reset_flags = 0;
123 int ret = 0, i;
124
125 for (i = 0; i < event_count; i++) {
126 memset(&req, 0, sizeof(req));
127 ret = ras_process_get_event(ras_core, req: &req);
128 if (!ret)
129 continue;
130
131 ras_core_event_notify(ras_core,
132 event_id: RAS_EVENT_ID__POISON_CONSUMPTION, data: &req);
133
134 reset_flags |= req.reset;
135
136 if (req.reset == GPU_RESET_CAUSE_RMA)
137 continue;
138
139 if (req.reset)
140 RAS_DEV_INFO(ras_core->dev,
141 "{%llu} GPU reset for %s RAS poison consumption is issued!\n",
142 req.seqno, ras_core_get_ras_block_name(req.block));
143 else
144 RAS_DEV_INFO(ras_core->dev,
145 "{%llu} %s RAS poison consumption is issued!\n",
146 req.seqno, ras_core_get_ras_block_name(req.block));
147 }
148
149 if (reset_flags) {
150 ret = ras_core_event_notify(ras_core,
151 event_id: RAS_EVENT_ID__RESET_GPU, data: &reset_flags);
152 if (!ret && (reset_flags & GPU_RESET_CAUSE_RMA))
153 return -RAS_CORE_GPU_IN_MODE1_RESET;
154 }
155
156 return ret;
157}
158
159int ras_process_handle_ras_event(struct ras_core_context *ras_core)
160{
161 struct ras_process *ras_proc = &ras_core->ras_proc;
162 uint32_t umc_event_count;
163 int ret;
164
165 ret = ras_core_event_notify(ras_core,
166 event_id: RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL);
167 if (ret)
168 return ret;
169
170 ras_aca_clear_fatal_flag(ras_core);
171 ras_umc_log_pending_bad_bank(ras_core);
172
173 do {
174 umc_event_count = atomic_read(v: &ras_proc->umc_interrupt_count);
175 ret = ras_process_umc_event(ras_core, event_count: umc_event_count);
176 if (ret == -RAS_CORE_GPU_IN_MODE1_RESET)
177 break;
178
179 if (umc_event_count)
180 atomic_sub(i: umc_event_count, v: &ras_proc->umc_interrupt_count);
181 } while (atomic_read(v: &ras_proc->umc_interrupt_count));
182
183 if ((ret != -RAS_CORE_GPU_IN_MODE1_RESET) &&
184 (kfifo_len(&ras_proc->event_fifo)))
185 ret = ras_process_non_umc_event(ras_core);
186
187 if (ret == -RAS_CORE_GPU_IN_MODE1_RESET) {
188 /* Clear poison fifo */
189 ras_process_clear_event_fifo(ras_core);
190 atomic_set(v: &ras_proc->umc_interrupt_count, i: 0);
191 }
192
193 ras_core_event_notify(ras_core,
194 event_id: RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL);
195 return ret;
196}
197
198static int thread_wait_condition(void *param)
199{
200 struct ras_process *ras_proc = (struct ras_process *)param;
201
202 return (kthread_should_stop() ||
203 atomic_read(v: &ras_proc->ras_interrupt_req));
204}
205
206static int ras_process_thread(void *context)
207{
208 struct ras_core_context *ras_core = (struct ras_core_context *)context;
209 struct ras_process *ras_proc = &ras_core->ras_proc;
210
211 while (!kthread_should_stop()) {
212 ras_wait_event_interruptible_timeout(wq_head: &ras_proc->ras_process_wq,
213 condition: thread_wait_condition, param: ras_proc,
214 timeout: msecs_to_jiffies(RAS_POLLING_ECC_TIMEOUT));
215
216 if (kthread_should_stop())
217 break;
218
219 if (!ras_core->is_initialized)
220 continue;
221
222 atomic_set(v: &ras_proc->ras_interrupt_req, i: 0);
223
224 if (ras_core_gpu_in_reset(ras_core))
225 continue;
226
227 if (ras_core->sys_fn && ras_core->sys_fn->async_handle_ras_event)
228 ras_core->sys_fn->async_handle_ras_event(ras_core, NULL);
229 else
230 ras_process_handle_ras_event(ras_core);
231 }
232
233 return 0;
234}
235
236int ras_process_init(struct ras_core_context *ras_core)
237{
238 struct ras_process *ras_proc = &ras_core->ras_proc;
239 int ret;
240
241 ret = kfifo_alloc(&ras_proc->event_fifo, RAS_EVENT_FIFO_SIZE, GFP_KERNEL);
242 if (ret)
243 return ret;
244
245 spin_lock_init(&ras_proc->fifo_spinlock);
246
247 init_waitqueue_head(&ras_proc->ras_process_wq);
248
249 ras_proc->ras_process_thread = kthread_run(ras_process_thread,
250 (void *)ras_core, "ras_process_thread");
251 if (!ras_proc->ras_process_thread) {
252 RAS_DEV_ERR(ras_core->dev, "Failed to create ras_process_thread.\n");
253 ret = -ENOMEM;
254 goto err;
255 }
256
257 return 0;
258
259err:
260 ras_process_fini(ras_core);
261 return ret;
262}
263
264int ras_process_fini(struct ras_core_context *ras_core)
265{
266 struct ras_process *ras_proc = &ras_core->ras_proc;
267
268 if (ras_proc->ras_process_thread) {
269 kthread_stop(k: ras_proc->ras_process_thread);
270 ras_proc->ras_process_thread = NULL;
271 }
272
273 kfifo_free(&ras_proc->event_fifo);
274
275 return 0;
276}
277
278static int ras_process_add_umc_interrupt_req(struct ras_core_context *ras_core,
279 struct ras_event_req *req)
280{
281 struct ras_process *ras_proc = &ras_core->ras_proc;
282
283 atomic_inc(v: &ras_proc->umc_interrupt_count);
284 atomic_inc(v: &ras_proc->ras_interrupt_req);
285
286 wake_up(&ras_proc->ras_process_wq);
287 return 0;
288}
289
290static int ras_process_add_non_umc_interrupt_req(struct ras_core_context *ras_core,
291 struct ras_event_req *req)
292{
293 struct ras_process *ras_proc = &ras_core->ras_proc;
294 int ret;
295
296 ret = ras_process_put_event(ras_core, req);
297 if (!ret) {
298 atomic_inc(v: &ras_proc->ras_interrupt_req);
299 wake_up(&ras_proc->ras_process_wq);
300 }
301
302 return ret;
303}
304
305int ras_process_add_interrupt_req(struct ras_core_context *ras_core,
306 struct ras_event_req *req, bool is_umc)
307{
308 int ret;
309
310 if (!ras_core)
311 return -EINVAL;
312
313 if (!ras_core->is_initialized)
314 return -EPERM;
315
316 if (is_umc)
317 ret = ras_process_add_umc_interrupt_req(ras_core, req);
318 else
319 ret = ras_process_add_non_umc_interrupt_req(ras_core, req);
320
321 return ret;
322}
323

/* source: linux/drivers/gpu/drm/amd/ras/rascore/ras_process.c */