1// SPDX-License-Identifier: MIT
2/*
3 * Copyright 2025 Advanced Micro Devices, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 *
23 */
24#include "amdgpu.h"
25#include "amdgpu_reset.h"
26#include "amdgpu_xgmi.h"
27#include "ras_sys.h"
28#include "amdgpu_ras_mgr.h"
29#include "amdgpu_ras_cmd.h"
30#include "amdgpu_ras_process.h"
31#include "amdgpu_ras_eeprom_i2c.h"
32#include "amdgpu_ras_mp1_v13_0.h"
33#include "amdgpu_ras_nbio_v7_9.h"
34
/* Maximum device topology assumed by the RAS ACA configuration. */
#define MAX_SOCKET_NUM_PER_HIVE 8
#define MAX_AID_NUM_PER_SOCKET 4
#define MAX_XCD_NUM_PER_AID 2

/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define TYPICAL_ECC_BAD_PAGE_RATE (100ULL * SZ_1M)

/* Bad-page count for a VRAM byte size: (size / 2MB) * 16.
 * NOTE(review): presumably 16 retirable pages per 2MB DRAM row — confirm.
 */
#define COUNT_BAD_PAGE_THRESHOLD(size) (((size) >> 21) << 4)

/* Reserve 8 physical dram row for possible retirement.
 * In worst cases, it will lose 8 * 2MB memory in vram domain
 */
#define RAS_RESERVED_VRAM_SIZE_DEFAULT (16ULL << 20)
48
49
50static void ras_mgr_init_event_mgr(struct ras_event_manager *mgr)
51{
52 struct ras_event_state *event_state;
53 int i;
54
55 memset(mgr, 0, sizeof(*mgr));
56 atomic64_set(v: &mgr->seqno, i: 0);
57
58 for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) {
59 event_state = &mgr->event_state[i];
60 event_state->last_seqno = RAS_EVENT_INVALID_ID;
61 atomic64_set(v: &event_state->count, i: 0);
62 }
63}
64
65static void amdgpu_ras_mgr_init_event_mgr(struct ras_core_context *ras_core)
66{
67 struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
68 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
69 struct ras_event_manager *event_mgr;
70 struct amdgpu_hive_info *hive;
71
72 hive = amdgpu_get_xgmi_hive(adev);
73 event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr;
74
75 /* init event manager with node 0 on xgmi system */
76 if (!amdgpu_reset_in_recovery(adev)) {
77 if (!hive || adev->gmc.xgmi.node_id == 0)
78 ras_mgr_init_event_mgr(mgr: event_mgr);
79 }
80
81 if (hive)
82 amdgpu_put_xgmi_hive(hive);
83}
84
85static int amdgpu_ras_mgr_init_aca_config(struct amdgpu_device *adev,
86 struct ras_core_config *config)
87{
88 struct ras_aca_config *aca_cfg = &config->aca_cfg;
89
90 aca_cfg->socket_num_per_hive = MAX_SOCKET_NUM_PER_HIVE;
91 aca_cfg->aid_num_per_socket = MAX_AID_NUM_PER_SOCKET;
92 aca_cfg->xcd_num_per_aid = MAX_XCD_NUM_PER_AID;
93
94 return 0;
95}
96
97static int amdgpu_ras_mgr_init_eeprom_config(struct amdgpu_device *adev,
98 struct ras_core_config *config)
99{
100 struct ras_eeprom_config *eeprom_cfg = &config->eeprom_cfg;
101
102 eeprom_cfg->eeprom_sys_fn = &amdgpu_ras_eeprom_i2c_sys_func;
103 eeprom_cfg->eeprom_i2c_adapter = adev->pm.ras_eeprom_i2c_bus;
104 if (eeprom_cfg->eeprom_i2c_adapter) {
105 const struct i2c_adapter_quirks *quirks =
106 ((struct i2c_adapter *)eeprom_cfg->eeprom_i2c_adapter)->quirks;
107
108 if (quirks) {
109 eeprom_cfg->max_i2c_read_len = quirks->max_read_len;
110 eeprom_cfg->max_i2c_write_len = quirks->max_write_len;
111 }
112 }
113
114 /*
115 * amdgpu_bad_page_threshold is used to config
116 * the threshold for the number of bad pages.
117 * -1: Threshold is set to default value
118 * Driver will issue a warning message when threshold is reached
119 * and continue runtime services.
120 * 0: Disable bad page retirement
121 * Driver will not retire bad pages
122 * which is intended for debugging purpose.
123 * -2: Threshold is determined by a formula
124 * that assumes 1 bad page per 100M of local memory.
125 * Driver will continue runtime services when threhold is reached.
126 * 0 < threshold < max number of bad page records in EEPROM,
127 * A user-defined threshold is set
128 * Driver will halt runtime services when this custom threshold is reached.
129 */
130 if (amdgpu_bad_page_threshold == NONSTOP_OVER_THRESHOLD)
131 eeprom_cfg->eeprom_record_threshold_count =
132 div64_u64(dividend: adev->gmc.mc_vram_size, TYPICAL_ECC_BAD_PAGE_RATE);
133 else if (amdgpu_bad_page_threshold == WARN_NONSTOP_OVER_THRESHOLD)
134 eeprom_cfg->eeprom_record_threshold_count =
135 COUNT_BAD_PAGE_THRESHOLD(RAS_RESERVED_VRAM_SIZE_DEFAULT);
136 else
137 eeprom_cfg->eeprom_record_threshold_count = amdgpu_bad_page_threshold;
138
139 eeprom_cfg->eeprom_record_threshold_config = amdgpu_bad_page_threshold;
140
141 return 0;
142}
143
144static int amdgpu_ras_mgr_init_mp1_config(struct amdgpu_device *adev,
145 struct ras_core_config *config)
146{
147 struct ras_mp1_config *mp1_cfg = &config->mp1_cfg;
148 int ret = 0;
149
150 switch (config->mp1_ip_version) {
151 case IP_VERSION(13, 0, 6):
152 case IP_VERSION(13, 0, 14):
153 case IP_VERSION(13, 0, 12):
154 mp1_cfg->mp1_sys_fn = &amdgpu_ras_mp1_sys_func_v13_0;
155 break;
156 default:
157 RAS_DEV_ERR(adev,
158 "The mp1(0x%x) ras config is not right!\n",
159 config->mp1_ip_version);
160 ret = -EINVAL;
161 break;
162 }
163
164 return ret;
165}
166
167static int amdgpu_ras_mgr_init_nbio_config(struct amdgpu_device *adev,
168 struct ras_core_config *config)
169{
170 struct ras_nbio_config *nbio_cfg = &config->nbio_cfg;
171 int ret = 0;
172
173 switch (config->nbio_ip_version) {
174 case IP_VERSION(7, 9, 0):
175 case IP_VERSION(7, 9, 1):
176 nbio_cfg->nbio_sys_fn = &amdgpu_ras_nbio_sys_func_v7_9;
177 break;
178 default:
179 RAS_DEV_ERR(adev,
180 "The nbio(0x%x) ras config is not right!\n",
181 config->nbio_ip_version);
182 ret = -EINVAL;
183 break;
184 }
185
186 return ret;
187}
188
189static int amdgpu_ras_mgr_get_ras_psp_system_status(struct ras_core_context *ras_core,
190 struct ras_psp_sys_status *status)
191{
192 struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
193 struct ta_context *context = &adev->psp.ras_context.context;
194
195 status->initialized = context->initialized;
196 status->session_id = context->session_id;
197 status->psp_cmd_mutex = &adev->psp.mutex;
198
199 return 0;
200}
201
202static int amdgpu_ras_mgr_get_ras_ta_init_param(struct ras_core_context *ras_core,
203 struct ras_ta_init_param *ras_ta_param)
204{
205 struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
206 uint32_t nps_mode;
207
208 if (amdgpu_ras_is_poison_mode_supported(adev))
209 ras_ta_param->poison_mode_en = 1;
210
211 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
212 ras_ta_param->dgpu_mode = 1;
213
214 ras_ta_param->xcc_mask = adev->gfx.xcc_mask;
215 ras_ta_param->channel_dis_num = hweight32(adev->gmc.m_half_use) * 2;
216
217 ras_ta_param->active_umc_mask = adev->umc.active_mask;
218
219 if (!amdgpu_ras_mgr_get_curr_nps_mode(adev, nps_mode: &nps_mode))
220 ras_ta_param->nps_mode = nps_mode;
221
222 return 0;
223}
224
/* PSP system hooks handed to the RAS core via the psp_cfg. */
const struct ras_psp_sys_func amdgpu_ras_psp_sys_func = {
	.get_ras_psp_system_status = amdgpu_ras_mgr_get_ras_psp_system_status,
	.get_ras_ta_init_param = amdgpu_ras_mgr_get_ras_ta_init_param,
};
229
230static int amdgpu_ras_mgr_init_psp_config(struct amdgpu_device *adev,
231 struct ras_core_config *config)
232{
233 struct ras_psp_config *psp_cfg = &config->psp_cfg;
234
235 psp_cfg->psp_sys_fn = &amdgpu_ras_psp_sys_func;
236
237 return 0;
238}
239
240static int amdgpu_ras_mgr_init_umc_config(struct amdgpu_device *adev,
241 struct ras_core_config *config)
242{
243 struct ras_umc_config *umc_cfg = &config->umc_cfg;
244
245 umc_cfg->umc_vram_type = adev->gmc.vram_type;
246
247 return 0;
248}
249
250static struct ras_core_context *amdgpu_ras_mgr_create_ras_core(struct amdgpu_device *adev)
251{
252 struct ras_core_config init_config;
253
254 memset(&init_config, 0, sizeof(init_config));
255
256 init_config.umc_ip_version = amdgpu_ip_version(adev, ip: UMC_HWIP, inst: 0);
257 init_config.mp1_ip_version = amdgpu_ip_version(adev, ip: MP1_HWIP, inst: 0);
258 init_config.gfx_ip_version = amdgpu_ip_version(adev, ip: GC_HWIP, inst: 0);
259 init_config.nbio_ip_version = amdgpu_ip_version(adev, ip: NBIO_HWIP, inst: 0);
260 init_config.psp_ip_version = amdgpu_ip_version(adev, ip: MP1_HWIP, inst: 0);
261
262 if (init_config.umc_ip_version == IP_VERSION(12, 0, 0) ||
263 init_config.umc_ip_version == IP_VERSION(12, 5, 0))
264 init_config.aca_ip_version = IP_VERSION(1, 0, 0);
265
266 init_config.sys_fn = &amdgpu_ras_sys_fn;
267 init_config.ras_eeprom_supported = true;
268 init_config.poison_supported =
269 amdgpu_ras_is_poison_mode_supported(adev);
270
271 amdgpu_ras_mgr_init_aca_config(adev, config: &init_config);
272 amdgpu_ras_mgr_init_eeprom_config(adev, config: &init_config);
273 amdgpu_ras_mgr_init_mp1_config(adev, config: &init_config);
274 amdgpu_ras_mgr_init_nbio_config(adev, config: &init_config);
275 amdgpu_ras_mgr_init_psp_config(adev, config: &init_config);
276 amdgpu_ras_mgr_init_umc_config(adev, config: &init_config);
277
278 return ras_core_create(init_config: &init_config);
279}
280
281static int amdgpu_ras_mgr_sw_init(struct amdgpu_ip_block *ip_block)
282{
283 struct amdgpu_device *adev = ip_block->adev;
284 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
285 struct amdgpu_ras_mgr *ras_mgr;
286 int ret = 0;
287
288 /* Disabled by default */
289 con->uniras_enabled = false;
290
291 /* Enabled only in debug mode */
292 if (adev->debug_enable_ras_aca) {
293 con->uniras_enabled = true;
294 RAS_DEV_INFO(adev, "Debug amdgpu uniras!");
295 }
296
297 if (!con->uniras_enabled)
298 return 0;
299
300 ras_mgr = kzalloc(sizeof(*ras_mgr), GFP_KERNEL);
301 if (!ras_mgr)
302 return -EINVAL;
303
304 con->ras_mgr = ras_mgr;
305 ras_mgr->adev = adev;
306
307 ras_mgr->ras_core = amdgpu_ras_mgr_create_ras_core(adev);
308 if (!ras_mgr->ras_core) {
309 RAS_DEV_ERR(adev, "Failed to create ras core!\n");
310 ret = -EINVAL;
311 goto err;
312 }
313
314 ras_mgr->ras_core->dev = adev;
315
316 amdgpu_ras_process_init(adev);
317 ras_core_sw_init(ras_core: ras_mgr->ras_core);
318 amdgpu_ras_mgr_init_event_mgr(ras_core: ras_mgr->ras_core);
319 return 0;
320
321err:
322 kfree(objp: ras_mgr);
323 return ret;
324}
325
326static int amdgpu_ras_mgr_sw_fini(struct amdgpu_ip_block *ip_block)
327{
328 struct amdgpu_device *adev = ip_block->adev;
329 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
330 struct amdgpu_ras_mgr *ras_mgr = (struct amdgpu_ras_mgr *)con->ras_mgr;
331
332 if (!con->uniras_enabled)
333 return 0;
334
335 if (!ras_mgr)
336 return 0;
337
338 amdgpu_ras_process_fini(adev);
339 ras_core_sw_fini(ras_core: ras_mgr->ras_core);
340 ras_core_destroy(ras_core: ras_mgr->ras_core);
341 ras_mgr->ras_core = NULL;
342
343 kfree(objp: con->ras_mgr);
344 con->ras_mgr = NULL;
345
346 return 0;
347}
348
349static int amdgpu_ras_mgr_hw_init(struct amdgpu_ip_block *ip_block)
350{
351 struct amdgpu_device *adev = ip_block->adev;
352 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
353 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
354 int ret;
355
356 if (!con->uniras_enabled)
357 return 0;
358
359 if (!ras_mgr || !ras_mgr->ras_core)
360 return -EINVAL;
361
362 ret = ras_core_hw_init(ras_core: ras_mgr->ras_core);
363 if (ret) {
364 RAS_DEV_ERR(adev, "Failed to initialize ras core!\n");
365 return ret;
366 }
367
368 ras_mgr->ras_is_ready = true;
369
370 amdgpu_enable_uniras(adev, enable: true);
371
372 RAS_DEV_INFO(adev, "AMDGPU RAS Is Ready.\n");
373 return 0;
374}
375
376static int amdgpu_ras_mgr_hw_fini(struct amdgpu_ip_block *ip_block)
377{
378 struct amdgpu_device *adev = ip_block->adev;
379 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
380 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
381
382 if (!con->uniras_enabled)
383 return 0;
384
385 if (!ras_mgr || !ras_mgr->ras_core)
386 return -EINVAL;
387
388 ras_core_hw_fini(ras_core: ras_mgr->ras_core);
389
390 ras_mgr->ras_is_ready = false;
391
392 return 0;
393}
394
395struct amdgpu_ras_mgr *amdgpu_ras_mgr_get_context(struct amdgpu_device *adev)
396{
397 if (!adev || !adev->psp.ras_context.ras)
398 return NULL;
399
400 return (struct amdgpu_ras_mgr *)adev->psp.ras_context.ras->ras_mgr;
401}
402
/* IP-block callbacks for RAS v1.0 (hw/sw init and fini only). */
static const struct amd_ip_funcs __maybe_unused ras_v1_0_ip_funcs = {
	.name = "ras_v1_0",
	.sw_init = amdgpu_ras_mgr_sw_init,
	.sw_fini = amdgpu_ras_mgr_sw_fini,
	.hw_init = amdgpu_ras_mgr_hw_init,
	.hw_fini = amdgpu_ras_mgr_hw_fini,
};
410
/* RAS v1.0 IP-block descriptor registered with the amdgpu IP framework. */
const struct amdgpu_ip_block_version ras_v1_0_ip_block = {
	.type = AMD_IP_BLOCK_TYPE_RAS,
	.major = 1,
	.minor = 0,
	.rev = 0,
	.funcs = &ras_v1_0_ip_funcs,
};
418
419int amdgpu_enable_uniras(struct amdgpu_device *adev, bool enable)
420{
421 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
422
423 if (!ras_mgr || !ras_mgr->ras_core)
424 return -EPERM;
425
426 if (amdgpu_sriov_vf(adev))
427 return -EPERM;
428
429 RAS_DEV_INFO(adev, "Enable amdgpu unified ras!");
430 return ras_core_set_status(ras_core: ras_mgr->ras_core, enable);
431}
432
433bool amdgpu_uniras_enabled(struct amdgpu_device *adev)
434{
435 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
436
437 if (!ras_mgr || !ras_mgr->ras_core)
438 return false;
439
440 if (amdgpu_sriov_vf(adev))
441 return false;
442
443 return ras_core_is_enabled(ras_core: ras_mgr->ras_core);
444}
445
446static bool amdgpu_ras_mgr_is_ready(struct amdgpu_device *adev)
447{
448 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
449
450 if (ras_mgr && ras_mgr->ras_core && ras_mgr->ras_is_ready &&
451 ras_core_is_ready(ras_core: ras_mgr->ras_core))
452 return true;
453
454 return false;
455}
456
457int amdgpu_ras_mgr_handle_fatal_interrupt(struct amdgpu_device *adev, void *data)
458{
459 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
460
461 if (!amdgpu_ras_mgr_is_ready(adev))
462 return -EPERM;
463
464 return ras_core_handle_nbio_irq(ras_core: ras_mgr->ras_core, data);
465}
466
467uint64_t amdgpu_ras_mgr_gen_ras_event_seqno(struct amdgpu_device *adev,
468 enum ras_seqno_type seqno_type)
469{
470 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
471 int ret;
472 uint64_t seq_no;
473
474 if (!amdgpu_ras_mgr_is_ready(adev) ||
475 (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX))
476 return 0;
477
478 seq_no = ras_core_gen_seqno(ras_core: ras_mgr->ras_core, seqno_type);
479
480 if ((seqno_type == RAS_SEQNO_TYPE_DE) ||
481 (seqno_type == RAS_SEQNO_TYPE_POISON_CONSUMPTION)) {
482 ret = ras_core_put_seqno(ras_core: ras_mgr->ras_core, seqno_type, seqno: seq_no);
483 if (ret)
484 RAS_DEV_WARN(adev, "There are too many ras interrupts!");
485 }
486
487 return seq_no;
488}
489
490int amdgpu_ras_mgr_handle_controller_interrupt(struct amdgpu_device *adev, void *data)
491{
492 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
493 struct ras_ih_info *ih_info = (struct ras_ih_info *)data;
494 uint64_t seq_no = 0;
495 int ret = 0;
496
497 if (!amdgpu_ras_mgr_is_ready(adev))
498 return -EPERM;
499
500 if (ih_info && (ih_info->block == AMDGPU_RAS_BLOCK__UMC)) {
501 if (ras_mgr->ras_core->poison_supported) {
502 seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, seqno_type: RAS_SEQNO_TYPE_DE);
503 RAS_DEV_INFO(adev,
504 "{%llu} RAS poison is created, no user action is needed.\n",
505 seq_no);
506 }
507
508 ret = amdgpu_ras_process_handle_umc_interrupt(adev, data: ih_info);
509 } else if (ras_mgr->ras_core->poison_supported) {
510 ret = amdgpu_ras_process_handle_unexpected_interrupt(adev, data: ih_info);
511 } else {
512 RAS_DEV_WARN(adev,
513 "No RAS interrupt handler for non-UMC block with poison disabled.\n");
514 }
515
516 return ret;
517}
518
519int amdgpu_ras_mgr_handle_consumer_interrupt(struct amdgpu_device *adev, void *data)
520{
521 if (!amdgpu_ras_mgr_is_ready(adev))
522 return -EPERM;
523
524 return amdgpu_ras_process_handle_consumption_interrupt(adev, data);
525}
526
527int amdgpu_ras_mgr_update_ras_ecc(struct amdgpu_device *adev)
528{
529 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
530
531 if (!amdgpu_ras_mgr_is_ready(adev))
532 return -EPERM;
533
534 return ras_core_update_ecc_info(ras_core: ras_mgr->ras_core);
535}
536
537int amdgpu_ras_mgr_reset_gpu(struct amdgpu_device *adev, uint32_t flags)
538{
539 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
540
541 if (!amdgpu_ras_mgr_is_ready(adev))
542 return -EPERM;
543
544 con->gpu_reset_flags |= flags;
545 return amdgpu_ras_reset_gpu(adev);
546}
547
548bool amdgpu_ras_mgr_check_eeprom_safety_watermark(struct amdgpu_device *adev)
549{
550 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
551
552 if (!amdgpu_ras_mgr_is_ready(adev))
553 return false;
554
555 return ras_eeprom_check_safety_watermark(ras_core: ras_mgr->ras_core);
556}
557
558int amdgpu_ras_mgr_get_curr_nps_mode(struct amdgpu_device *adev,
559 uint32_t *nps_mode)
560{
561 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
562 uint32_t mode;
563
564 if (!amdgpu_ras_mgr_is_ready(adev))
565 return -EINVAL;
566
567 mode = ras_core_get_curr_nps_mode(ras_core: ras_mgr->ras_core);
568 if (!mode || mode > AMDGPU_NPS8_PARTITION_MODE)
569 return -EINVAL;
570
571 *nps_mode = mode;
572
573 return 0;
574}
575
576bool amdgpu_ras_mgr_check_retired_addr(struct amdgpu_device *adev,
577 uint64_t addr)
578{
579 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
580
581 if (!amdgpu_ras_mgr_is_ready(adev))
582 return false;
583
584 return ras_umc_check_retired_addr(ras_core: ras_mgr->ras_core, addr);
585}
586
587bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev)
588{
589 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
590
591 if (!ras_mgr || !ras_mgr->ras_core || !ras_mgr->ras_is_ready)
592 return false;
593
594 return ras_core_gpu_is_rma(ras_core: ras_mgr->ras_core);
595}
596
597int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
598 uint32_t cmd_id, void *input, uint32_t input_size,
599 void *output, uint32_t out_size)
600{
601 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
602 struct ras_cmd_ctx *cmd_ctx;
603 uint32_t ctx_buf_size = PAGE_SIZE;
604 int ret;
605
606 if (!amdgpu_ras_mgr_is_ready(adev))
607 return -EPERM;
608
609 cmd_ctx = kzalloc(ctx_buf_size, GFP_KERNEL);
610 if (!cmd_ctx)
611 return -ENOMEM;
612
613 cmd_ctx->cmd_id = cmd_id;
614
615 memcpy(cmd_ctx->input_buff_raw, input, input_size);
616 cmd_ctx->input_size = input_size;
617 cmd_ctx->output_buf_size = ctx_buf_size - sizeof(*cmd_ctx);
618
619 ret = amdgpu_ras_submit_cmd(ras_core: ras_mgr->ras_core, cmd: cmd_ctx);
620 if (!ret && !cmd_ctx->cmd_res && output && (out_size == cmd_ctx->output_size))
621 memcpy(output, cmd_ctx->output_buff_raw, cmd_ctx->output_size);
622
623 kfree(objp: cmd_ctx);
624
625 return ret;
626}
627
628int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev)
629{
630 if (!amdgpu_ras_mgr_is_ready(adev)) {
631 RAS_DEV_ERR(adev, "Invalid ras suspend!\n");
632 return -EPERM;
633 }
634
635 amdgpu_ras_process_pre_reset(adev);
636 return 0;
637}
638
639int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev)
640{
641 if (!amdgpu_ras_mgr_is_ready(adev)) {
642 RAS_DEV_ERR(adev, "Invalid ras resume!\n");
643 return -EPERM;
644 }
645
646 amdgpu_ras_process_post_reset(adev);
647 return 0;
648}
649

source code of linux/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c