1/* SPDX-License-Identifier: MIT */
2/*
3 * Copyright 2025 Advanced Micro Devices, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 *
23 */
24
25#ifndef __RAS_H__
26#define __RAS_H__
27#include "ras_sys.h"
28#include "ras_umc.h"
29#include "ras_aca.h"
30#include "ras_eeprom.h"
31#include "ras_core_status.h"
32#include "ras_process.h"
33#include "ras_gfx.h"
34#include "ras_cmd.h"
35#include "ras_nbio.h"
36#include "ras_mp1.h"
37#include "ras_psp.h"
38#include "ras_log_ring.h"
39
/* Literal prefix "[Hardware Error]: "; presumably prepended to RAS log lines. */
#define RAS_HW_ERR "[Hardware Error]: "

/* Page granularity used by the helpers below: 1 << 12 = 4096 bytes. */
#define RAS_GPU_PAGE_SHIFT 12

/* Byte address -> page frame number (drops the low RAS_GPU_PAGE_SHIFT bits). */
#define RAS_ADDR_TO_PFN(addr) ((addr) >> RAS_GPU_PAGE_SHIFT)

/*
 * Page frame number -> byte address.
 *
 * The argument is widened to 64 bits before the shift; shifting in the
 * argument's own type would silently discard the upper bits of the address
 * whenever a 32-bit (or narrower) pfn is passed in.
 */
#define RAS_PFN_TO_ADDR(pfn) ((uint64_t)(pfn) << RAS_GPU_PAGE_SHIFT)

/* Bit shared by every GPU_RESET_CAUSE_* value defined below. */
#define RAS_CORE_RESET_GPU 0x10000

/* Reset causes: RAS_CORE_RESET_GPU combined with a distinct low bit each. */
#define GPU_RESET_CAUSE_POISON (RAS_CORE_RESET_GPU | 0x0001)
#define GPU_RESET_CAUSE_FATAL (RAS_CORE_RESET_GPU | 0x0002)
#define GPU_RESET_CAUSE_RMA (RAS_CORE_RESET_GPU | 0x0004)
51
/*
 * Identifiers of the hardware blocks known to the RAS core.
 *
 * IDs are consecutive and start at 0.  RAS_BLOCK_ID__LAST is deliberately
 * left without an explicit value so that it always lands one past the final
 * real ID, i.e. it equals the number of block IDs.
 */
enum ras_block_id {
	RAS_BLOCK_ID__UMC       = 0,
	RAS_BLOCK_ID__SDMA      = 1,
	RAS_BLOCK_ID__GFX       = 2,
	RAS_BLOCK_ID__MMHUB     = 3,
	RAS_BLOCK_ID__ATHUB     = 4,
	RAS_BLOCK_ID__PCIE_BIF  = 5,
	RAS_BLOCK_ID__HDP       = 6,
	RAS_BLOCK_ID__XGMI_WAFL = 7,
	RAS_BLOCK_ID__DF        = 8,
	RAS_BLOCK_ID__SMN       = 9,
	RAS_BLOCK_ID__SEM       = 10,
	RAS_BLOCK_ID__MP0       = 11,
	RAS_BLOCK_ID__MP1       = 12,
	RAS_BLOCK_ID__FUSE      = 13,
	RAS_BLOCK_ID__MCA       = 14,
	RAS_BLOCK_ID__VCN       = 15,
	RAS_BLOCK_ID__JPEG      = 16,
	RAS_BLOCK_ID__IH        = 17,
	RAS_BLOCK_ID__MPIO      = 18,

	/* keep last */
	RAS_BLOCK_ID__LAST
};
75
/*
 * ECC error kinds.  Apart from NONE (0) every value occupies its own bit.
 * NOTE(review): the one-bit-per-kind encoding suggests the values may be
 * OR-ed into a mask; confirm against the users of this enum.
 */
enum ras_ecc_err_type {
	RAS_ECC_ERR__NONE                = 0,
	RAS_ECC_ERR__PARITY              = 1 << 0,
	RAS_ECC_ERR__SINGLE_CORRECTABLE  = 1 << 1,
	RAS_ECC_ERR__MULTI_UNCORRECTABLE = 1 << 2,
	RAS_ECC_ERR__POISON              = 1 << 3,
};
83
/*
 * Error classes (UE / CE / DE), numbered consecutively from 0.
 * RAS_ERR_TYPE__LAST carries no explicit value so it stays one past the
 * final class and equals the number of classes.
 */
enum ras_err_type {
	RAS_ERR_TYPE__UE = 0,
	RAS_ERR_TYPE__CE = 1,
	RAS_ERR_TYPE__DE = 2,

	/* keep last */
	RAS_ERR_TYPE__LAST
};
90
/*
 * Kinds of sequence number; this is the selector type taken by
 * ras_core_gen_seqno(), ras_core_get_seqno() and ras_core_put_seqno().
 * 0 is reserved as the invalid value; COUNT_MAX stays implicit so it is
 * always one past the last real kind.
 */
enum ras_seqno_type {
	RAS_SEQNO_TYPE_INVALID            = 0,
	RAS_SEQNO_TYPE_UE                 = 1,
	RAS_SEQNO_TYPE_CE                 = 2,
	RAS_SEQNO_TYPE_DE                 = 3,
	RAS_SEQNO_TYPE_POISON_CONSUMPTION = 4,

	/* keep last */
	RAS_SEQNO_TYPE_COUNT_MAX,
};
99
/*
 * Identifiers for the sequence-number FIFOs.  0 is reserved as the invalid
 * value; SEQNO_FIFO_COUNT_MAX stays implicit so it is always one past the
 * last real FIFO id.
 */
enum ras_seqno_fifo {
	SEQNO_FIFO_INVALID            = 0,
	SEQNO_FIFO_POISON_CREATION    = 1,
	SEQNO_FIFO_POISON_CONSUMPTION = 2,

	/* keep last */
	SEQNO_FIFO_COUNT_MAX
};
106
/*
 * Event identifiers; this is the event_id type accepted by the
 * ras_sys_func.ras_notifier callback and by ras_core_event_notify().
 * Values are consecutive, starting at 0 for RAS_EVENT_ID__NONE.
 */
enum ras_notify_event {
	RAS_EVENT_ID__NONE                      = 0,
	RAS_EVENT_ID__BAD_PAGE_DETECTED         = 1,
	RAS_EVENT_ID__POISON_CONSUMPTION        = 2,
	RAS_EVENT_ID__RESERVE_BAD_PAGE          = 3,
	RAS_EVENT_ID__DEVICE_RMA                = 4,
	RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM       = 5,
	RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP = 6,
	RAS_EVENT_ID__FATAL_ERROR_DETECTED      = 7,
	RAS_EVENT_ID__RESET_GPU                 = 8,
	RAS_EVENT_ID__RESET_VF                  = 9,
	RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN      = 10,
	RAS_EVENT_ID__RAS_EVENT_PROC_END        = 11,
};
121
/*
 * GPU status values.  NOT_READY is 0; every other value is a distinct bit.
 * NOTE(review): presumably these are the bits reported through the
 * uint32_t written by ras_sys_func.check_gpu_status; confirm with the
 * implementation.
 */
enum ras_gpu_status {
	RAS_GPU_STATUS__NOT_READY = 0,
	RAS_GPU_STATUS__READY     = 1 << 0,
	RAS_GPU_STATUS__IN_RESET  = 1 << 1,
	RAS_GPU_STATUS__IS_RMA    = 1 << 2,
	RAS_GPU_STATUS__IS_VF     = 1 << 3,
};
129
/*
 * Forward declarations, so that pointers to these types can appear in the
 * callback tables and prototypes that follow.
 */
struct ras_aca;
struct ras_bank_ecc;
struct ras_core_context;
struct ras_log_ring;
struct ras_nbio;
struct ras_process;
struct ras_psp;
struct ras_umc;
138
139struct ras_mp1_sys_func {
140 int (*mp1_get_valid_bank_count)(struct ras_core_context *ras_core,
141 u32 msg, u32 *count);
142 int (*mp1_dump_valid_bank)(struct ras_core_context *ras_core,
143 u32 msg, u32 idx, u32 reg_idx, u64 *val);
144};
145
146struct ras_eeprom_sys_func {
147 int (*eeprom_i2c_xfer)(struct ras_core_context *ras_core,
148 u32 eeprom_addr, u8 *eeprom_buf, u32 buf_size, bool read);
149 int (*update_eeprom_i2c_config)(struct ras_core_context *ras_core);
150};
151
/*
 * Table of NBIO callbacks; referenced through ras_nbio_config.nbio_sys_fn.
 * Both callbacks take the RAS core context plus a boolean @state and
 * return an int.
 */
struct ras_nbio_sys_func {
	int (*set_ras_controller_irq_state)(struct ras_core_context *ras_core,
					    bool state);

	int (*set_ras_err_event_athub_irq_state)(struct ras_core_context *ras_core,
						 bool state);
};
158
/*
 * Broken-down time; filled in through the @tm argument of
 * ras_core_convert_timestamp_to_time().
 * NOTE(review): field names mirror struct tm, so the usual tm_* ranges
 * presumably apply; confirm with the conversion routine.
 */
struct ras_time {
	int  tm_sec;
	int  tm_min;
	int  tm_hour;
	int  tm_mday;
	int  tm_mon;
	long tm_year;	/* the only field wider than int */
};
167
/*
 * Device identification triple; filled in through the @dev_info argument of
 * ras_sys_func.get_device_system_info and ras_core_get_device_system_info().
 */
struct device_system_info {
	uint32_t device_id;
	uint32_t vendor_id;
	uint32_t socket_id;
};
173
/*
 * Memory kinds; this is the mem_type selector taken by the get_gpu_mem /
 * put_gpu_mem callbacks and by ras_core_get_gpu_mem() /
 * ras_core_put_gpu_mem().  Values are consecutive, starting at 0.
 */
enum gpu_mem_type {
	GPU_MEM_TYPE_DEFAULT       = 0,
	GPU_MEM_TYPE_RAS_PSP_RING  = 1,
	GPU_MEM_TYPE_RAS_PSP_CMD   = 2,
	GPU_MEM_TYPE_RAS_PSP_FENCE = 3,
	GPU_MEM_TYPE_RAS_TA_FW     = 4,
	GPU_MEM_TYPE_RAS_TA_CMD    = 5,
};
182
/*
 * Table of PSP callbacks; referenced through ras_psp_config.psp_sys_fn.
 * Each callback receives the RAS core context and an output structure, and
 * returns an int.
 */
struct ras_psp_sys_func {
	int (*get_ras_psp_system_status)(struct ras_core_context *ras_core,
					 struct ras_psp_sys_status *status);

	int (*get_ras_ta_init_param)(struct ras_core_context *ras_core,
				     struct ras_ta_init_param *ras_ta_param);
};
189
190struct ras_sys_func {
191 int (*gpu_reset_lock)(struct ras_core_context *ras_core,
192 bool down, bool try);
193 int (*check_gpu_status)(struct ras_core_context *ras_core,
194 uint32_t *status);
195 int (*gen_seqno)(struct ras_core_context *ras_core,
196 enum ras_seqno_type seqno_type, uint64_t *seqno);
197 int (*async_handle_ras_event)(struct ras_core_context *ras_core, void *data);
198 int (*ras_notifier)(struct ras_core_context *ras_core,
199 enum ras_notify_event event_id, void *data);
200 u64 (*get_utc_second_timestamp)(struct ras_core_context *ras_core);
201 int (*get_device_system_info)(struct ras_core_context *ras_core,
202 struct device_system_info *dev_info);
203 bool (*detect_ras_interrupt)(struct ras_core_context *ras_core);
204 int (*get_gpu_mem)(struct ras_core_context *ras_core,
205 enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
206 int (*put_gpu_mem)(struct ras_core_context *ras_core,
207 enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
208};
209
/*
 * ECC counters, one new/total pair per error class (CE, UE, DE); filled in
 * through the @ecc_count argument of ras_core_query_block_ecc_data().
 * NOTE(review): "new" presumably means "since the previous query".
 */
struct ras_ecc_count {
	/* CE */
	uint64_t new_ce_count;
	uint64_t total_ce_count;
	/* UE */
	uint64_t new_ue_count;
	uint64_t total_ue_count;
	/* DE */
	uint64_t new_de_count;
	uint64_t total_de_count;
};
218
/*
 * One bank ECC record: an nps value, a sequence number and three 64-bit
 * values (status, ipid, addr).
 * NOTE(review): status/ipid/addr look like raw bank register contents;
 * confirm with the code that populates this structure.
 */
struct ras_bank_ecc {
	uint32_t nps;
	uint64_t seq_no;
	uint64_t status;
	uint64_t ipid;
	uint64_t addr;
};
226
227struct ras_bank_ecc_node {
228 struct list_head node;
229 struct ras_bank_ecc ecc;
230};
231
232struct ras_aca_config {
233 u32 socket_num_per_hive;
234 u32 aid_num_per_socket;
235 u32 xcd_num_per_aid;
236};
237
/* MP1 configuration: just the (read-only) MP1 callback table. */
struct ras_mp1_config {
	const struct ras_mp1_sys_func *mp1_sys_fn;
};
241
/* NBIO configuration: just the (read-only) NBIO callback table. */
struct ras_nbio_config {
	const struct ras_nbio_sys_func *nbio_sys_fn;
};
245
/* PSP configuration: just the (read-only) PSP callback table. */
struct ras_psp_config {
	const struct ras_psp_sys_func *psp_sys_fn;
};
249
/* UMC configuration: a single 32-bit VRAM type value. */
struct ras_umc_config {
	uint32_t umc_vram_type;
};
253
254struct ras_eeprom_config {
255 const struct ras_eeprom_sys_func *eeprom_sys_fn;
256 int eeprom_record_threshold_config;
257 uint32_t eeprom_record_threshold_count;
258 void *eeprom_i2c_adapter;
259 u32 eeprom_i2c_addr;
260 u32 eeprom_i2c_port;
261 u16 max_i2c_read_len;
262 u16 max_i2c_write_len;
263};
264
265struct ras_core_config {
266 u32 aca_ip_version;
267 u32 umc_ip_version;
268 u32 mp1_ip_version;
269 u32 gfx_ip_version;
270 u32 nbio_ip_version;
271 u32 psp_ip_version;
272
273 bool poison_supported;
274 bool ras_eeprom_supported;
275 const struct ras_sys_func *sys_fn;
276
277 struct ras_aca_config aca_cfg;
278 struct ras_mp1_config mp1_cfg;
279 struct ras_nbio_config nbio_cfg;
280 struct ras_psp_config psp_cfg;
281 struct ras_eeprom_config eeprom_cfg;
282 struct ras_umc_config umc_cfg;
283};
284
285struct ras_core_context {
286 void *dev;
287 struct ras_core_config *config;
288 u32 socket_num_per_hive;
289 u32 aid_num_per_socket;
290 u32 xcd_num_per_aid;
291 int max_ue_banks_per_query;
292 int max_ce_banks_per_query;
293 struct ras_aca ras_aca;
294
295 bool ras_eeprom_supported;
296 struct ras_eeprom_control ras_eeprom;
297
298 struct ras_psp ras_psp;
299 struct ras_umc ras_umc;
300 struct ras_nbio ras_nbio;
301 struct ras_gfx ras_gfx;
302 struct ras_mp1 ras_mp1;
303 struct ras_process ras_proc;
304 struct ras_cmd_mgr ras_cmd;
305 struct ras_log_ring ras_log_ring;
306
307 const struct ras_sys_func *sys_fn;
308
309 /* is poison mode supported */
310 bool poison_supported;
311
312 bool is_rma;
313 bool is_initialized;
314
315 struct kfifo de_seqno_fifo;
316 struct kfifo consumption_seqno_fifo;
317 spinlock_t seqno_lock;
318
319 bool ras_core_enabled;
320};
321
/* Lifecycle: build a context from @init_config, and tear one down. */
struct ras_core_context *ras_core_create(struct ras_core_config *init_config);
void ras_core_destroy(struct ras_core_context *ras_core);

/* Software / hardware init and fini; each returns an int status. */
int ras_core_sw_init(struct ras_core_context *ras_core);
int ras_core_sw_fini(struct ras_core_context *ras_core);
int ras_core_hw_init(struct ras_core_context *ras_core);
int ras_core_hw_fini(struct ras_core_context *ras_core);

bool ras_core_is_ready(struct ras_core_context *ras_core);

/*
 * Sequence numbers, selected by @seqno_type.
 * NOTE(review): @pop presumably controls whether ras_core_get_seqno()
 * removes the value it returns; confirm with the implementation.
 */
uint64_t ras_core_gen_seqno(struct ras_core_context *ras_core,
			    enum ras_seqno_type seqno_type);
uint64_t ras_core_get_seqno(struct ras_core_context *ras_core,
			    enum ras_seqno_type seqno_type, bool pop);
int ras_core_put_seqno(struct ras_core_context *ras_core,
		       enum ras_seqno_type seqno_type, uint64_t seqno);
336
/* ECC information: refresh, and per-block query into @ecc_count. */
int ras_core_update_ecc_info(struct ras_core_context *ras_core);
int ras_core_query_block_ecc_data(struct ras_core_context *ras_core,
				  enum ras_block_id block,
				  struct ras_ecc_count *ecc_count);
340
/* Boolean GPU state queries. */
bool ras_core_gpu_in_reset(struct ras_core_context *ras_core);
bool ras_core_gpu_is_rma(struct ras_core_context *ras_core);
bool ras_core_gpu_is_vf(struct ras_core_context *ras_core);

/* NBIO interrupt and fatal-error handling entry points. */
bool ras_core_handle_nbio_irq(struct ras_core_context *ras_core, void *data);
int ras_core_handle_fatal_error(struct ras_core_context *ras_core);
346
uint32_t ras_core_get_curr_nps_mode(struct ras_core_context *ras_core);

/* Returns a string for @block_id; takes no context argument. */
const char *ras_core_get_ras_block_name(enum ras_block_id block_id);

/* Converts @timestamp into the broken-down time stored in *@tm. */
int ras_core_convert_timestamp_to_time(struct ras_core_context *ras_core,
				       uint64_t timestamp,
				       struct ras_time *tm);
351
/* Enable state of the core. */
int ras_core_set_status(struct ras_core_context *ras_core, bool enable);
bool ras_core_is_enabled(struct ras_core_context *ras_core);

uint64_t ras_core_get_utc_second_timestamp(struct ras_core_context *ras_core);

/*
 * Address translation between *@soc_pa and *@bank_addr.
 * NOTE(review): @bank_to_pa presumably selects the direction; confirm with
 * the implementation.
 */
int ras_core_translate_soc_pa_and_bank(struct ras_core_context *ras_core,
				       uint64_t *soc_pa,
				       struct umc_bank_addr *bank_addr,
				       bool bank_to_pa);

bool ras_core_ras_interrupt_detected(struct ras_core_context *ras_core);

/* Paired get/put of a memory block of kind @mem_type. */
int ras_core_get_gpu_mem(struct ras_core_context *ras_core,
			 enum gpu_mem_type mem_type,
			 struct gpu_mem_block *gpu_mem);
int ras_core_put_gpu_mem(struct ras_core_context *ras_core,
			 enum gpu_mem_type mem_type,
			 struct gpu_mem_block *gpu_mem);

bool ras_core_check_safety_watermark(struct ras_core_context *ras_core);

/* GPU reset lock: try-lock (returns int), blocking down, and up. */
int ras_core_down_trylock_gpu_reset_lock(struct ras_core_context *ras_core);
void ras_core_down_gpu_reset_lock(struct ras_core_context *ras_core);
void ras_core_up_gpu_reset_lock(struct ras_core_context *ras_core);

int ras_core_event_notify(struct ras_core_context *ras_core,
			  enum ras_notify_event event_id, void *data);
int ras_core_get_device_system_info(struct ras_core_context *ras_core,
				    struct device_system_info *dev_info);
370#endif
371

/* Source: linux/drivers/gpu/drm/amd/ras/rascore/ras.h */