/* SPDX-License-Identifier: MIT */
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
24
25#ifndef __RAS_CMD_H__
26#define __RAS_CMD_H__
27#include "ras.h"
28#include "ras_eeprom.h"
29#include "ras_log_ring.h"
30#include "ras_cper.h"
31
/* Magic constant for RAS command device handles (its use is not shown here). */
#define RAS_CMD_DEV_HANDLE_MAGIC	0xFEEDAD00UL

/* Size in bytes of ras_cmd_ctx.input_buff_raw[]. */
#define RAS_CMD_MAX_IN_SIZE		256
/* Number of entries in ras_cmd_devices_info_rsp.devs[]. */
#define RAS_CMD_MAX_GPU_NUM		32
/* Number of entries in ras_cmd_bad_pages_info_rsp.records[]. */
#define RAS_CMD_MAX_BAD_PAGES_PER_GROUP	32

/*
 * Position of the instance value in sub_block_index of
 * ta_ras_trigger_error_input; the sub block uses the lower 12 bits.
 */
#define RAS_TA_INST_MASK		0xfffff000
#define RAS_TA_INST_SHIFT		0xc
43
/*
 * Kinds of RAS command interface. Values are assigned implicitly,
 * starting at 0 for RAS_CMD_INTERFACE_TYPE_NONE.
 *
 * NOTE(review): presumably the value carried in
 * ras_query_interface_info_rsp.interface_type - confirm against the caller.
 */
enum ras_cmd_interface_type {
	RAS_CMD_INTERFACE_TYPE_NONE,
	RAS_CMD_INTERFACE_TYPE_AMDGPU,
	RAS_CMD_INTERFACE_TYPE_VF,
	RAS_CMD_INTERFACE_TYPE_PF,
};
50
/*
 * Command-ID number space, split into four consecutive 0x10000-wide
 * ranges. Each *_START equals the previous range's *_END, so the ranges
 * are contiguous.
 */
enum ras_cmd_id_range {
	RAS_CMD_ID_COMMON_START = 0,
	RAS_CMD_ID_COMMON_END = 0x10000,
	RAS_CMD_ID_AMDGPU_START = RAS_CMD_ID_COMMON_END,
	RAS_CMD_ID_AMDGPU_END = 0x20000,
	RAS_CMD_ID_MXGPU_START = RAS_CMD_ID_AMDGPU_END,
	RAS_CMD_ID_MXGPU_END = 0x30000,
	RAS_CMD_ID_MXGPU_VF_START = RAS_CMD_ID_MXGPU_END,
	RAS_CMD_ID_MXGPU_VF_END = 0x40000,
};
61
62enum ras_cmd_id {
63 RAS_CMD__BEGIN = RAS_CMD_ID_COMMON_START,
64 RAS_CMD__QUERY_INTERFACE_INFO,
65 RAS_CMD__GET_DEVICES_INFO,
66 RAS_CMD__GET_BLOCK_ECC_STATUS,
67 RAS_CMD__INJECT_ERROR,
68 RAS_CMD__GET_BAD_PAGES,
69 RAS_CMD__CLEAR_BAD_PAGE_INFO,
70 RAS_CMD__RESET_ALL_ERROR_COUNTS,
71 RAS_CMD__GET_SAFE_FB_ADDRESS_RANGES,
72 RAS_CMD__TRANSLATE_FB_ADDRESS,
73 RAS_CMD__GET_LINK_TOPOLOGY,
74 RAS_CMD__GET_CPER_SNAPSHOT,
75 RAS_CMD__GET_CPER_RECORD,
76 RAS_CMD__GET_BATCH_TRACE_SNAPSHOT,
77 RAS_CMD__GET_BATCH_TRACE_RECORD,
78 RAS_CMD__SUPPORTED_MAX = RAS_CMD_ID_COMMON_END,
79};
80
/*
 * Result codes of a RAS command. RAS_CMD__SUCCESS is 0 and the remaining
 * codes count up from it in declaration order.
 *
 * NOTE(review): "EXEED" and "UKNOWN" are misspelled, but the identifiers
 * are part of the interface and are kept as-is for compatibility.
 * NOTE(review): presumably the value stored in ras_cmd_ctx.cmd_res -
 * confirm against the command handler.
 */
enum ras_cmd_response {
	RAS_CMD__SUCCESS = 0,
	RAS_CMD__SUCCESS_EXEED_BUFFER,
	RAS_CMD__ERROR_UKNOWN_CMD,
	RAS_CMD__ERROR_INVALID_CMD,
	RAS_CMD__ERROR_VERSION,
	RAS_CMD__ERROR_INVALID_INPUT_SIZE,
	RAS_CMD__ERROR_INVALID_INPUT_DATA,
	RAS_CMD__ERROR_DRV_INIT_FAIL,
	RAS_CMD__ERROR_ACCESS_DENIED,
	RAS_CMD__ERROR_GENERIC,
	RAS_CMD__ERROR_TIMEOUT,
};
94
/*
 * RAS error types. Every non-zero value is a distinct single bit
 * (1, 2, 4, 8), so the values can be told apart when OR-ed together.
 */
enum ras_error_type {
	RAS_TYPE_ERROR__NONE = 0,
	RAS_TYPE_ERROR__PARITY = 1,
	RAS_TYPE_ERROR__SINGLE_CORRECTABLE = 2,
	RAS_TYPE_ERROR__MULTI_UNCORRECTABLE = 4,
	RAS_TYPE_ERROR__POISON = 8,
};
102
/* Forward declarations: only pointers to these types are needed below. */
struct ras_core_context;
struct ras_cmd_ctx;
105
106struct ras_cmd_mgr {
107 struct list_head head;
108 struct ras_core_context *ras_core;
109 uint64_t dev_handle;
110};
111
/*
 * Associates a command ID with its handler.
 *
 * @cmd_id: command identifier (presumably an enum ras_cmd_id value -
 *          confirm against the table that uses this type).
 * @func:   handler taking the RAS core context, the command context and
 *          an opaque data pointer; returns int.
 */
struct ras_cmd_func_map {
	uint32_t cmd_id;
	int (*func)(struct ras_core_context *ras_core,
		    struct ras_cmd_ctx *cmd, void *data);
};
117
/*
 * Device location packed into 32 bits: function (3 bits), device (5 bits),
 * bus (8 bits) and domain (16 bits), also accessible as one raw word
 * through @u32_all.
 */
struct ras_device_bdf {
	union {
		struct {
			uint32_t function : 3;
			uint32_t device : 5;
			uint32_t bus : 8;
			uint32_t domain : 16;
		};
		uint32_t u32_all;
	};
};
129
/*
 * Extra parameters for a command: a VF index plus an opaque data pointer.
 *
 * NOTE(review): presumably what the "void *data" argument of the command
 * handlers points to - confirm against the callers.
 */
struct ras_cmd_param {
	uint32_t idx_vf;
	void *data;
};
134
135#pragma pack(push, 8)
136struct ras_cmd_ctx {
137 uint32_t magic;
138 union {
139 struct {
140 uint16_t ras_cmd_minor_ver : 10;
141 uint16_t ras_cmd_major_ver : 6;
142 };
143 uint16_t ras_cmd_ver;
144 };
145 union {
146 struct {
147 uint16_t plat_major_ver : 10;
148 uint16_t plat_minor_ver : 6;
149 };
150 uint16_t plat_ver;
151 };
152 uint32_t cmd_id;
153 uint32_t cmd_res;
154 uint32_t input_size;
155 uint32_t output_size;
156 uint32_t output_buf_size;
157 uint32_t reserved[5];
158 uint8_t input_buff_raw[RAS_CMD_MAX_IN_SIZE];
159 uint8_t output_buff_raw[];
160};
161
/* 64-bit device handle, embedded at the start of the request structures. */
struct ras_cmd_dev_handle {
	uint64_t dev_handle;
};
165
166struct ras_cmd_block_ecc_info_req {
167 struct ras_cmd_dev_handle dev;
168 uint32_t block_id;
169 uint32_t subblock_id;
170 uint32_t reserved[4];
171};
172
/*
 * Response carrying three error counters for a block.
 *
 * NOTE(review): ce/ue/de presumably stand for correctable, uncorrectable
 * and deferred errors - confirm against the code that fills this in.
 */
struct ras_cmd_block_ecc_info_rsp {
	uint32_t version;
	uint32_t ce_count;
	uint32_t ue_count;
	uint32_t de_count;
	uint32_t reserved[6];
};
180
181struct ras_cmd_inject_error_req {
182 struct ras_cmd_dev_handle dev;
183 uint32_t block_id;
184 uint32_t subblock_id;
185 uint64_t address;
186 uint32_t error_type;
187 uint32_t instance_mask;
188 union {
189 struct {
190 /* vf index */
191 uint64_t vf_idx : 6;
192 /* method of error injection. i.e persistent, coherent etc */
193 uint64_t method : 10;
194 uint64_t rsv : 48;
195 };
196 uint64_t value;
197 };
198 uint32_t reserved[8];
199};
200
/*
 * Error-injection response: a version word, reserved space and a 64-bit
 * address.
 */
struct ras_cmd_inject_error_rsp {
	uint32_t version;
	uint32_t reserved[5];
	uint64_t address;
};
206
/*
 * Description of one device: its handle, location ID, ECC enabled and
 * supported fields, VF count, ASIC type and OAM ID.
 *
 * NOTE(review): the encoding of @location_id, @ecc_enabled and
 * @ecc_supported is not visible in this header.
 */
struct ras_cmd_dev_info {
	uint64_t dev_handle;
	uint32_t location_id;
	uint32_t ecc_enabled;
	uint32_t ecc_supported;
	uint32_t vf_num;
	uint32_t asic_type;
	uint32_t oam_id;
	uint32_t reserved[8];
};
217
218struct ras_cmd_devices_info_rsp {
219 uint32_t version;
220 uint32_t dev_num;
221 uint32_t reserved[6];
222 struct ras_cmd_dev_info devs[RAS_CMD_MAX_GPU_NUM];
223};
224
/*
 * One bad-page record. @address/@offset share storage, as do @bank/@cu.
 *
 * NOTE(review): @ts is presumably a timestamp and @err_type presumably an
 * error-type code; neither encoding is visible in this header.
 */
struct ras_cmd_bad_page_record {
	union {
		uint64_t address;
		uint64_t offset;
	};
	uint64_t retired_page;
	uint64_t ts;

	uint32_t err_type;

	union {
		unsigned char bank;
		unsigned char cu;
	};

	unsigned char mem_channel;
	unsigned char mcumc_id;

	unsigned char valid;
	unsigned char reserved[8];
};
246
247struct ras_cmd_bad_pages_info_req {
248 struct ras_cmd_dev_handle device;
249 uint32_t group_index;
250 uint32_t reserved[5];
251};
252
253struct ras_cmd_bad_pages_info_rsp {
254 uint32_t version;
255 uint32_t group_index;
256 uint32_t bp_in_group;
257 uint32_t bp_total_cnt;
258 uint32_t reserved[4];
259 struct ras_cmd_bad_page_record records[RAS_CMD_MAX_BAD_PAGES_PER_GROUP];
260};
261
/* Interface-info request: carries no fields other than reserved space. */
struct ras_query_interface_info_req {
	uint32_t reserved[8];
};
265
/*
 * Interface-info response: RAS command and platform versions (major and
 * minor as separate 32-bit fields) plus a one-byte interface type.
 *
 * NOTE(review): @interface_type presumably holds an
 * enum ras_cmd_interface_type value - confirm.
 */
struct ras_query_interface_info_rsp {
	uint32_t version;
	uint32_t ras_cmd_major_ver;
	uint32_t ras_cmd_minor_ver;
	uint32_t plat_major_ver;
	uint32_t plat_minor_ver;
	uint8_t interface_type;
	uint8_t rsv[3];		/* pads interface_type to a 32-bit boundary */
	uint32_t reserved[8];
};
276
/* Number of entries in ras_cmd_ras_safe_fb_address_ranges_rsp.range[]. */
#define RAS_MAX_NUM_SAFE_RANGES		64

/*
 * Response listing framebuffer address ranges as {start, size, idx}
 * entries.
 *
 * NOTE(review): @num_ranges is presumably the number of valid entries in
 * @range, and this is presumably the output of
 * RAS_CMD__GET_SAFE_FB_ADDRESS_RANGES - confirm.
 */
struct ras_cmd_ras_safe_fb_address_ranges_rsp {
	uint32_t version;
	uint32_t num_ranges;
	uint32_t reserved[4];
	struct {
		uint64_t start;
		uint64_t size;
		uint32_t idx;
		uint32_t reserved[3];
	} range[RAS_MAX_NUM_SAFE_RANGES];
};
289
/*
 * Framebuffer address kinds used as source/destination of an address
 * translation. Values are assigned implicitly, starting at 0.
 */
enum ras_fb_addr_type {
	RAS_FB_ADDR_SOC_PHY,	/* SPA */
	RAS_FB_ADDR_BANK,
	RAS_FB_ADDR_VF_PHY,	/* GPA */
	RAS_FB_ADDR_UNKNOWN
};
296
/*
 * Framebuffer address in bank form: stack, bank group, bank, row, column,
 * channel and sub-channel.
 */
struct ras_fb_bank_addr {
	uint32_t stack_id;	/* SID */
	uint32_t bank_group;
	uint32_t bank;
	uint32_t row;
	uint32_t column;
	uint32_t channel;
	uint32_t subchannel;	/* Also called Pseudochannel (PC) */
	uint32_t reserved[3];
};
307
/* Framebuffer address in VF-physical form: a VF index plus a 64-bit address. */
struct ras_fb_vf_phy_addr {
	uint32_t vf_idx;
	uint32_t reserved;
	uint64_t addr;
};
313
314union ras_translate_fb_address {
315 struct ras_fb_bank_addr bank_addr;
316 uint64_t soc_phy_addr;
317 struct ras_fb_vf_phy_addr vf_phy_addr;
318};
319
320struct ras_cmd_translate_fb_address_req {
321 struct ras_cmd_dev_handle dev;
322 enum ras_fb_addr_type src_addr_type;
323 enum ras_fb_addr_type dest_addr_type;
324 union ras_translate_fb_address trans_addr;
325};
326
327struct ras_cmd_translate_fb_address_rsp {
328 uint32_t version;
329 uint32_t reserved[5];
330 union ras_translate_fb_address trans_addr;
331};
332
333struct ras_dev_link_topology_req {
334 struct ras_cmd_dev_handle src;
335 struct ras_cmd_dev_handle dst;
336};
337
/* Link-topology response describing a link's status, type and hop count. */
struct ras_dev_link_topology_rsp {
	uint32_t version;
	uint32_t link_status;	/* HW status of the link */
	uint32_t link_type;	/* type of the link */
	uint32_t num_hops;	/* number of hops */
	uint32_t reserved[8];
};
345
346struct ras_cmd_cper_snapshot_req {
347 struct ras_cmd_dev_handle dev;
348};
349
/*
 * CPER snapshot response: the total CPER count plus the start and latest
 * CPER IDs.
 *
 * Here the reserved words sit between @version and the payload, unlike
 * most responses in this header, which place them last.
 */
struct ras_cmd_cper_snapshot_rsp {
	uint32_t version;
	uint32_t reserved[4];
	uint32_t total_cper_num;
	uint64_t start_cper_id;
	uint64_t latest_cper_id;
};
357
358struct ras_cmd_cper_record_req {
359 struct ras_cmd_dev_handle dev;
360 uint64_t cper_start_id;
361 uint32_t cper_num;
362 uint32_t buf_size;
363 uint64_t buf_ptr;
364 uint32_t reserved[4];
365};
366
/*
 * CPER record response.
 *
 * NOTE(review): presumably the data size and record count actually
 * returned, plus the number of records still pending - confirm.
 */
struct ras_cmd_cper_record_rsp {
	uint32_t version;
	uint32_t real_data_size;
	uint32_t real_cper_num;
	uint32_t remain_num;
	uint32_t reserved[4];
};
374
375struct ras_cmd_batch_trace_snapshot_req {
376 struct ras_cmd_dev_handle dev;
377};
378
/*
 * Batch-trace snapshot response: the total batch count plus the start and
 * latest batch IDs. Field order mirrors struct ras_cmd_cper_snapshot_rsp.
 */
struct ras_cmd_batch_trace_snapshot_rsp {
	uint32_t version;
	uint32_t reserved[4];
	uint32_t total_batch_num;
	uint64_t start_batch_id;
	uint64_t latest_batch_id;
};
386
387struct ras_cmd_batch_trace_record_req {
388 struct ras_cmd_dev_handle dev;
389 uint64_t start_batch_id;
390 uint32_t batch_num;
391 uint32_t reserved[5];
392};
393
/*
 * Descriptor of one trace batch.
 *
 * NOTE(review): @offset and @trace_num presumably locate this batch's
 * entries inside ras_cmd_batch_trace_record_rsp.records[] - confirm
 * against the code that fills the response.
 */
struct batch_ras_trace_info {
	uint64_t batch_id;
	uint16_t offset;
	uint8_t trace_num;
	uint8_t rsv;
	uint32_t reserved;
};
401
402#define RAS_CMD_MAX_BATCH_NUM 300
403#define RAS_CMD_MAX_TRACE_NUM 300
404struct ras_cmd_batch_trace_record_rsp {
405 uint32_t version;
406 uint16_t real_batch_num;
407 uint16_t remain_num;
408 uint64_t start_batch_id;
409 uint32_t reserved[2];
410 struct batch_ras_trace_info batchs[RAS_CMD_MAX_BATCH_NUM];
411 struct ras_log_info records[RAS_CMD_MAX_TRACE_NUM];
412};
413
414#pragma pack(pop)
415
/*
 * RAS command interface entry points. Only the prototypes are visible in
 * this header; return-value conventions and side effects are defined by
 * the implementation.
 */

/* Set up / tear down the command interface of @ras_core. */
int ras_cmd_init(struct ras_core_context *ras_core);
int ras_cmd_fini(struct ras_core_context *ras_core);

/* Handle the command described by @cmd; @data is an opaque pointer. */
int rascore_handle_cmd(struct ras_core_context *ras_core,
		       struct ras_cmd_ctx *cmd, void *data);

/* Return the 64-bit device handle of @ras_core. */
uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core);

/* Fill @rsp with the interface information of @ras_core. */
int ras_cmd_query_interface_info(struct ras_core_context *ras_core,
				 struct ras_query_interface_info_rsp *rsp);

/*
 * Translate between an SoC physical address and its bank form. The result
 * is written through the last argument; note that @bank_addr is passed by
 * value in the bank-to-SoC direction.
 */
int ras_cmd_translate_soc_pa_to_bank(struct ras_core_context *ras_core,
				     uint64_t soc_pa,
				     struct ras_fb_bank_addr *bank_addr);
int ras_cmd_translate_bank_to_soc_pa(struct ras_core_context *ras_core,
				     struct ras_fb_bank_addr bank_addr,
				     uint64_t *soc_pa);
426#endif
427

/* source code of linux/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h */