| 1 | // SPDX-License-Identifier: MIT |
| 2 | /* |
| 3 | * Copyright 2025 Advanced Micro Devices, Inc. |
| 4 | * |
| 5 | * Permission is hereby granted, free of charge, to any person obtaining a |
| 6 | * copy of this software and associated documentation files (the "Software"), |
| 7 | * to deal in the Software without restriction, including without limitation |
| 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| 9 | * and/or sell copies of the Software, and to permit persons to whom the |
| 10 | * Software is furnished to do so, subject to the following conditions: |
| 11 | * |
| 12 | * The above copyright notice and this permission notice shall be included in |
| 13 | * all copies or substantial portions of the Software. |
| 14 | * |
| 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| 18 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
| 19 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
| 20 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
| 21 | * OTHER DEALINGS IN THE SOFTWARE. |
| 22 | * |
| 23 | */ |
| 24 | #include "ras.h" |
| 25 | #include "ras_core_status.h" |
| 26 | #include "ras_log_ring.h" |
| 27 | #include "ras_cper.h" |
| 28 | |
| 29 | static const struct ras_cper_guid MCE = CPER_NOTIFY__MCE; |
| 30 | static const struct ras_cper_guid CMC = CPER_NOTIFY__CMC; |
| 31 | static const struct ras_cper_guid BOOT = BOOT__TYPE; |
| 32 | |
| 33 | static const struct ras_cper_guid CRASHDUMP = GPU__CRASHDUMP; |
| 34 | static const struct ras_cper_guid RUNTIME = GPU__NONSTANDARD_ERROR; |
| 35 | |
| 36 | static void cper_get_timestamp(struct ras_core_context *ras_core, |
| 37 | struct ras_cper_timestamp *timestamp, uint64_t utc_second_timestamp) |
| 38 | { |
| 39 | struct ras_time tm = {0}; |
| 40 | |
| 41 | ras_core_convert_timestamp_to_time(ras_core, timestamp: utc_second_timestamp, tm: &tm); |
| 42 | timestamp->seconds = tm.tm_sec; |
| 43 | timestamp->minutes = tm.tm_min; |
| 44 | timestamp->hours = tm.tm_hour; |
| 45 | timestamp->flag = 0; |
| 46 | timestamp->day = tm.tm_mday; |
| 47 | timestamp->month = tm.tm_mon; |
| 48 | timestamp->year = tm.tm_year % 100; |
| 49 | timestamp->century = tm.tm_year / 100; |
| 50 | } |
| 51 | |
| 52 | static void fill_section_hdr(struct ras_core_context *ras_core, |
| 53 | struct cper_section_hdr *hdr, enum ras_cper_type type, |
| 54 | enum ras_cper_severity sev, struct ras_log_info *trace) |
| 55 | { |
| 56 | struct device_system_info dev_info = {0}; |
| 57 | char record_id[32]; |
| 58 | |
| 59 | hdr->signature[0] = 'C'; |
| 60 | hdr->signature[1] = 'P'; |
| 61 | hdr->signature[2] = 'E'; |
| 62 | hdr->signature[3] = 'R'; |
| 63 | hdr->revision = CPER_HDR__REV_1; |
| 64 | hdr->signature_end = 0xFFFFFFFF; |
| 65 | hdr->error_severity = (sev == RAS_CPER_SEV_RMA ? RAS_CPER_SEV_FATAL_UE : sev); |
| 66 | |
| 67 | hdr->valid_bits.platform_id = 1; |
| 68 | hdr->valid_bits.timestamp = 1; |
| 69 | |
| 70 | ras_core_get_device_system_info(ras_core, dev_info: &dev_info); |
| 71 | |
| 72 | cper_get_timestamp(ras_core, timestamp: &hdr->timestamp, utc_second_timestamp: trace->timestamp); |
| 73 | |
| 74 | snprintf(buf: record_id, size: sizeof(record_id), fmt: "%d:%llX" , dev_info.socket_id, |
| 75 | RAS_LOG_SEQNO_TO_BATCH_IDX(trace->seqno)); |
| 76 | memcpy(hdr->record_id, record_id, 8); |
| 77 | |
| 78 | snprintf(buf: hdr->platform_id, size: 16, fmt: "0x%04X:0x%04X" , |
| 79 | dev_info.vendor_id, dev_info.device_id); |
| 80 | /* pmfw version should be part of creator_id according to CPER spec */ |
| 81 | snprintf(buf: hdr->creator_id, size: 16, fmt: "%s" , CPER_CREATOR_ID__AMDGPU); |
| 82 | |
| 83 | switch (type) { |
| 84 | case RAS_CPER_TYPE_BOOT: |
| 85 | hdr->notify_type = BOOT; |
| 86 | break; |
| 87 | case RAS_CPER_TYPE_FATAL: |
| 88 | case RAS_CPER_TYPE_RMA: |
| 89 | hdr->notify_type = MCE; |
| 90 | break; |
| 91 | case RAS_CPER_TYPE_RUNTIME: |
| 92 | if (sev == RAS_CPER_SEV_NON_FATAL_CE) |
| 93 | hdr->notify_type = CMC; |
| 94 | else |
| 95 | hdr->notify_type = MCE; |
| 96 | break; |
| 97 | default: |
| 98 | RAS_DEV_ERR(ras_core->dev, "Unknown CPER Type\n" ); |
| 99 | break; |
| 100 | } |
| 101 | } |
| 102 | |
| 103 | static int fill_section_descriptor(struct ras_core_context *ras_core, |
| 104 | struct cper_section_descriptor *descriptor, |
| 105 | enum ras_cper_severity sev, |
| 106 | struct ras_cper_guid sec_type, |
| 107 | uint32_t section_offset, |
| 108 | uint32_t section_length) |
| 109 | { |
| 110 | struct device_system_info dev_info = {0}; |
| 111 | |
| 112 | descriptor->revision_minor = CPER_SEC__MINOR_REV_1; |
| 113 | descriptor->revision_major = CPER_SEC__MAJOR_REV_22; |
| 114 | descriptor->sec_offset = section_offset; |
| 115 | descriptor->sec_length = section_length; |
| 116 | descriptor->valid_bits.fru_text = 1; |
| 117 | descriptor->flag_bits.primary = 1; |
| 118 | descriptor->severity = (sev == RAS_CPER_SEV_RMA ? RAS_CPER_SEV_FATAL_UE : sev); |
| 119 | descriptor->sec_type = sec_type; |
| 120 | |
| 121 | ras_core_get_device_system_info(ras_core, dev_info: &dev_info); |
| 122 | |
| 123 | snprintf(buf: descriptor->fru_text, size: 20, fmt: "OAM%d" , dev_info.socket_id); |
| 124 | |
| 125 | if (sev == RAS_CPER_SEV_RMA) |
| 126 | descriptor->flag_bits.exceed_err_threshold = 1; |
| 127 | |
| 128 | if (sev == RAS_CPER_SEV_NON_FATAL_UE) |
| 129 | descriptor->flag_bits.latent_err = 1; |
| 130 | |
| 131 | return 0; |
| 132 | } |
| 133 | |
| 134 | static int fill_section_fatal(struct ras_core_context *ras_core, |
| 135 | struct cper_section_fatal *fatal, struct ras_log_info *trace) |
| 136 | { |
| 137 | fatal->data.reg_ctx_type = CPER_CTX_TYPE__CRASH; |
| 138 | fatal->data.reg_arr_size = sizeof(fatal->data.reg); |
| 139 | |
| 140 | fatal->data.reg.status = trace->aca_reg.regs[RAS_CPER_ACA_REG_STATUS]; |
| 141 | fatal->data.reg.addr = trace->aca_reg.regs[RAS_CPER_ACA_REG_ADDR]; |
| 142 | fatal->data.reg.ipid = trace->aca_reg.regs[RAS_CPER_ACA_REG_IPID]; |
| 143 | fatal->data.reg.synd = trace->aca_reg.regs[RAS_CPER_ACA_REG_SYND]; |
| 144 | |
| 145 | return 0; |
| 146 | } |
| 147 | |
| 148 | static int fill_section_runtime(struct ras_core_context *ras_core, |
| 149 | struct cper_section_runtime *runtime, struct ras_log_info *trace, |
| 150 | enum ras_cper_severity sev) |
| 151 | { |
| 152 | runtime->hdr.valid_bits.err_info_cnt = 1; |
| 153 | runtime->hdr.valid_bits.err_context_cnt = 1; |
| 154 | |
| 155 | runtime->descriptor.error_type = RUNTIME; |
| 156 | runtime->descriptor.ms_chk_bits.err_type_valid = 1; |
| 157 | if (sev == RAS_CPER_SEV_RMA) { |
| 158 | runtime->descriptor.valid_bits.ms_chk = 1; |
| 159 | runtime->descriptor.ms_chk_bits.err_type = 1; |
| 160 | runtime->descriptor.ms_chk_bits.pcc = 1; |
| 161 | } |
| 162 | |
| 163 | runtime->reg.reg_ctx_type = CPER_CTX_TYPE__CRASH; |
| 164 | runtime->reg.reg_arr_size = sizeof(runtime->reg.reg_dump); |
| 165 | |
| 166 | runtime->reg.reg_dump[RAS_CPER_ACA_REG_CTL] = trace->aca_reg.regs[ACA_REG_IDX__CTL]; |
| 167 | runtime->reg.reg_dump[RAS_CPER_ACA_REG_STATUS] = trace->aca_reg.regs[ACA_REG_IDX__STATUS]; |
| 168 | runtime->reg.reg_dump[RAS_CPER_ACA_REG_ADDR] = trace->aca_reg.regs[ACA_REG_IDX__ADDR]; |
| 169 | runtime->reg.reg_dump[RAS_CPER_ACA_REG_MISC0] = trace->aca_reg.regs[ACA_REG_IDX__MISC0]; |
| 170 | runtime->reg.reg_dump[RAS_CPER_ACA_REG_CONFIG] = trace->aca_reg.regs[ACA_REG_IDX__CONFG]; |
| 171 | runtime->reg.reg_dump[RAS_CPER_ACA_REG_IPID] = trace->aca_reg.regs[ACA_REG_IDX__IPID]; |
| 172 | runtime->reg.reg_dump[RAS_CPER_ACA_REG_SYND] = trace->aca_reg.regs[ACA_REG_IDX__SYND]; |
| 173 | |
| 174 | return 0; |
| 175 | } |
| 176 | |
| 177 | static int cper_generate_runtime_record(struct ras_core_context *ras_core, |
| 178 | struct cper_section_hdr *hdr, struct ras_log_info **trace_arr, uint32_t arr_num, |
| 179 | enum ras_cper_severity sev) |
| 180 | { |
| 181 | struct cper_section_descriptor *descriptor; |
| 182 | struct cper_section_runtime *runtime; |
| 183 | int i; |
| 184 | |
| 185 | fill_section_hdr(ras_core, hdr, type: RAS_CPER_TYPE_RUNTIME, sev, trace: trace_arr[0]); |
| 186 | hdr->record_length = RAS_HDR_LEN + ((RAS_SEC_DESC_LEN + RAS_NONSTD_SEC_LEN) * arr_num); |
| 187 | hdr->sec_cnt = arr_num; |
| 188 | for (i = 0; i < arr_num; i++) { |
| 189 | descriptor = (struct cper_section_descriptor *)((uint8_t *)hdr + |
| 190 | RAS_SEC_DESC_OFFSET(i)); |
| 191 | runtime = (struct cper_section_runtime *)((uint8_t *)hdr + |
| 192 | RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i)); |
| 193 | |
| 194 | fill_section_descriptor(ras_core, descriptor, sev, sec_type: RUNTIME, |
| 195 | RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i), |
| 196 | section_length: sizeof(struct cper_section_runtime)); |
| 197 | fill_section_runtime(ras_core, runtime, trace: trace_arr[i], sev); |
| 198 | } |
| 199 | |
| 200 | return 0; |
| 201 | } |
| 202 | |
| 203 | static int cper_generate_fatal_record(struct ras_core_context *ras_core, |
| 204 | uint8_t *buffer, struct ras_log_info **trace_arr, uint32_t arr_num) |
| 205 | { |
| 206 | struct ras_cper_fatal_record record = {0}; |
| 207 | int i = 0; |
| 208 | |
| 209 | for (i = 0; i < arr_num; i++) { |
| 210 | fill_section_hdr(ras_core, hdr: &record.hdr, type: RAS_CPER_TYPE_FATAL, |
| 211 | sev: RAS_CPER_SEV_FATAL_UE, trace: trace_arr[i]); |
| 212 | record.hdr.record_length = RAS_HDR_LEN + RAS_SEC_DESC_LEN + RAS_FATAL_SEC_LEN; |
| 213 | record.hdr.sec_cnt = 1; |
| 214 | |
| 215 | fill_section_descriptor(ras_core, descriptor: &record.descriptor, sev: RAS_CPER_SEV_FATAL_UE, |
| 216 | sec_type: CRASHDUMP, offsetof(struct ras_cper_fatal_record, fatal), |
| 217 | section_length: sizeof(struct cper_section_fatal)); |
| 218 | |
| 219 | fill_section_fatal(ras_core, fatal: &record.fatal, trace: trace_arr[i]); |
| 220 | |
| 221 | memcpy(buffer + (i * record.hdr.record_length), |
| 222 | &record, record.hdr.record_length); |
| 223 | } |
| 224 | |
| 225 | return 0; |
| 226 | } |
| 227 | |
| 228 | static int cper_get_record_size(enum ras_cper_type type, uint16_t section_count) |
| 229 | { |
| 230 | int size = 0; |
| 231 | |
| 232 | size += RAS_HDR_LEN; |
| 233 | size += (RAS_SEC_DESC_LEN * section_count); |
| 234 | |
| 235 | switch (type) { |
| 236 | case RAS_CPER_TYPE_RUNTIME: |
| 237 | case RAS_CPER_TYPE_RMA: |
| 238 | size += (RAS_NONSTD_SEC_LEN * section_count); |
| 239 | break; |
| 240 | case RAS_CPER_TYPE_FATAL: |
| 241 | size += (RAS_FATAL_SEC_LEN * section_count); |
| 242 | size += (RAS_HDR_LEN * (section_count - 1)); |
| 243 | break; |
| 244 | case RAS_CPER_TYPE_BOOT: |
| 245 | size += (RAS_BOOT_SEC_LEN * section_count); |
| 246 | break; |
| 247 | default: |
| 248 | /* should never reach here */ |
| 249 | break; |
| 250 | } |
| 251 | |
| 252 | return size; |
| 253 | } |
| 254 | |
| 255 | static enum ras_cper_type cper_ras_log_event_to_cper_type(enum ras_log_event event) |
| 256 | { |
| 257 | switch (event) { |
| 258 | case RAS_LOG_EVENT_UE: |
| 259 | return RAS_CPER_TYPE_FATAL; |
| 260 | case RAS_LOG_EVENT_DE: |
| 261 | case RAS_LOG_EVENT_CE: |
| 262 | case RAS_LOG_EVENT_POISON_CREATION: |
| 263 | case RAS_LOG_EVENT_POISON_CONSUMPTION: |
| 264 | return RAS_CPER_TYPE_RUNTIME; |
| 265 | case RAS_LOG_EVENT_RMA: |
| 266 | return RAS_CPER_TYPE_RMA; |
| 267 | default: |
| 268 | /* should never reach here */ |
| 269 | return RAS_CPER_TYPE_RUNTIME; |
| 270 | } |
| 271 | } |
| 272 | |
| 273 | int ras_cper_generate_cper(struct ras_core_context *ras_core, |
| 274 | struct ras_log_info **trace_list, uint32_t count, |
| 275 | uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len) |
| 276 | { |
| 277 | uint8_t *buffer = buf; |
| 278 | uint64_t buf_size = buf_len; |
| 279 | int record_size, saved_size = 0; |
| 280 | struct cper_section_hdr *hdr; |
| 281 | |
| 282 | /* All the batch traces share the same event */ |
| 283 | record_size = cper_get_record_size( |
| 284 | type: cper_ras_log_event_to_cper_type(event: trace_list[0]->event), section_count: count); |
| 285 | |
| 286 | if ((record_size + saved_size) > buf_size) |
| 287 | return -ENOMEM; |
| 288 | |
| 289 | hdr = (struct cper_section_hdr *)(buffer + saved_size); |
| 290 | |
| 291 | switch (trace_list[0]->event) { |
| 292 | case RAS_LOG_EVENT_RMA: |
| 293 | cper_generate_runtime_record(ras_core, hdr, trace_arr: trace_list, arr_num: count, sev: RAS_CPER_SEV_RMA); |
| 294 | break; |
| 295 | case RAS_LOG_EVENT_DE: |
| 296 | cper_generate_runtime_record(ras_core, |
| 297 | hdr, trace_arr: trace_list, arr_num: count, sev: RAS_CPER_SEV_NON_FATAL_UE); |
| 298 | break; |
| 299 | case RAS_LOG_EVENT_CE: |
| 300 | cper_generate_runtime_record(ras_core, |
| 301 | hdr, trace_arr: trace_list, arr_num: count, sev: RAS_CPER_SEV_NON_FATAL_CE); |
| 302 | break; |
| 303 | case RAS_LOG_EVENT_UE: |
| 304 | cper_generate_fatal_record(ras_core, buffer: buffer + saved_size, trace_arr: trace_list, arr_num: count); |
| 305 | break; |
| 306 | default: |
| 307 | RAS_DEV_WARN(ras_core->dev, "Unprocessed trace event: %d\n" , trace_list[0]->event); |
| 308 | break; |
| 309 | } |
| 310 | |
| 311 | saved_size += record_size; |
| 312 | |
| 313 | *real_data_len = saved_size; |
| 314 | return 0; |
| 315 | } |
| 316 | |