| 1 | // SPDX-License-Identifier: MIT |
| 2 | /* |
| 3 | * Copyright 2024 Advanced Micro Devices, Inc. |
| 4 | * |
| 5 | * Permission is hereby granted, free of charge, to any person obtaining a |
| 6 | * copy of this software and associated documentation files (the "Software"), |
| 7 | * to deal in the Software without restriction, including without limitation |
| 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| 9 | * and/or sell copies of the Software, and to permit persons to whom the |
| 10 | * Software is furnished to do so, subject to the following conditions: |
| 11 | * |
| 12 | * The above copyright notice and this permission notice shall be included in |
| 13 | * all copies or substantial portions of the Software. |
| 14 | * |
| 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| 18 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
| 19 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
| 20 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
| 21 | * OTHER DEALINGS IN THE SOFTWARE. |
| 22 | * |
| 23 | */ |
| 24 | |
| 25 | #include <generated/utsrelease.h> |
| 26 | #include <linux/devcoredump.h> |
| 27 | #include "amdgpu_dev_coredump.h" |
| 28 | #include "atom.h" |
| 29 | |
| 30 | #ifndef CONFIG_DEV_COREDUMP |
/*
 * Stub used when the kernel is built without CONFIG_DEV_COREDUMP.
 * It keeps the same signature as the real implementation so callers
 * do not need their own #ifdef, and intentionally does nothing.
 */
void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
		     bool vram_lost, struct amdgpu_job *job)
{
	/* Device coredump support is compiled out: nothing to capture. */
}
| 35 | #else |
| 36 | |
| 37 | const char *hw_ip_names[MAX_HWIP] = { |
| 38 | [GC_HWIP] = "GC" , |
| 39 | [HDP_HWIP] = "HDP" , |
| 40 | [SDMA0_HWIP] = "SDMA0" , |
| 41 | [SDMA1_HWIP] = "SDMA1" , |
| 42 | [SDMA2_HWIP] = "SDMA2" , |
| 43 | [SDMA3_HWIP] = "SDMA3" , |
| 44 | [SDMA4_HWIP] = "SDMA4" , |
| 45 | [SDMA5_HWIP] = "SDMA5" , |
| 46 | [SDMA6_HWIP] = "SDMA6" , |
| 47 | [SDMA7_HWIP] = "SDMA7" , |
| 48 | [LSDMA_HWIP] = "LSDMA" , |
| 49 | [MMHUB_HWIP] = "MMHUB" , |
| 50 | [ATHUB_HWIP] = "ATHUB" , |
| 51 | [NBIO_HWIP] = "NBIO" , |
| 52 | [MP0_HWIP] = "MP0" , |
| 53 | [MP1_HWIP] = "MP1" , |
| 54 | [UVD_HWIP] = "UVD/JPEG/VCN" , |
| 55 | [VCN1_HWIP] = "VCN1" , |
| 56 | [VCE_HWIP] = "VCE" , |
| 57 | [VPE_HWIP] = "VPE" , |
| 58 | [DF_HWIP] = "DF" , |
| 59 | [DCE_HWIP] = "DCE" , |
| 60 | [OSSSYS_HWIP] = "OSSSYS" , |
| 61 | [SMUIO_HWIP] = "SMUIO" , |
| 62 | [PWR_HWIP] = "PWR" , |
| 63 | [NBIF_HWIP] = "NBIF" , |
| 64 | [THM_HWIP] = "THM" , |
| 65 | [CLK_HWIP] = "CLK" , |
| 66 | [UMC_HWIP] = "UMC" , |
| 67 | [RSMU_HWIP] = "RSMU" , |
| 68 | [XGMI_HWIP] = "XGMI" , |
| 69 | [DCI_HWIP] = "DCI" , |
| 70 | [PCIE_HWIP] = "PCIE" , |
| 71 | }; |
| 72 | |
| 73 | static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev, |
| 74 | struct drm_printer *p) |
| 75 | { |
| 76 | uint32_t version; |
| 77 | uint32_t feature; |
| 78 | uint8_t smu_program, smu_major, smu_minor, smu_debug; |
| 79 | struct atom_context *ctx = adev->mode_info.atom_context; |
| 80 | |
| 81 | drm_printf(p, f: "VCE feature version: %u, fw version: 0x%08x\n" , |
| 82 | adev->vce.fb_version, adev->vce.fw_version); |
| 83 | drm_printf(p, f: "UVD feature version: %u, fw version: 0x%08x\n" , 0, |
| 84 | adev->uvd.fw_version); |
| 85 | drm_printf(p, f: "GMC feature version: %u, fw version: 0x%08x\n" , 0, |
| 86 | adev->gmc.fw_version); |
| 87 | drm_printf(p, f: "ME feature version: %u, fw version: 0x%08x\n" , |
| 88 | adev->gfx.me_feature_version, adev->gfx.me_fw_version); |
| 89 | drm_printf(p, f: "PFP feature version: %u, fw version: 0x%08x\n" , |
| 90 | adev->gfx.pfp_feature_version, adev->gfx.pfp_fw_version); |
| 91 | drm_printf(p, f: "CE feature version: %u, fw version: 0x%08x\n" , |
| 92 | adev->gfx.ce_feature_version, adev->gfx.ce_fw_version); |
| 93 | drm_printf(p, f: "RLC feature version: %u, fw version: 0x%08x\n" , |
| 94 | adev->gfx.rlc_feature_version, adev->gfx.rlc_fw_version); |
| 95 | |
| 96 | drm_printf(p, f: "RLC SRLC feature version: %u, fw version: 0x%08x\n" , |
| 97 | adev->gfx.rlc_srlc_feature_version, |
| 98 | adev->gfx.rlc_srlc_fw_version); |
| 99 | drm_printf(p, f: "RLC SRLG feature version: %u, fw version: 0x%08x\n" , |
| 100 | adev->gfx.rlc_srlg_feature_version, |
| 101 | adev->gfx.rlc_srlg_fw_version); |
| 102 | drm_printf(p, f: "RLC SRLS feature version: %u, fw version: 0x%08x\n" , |
| 103 | adev->gfx.rlc_srls_feature_version, |
| 104 | adev->gfx.rlc_srls_fw_version); |
| 105 | drm_printf(p, f: "RLCP feature version: %u, fw version: 0x%08x\n" , |
| 106 | adev->gfx.rlcp_ucode_feature_version, |
| 107 | adev->gfx.rlcp_ucode_version); |
| 108 | drm_printf(p, f: "RLCV feature version: %u, fw version: 0x%08x\n" , |
| 109 | adev->gfx.rlcv_ucode_feature_version, |
| 110 | adev->gfx.rlcv_ucode_version); |
| 111 | drm_printf(p, f: "MEC feature version: %u, fw version: 0x%08x\n" , |
| 112 | adev->gfx.mec_feature_version, adev->gfx.mec_fw_version); |
| 113 | |
| 114 | if (adev->gfx.mec2_fw) |
| 115 | drm_printf(p, f: "MEC2 feature version: %u, fw version: 0x%08x\n" , |
| 116 | adev->gfx.mec2_feature_version, |
| 117 | adev->gfx.mec2_fw_version); |
| 118 | |
| 119 | drm_printf(p, f: "IMU feature version: %u, fw version: 0x%08x\n" , 0, |
| 120 | adev->gfx.imu_fw_version); |
| 121 | drm_printf(p, f: "PSP SOS feature version: %u, fw version: 0x%08x\n" , |
| 122 | adev->psp.sos.feature_version, adev->psp.sos.fw_version); |
| 123 | drm_printf(p, f: "PSP ASD feature version: %u, fw version: 0x%08x\n" , |
| 124 | adev->psp.asd_context.bin_desc.feature_version, |
| 125 | adev->psp.asd_context.bin_desc.fw_version); |
| 126 | |
| 127 | drm_printf(p, f: "TA XGMI feature version: 0x%08x, fw version: 0x%08x\n" , |
| 128 | adev->psp.xgmi_context.context.bin_desc.feature_version, |
| 129 | adev->psp.xgmi_context.context.bin_desc.fw_version); |
| 130 | drm_printf(p, f: "TA RAS feature version: 0x%08x, fw version: 0x%08x\n" , |
| 131 | adev->psp.ras_context.context.bin_desc.feature_version, |
| 132 | adev->psp.ras_context.context.bin_desc.fw_version); |
| 133 | drm_printf(p, f: "TA HDCP feature version: 0x%08x, fw version: 0x%08x\n" , |
| 134 | adev->psp.hdcp_context.context.bin_desc.feature_version, |
| 135 | adev->psp.hdcp_context.context.bin_desc.fw_version); |
| 136 | drm_printf(p, f: "TA DTM feature version: 0x%08x, fw version: 0x%08x\n" , |
| 137 | adev->psp.dtm_context.context.bin_desc.feature_version, |
| 138 | adev->psp.dtm_context.context.bin_desc.fw_version); |
| 139 | drm_printf(p, f: "TA RAP feature version: 0x%08x, fw version: 0x%08x\n" , |
| 140 | adev->psp.rap_context.context.bin_desc.feature_version, |
| 141 | adev->psp.rap_context.context.bin_desc.fw_version); |
| 142 | drm_printf(p, |
| 143 | f: "TA SECURE DISPLAY feature version: 0x%08x, fw version: 0x%08x\n" , |
| 144 | adev->psp.securedisplay_context.context.bin_desc.feature_version, |
| 145 | adev->psp.securedisplay_context.context.bin_desc.fw_version); |
| 146 | |
| 147 | /* SMC firmware */ |
| 148 | version = adev->pm.fw_version; |
| 149 | |
| 150 | smu_program = (version >> 24) & 0xff; |
| 151 | smu_major = (version >> 16) & 0xff; |
| 152 | smu_minor = (version >> 8) & 0xff; |
| 153 | smu_debug = (version >> 0) & 0xff; |
| 154 | drm_printf(p, |
| 155 | f: "SMC feature version: %u, program: %d, fw version: 0x%08x (%d.%d.%d)\n" , |
| 156 | 0, smu_program, version, smu_major, smu_minor, smu_debug); |
| 157 | |
| 158 | /* SDMA firmware */ |
| 159 | for (int i = 0; i < adev->sdma.num_instances; i++) { |
| 160 | drm_printf(p, |
| 161 | f: "SDMA%d feature version: %u, firmware version: 0x%08x\n" , |
| 162 | i, adev->sdma.instance[i].feature_version, |
| 163 | adev->sdma.instance[i].fw_version); |
| 164 | } |
| 165 | |
| 166 | drm_printf(p, f: "VCN feature version: %u, fw version: 0x%08x\n" , 0, |
| 167 | adev->vcn.fw_version); |
| 168 | drm_printf(p, f: "DMCU feature version: %u, fw version: 0x%08x\n" , 0, |
| 169 | adev->dm.dmcu_fw_version); |
| 170 | drm_printf(p, f: "DMCUB feature version: %u, fw version: 0x%08x\n" , 0, |
| 171 | adev->dm.dmcub_fw_version); |
| 172 | drm_printf(p, f: "PSP TOC feature version: %u, fw version: 0x%08x\n" , |
| 173 | adev->psp.toc.feature_version, adev->psp.toc.fw_version); |
| 174 | |
| 175 | version = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK; |
| 176 | feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK) >> |
| 177 | AMDGPU_MES_FEAT_VERSION_SHIFT; |
| 178 | drm_printf(p, f: "MES_KIQ feature version: %u, fw version: 0x%08x\n" , |
| 179 | feature, version); |
| 180 | |
| 181 | version = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK; |
| 182 | feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK) >> |
| 183 | AMDGPU_MES_FEAT_VERSION_SHIFT; |
| 184 | drm_printf(p, f: "MES feature version: %u, fw version: 0x%08x\n" , feature, |
| 185 | version); |
| 186 | |
| 187 | drm_printf(p, f: "VPE feature version: %u, fw version: 0x%08x\n" , |
| 188 | adev->vpe.feature_version, adev->vpe.fw_version); |
| 189 | |
| 190 | drm_printf(p, f: "\nVBIOS Information\n" ); |
| 191 | drm_printf(p, f: "vbios name : %s\n" , ctx->name); |
| 192 | drm_printf(p, f: "vbios pn : %s\n" , ctx->vbios_pn); |
| 193 | drm_printf(p, f: "vbios version : %d\n" , ctx->version); |
| 194 | drm_printf(p, f: "vbios ver_str : %s\n" , ctx->vbios_ver_str); |
| 195 | drm_printf(p, f: "vbios date : %s\n" , ctx->date); |
| 196 | } |
| 197 | |
| 198 | static ssize_t |
| 199 | amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, |
| 200 | void *data, size_t datalen) |
| 201 | { |
| 202 | struct drm_printer p; |
| 203 | struct amdgpu_coredump_info *coredump = data; |
| 204 | struct drm_print_iterator iter; |
| 205 | struct amdgpu_vm_fault_info *fault_info; |
| 206 | struct amdgpu_ip_block *ip_block; |
| 207 | int ver; |
| 208 | |
| 209 | iter.data = buffer; |
| 210 | iter.offset = 0; |
| 211 | iter.start = offset; |
| 212 | iter.remain = count; |
| 213 | |
| 214 | p = drm_coredump_printer(iter: &iter); |
| 215 | |
| 216 | drm_printf(p: &p, f: "**** AMDGPU Device Coredump ****\n" ); |
| 217 | drm_printf(p: &p, f: "version: " AMDGPU_COREDUMP_VERSION "\n" ); |
| 218 | drm_printf(p: &p, f: "kernel: " UTS_RELEASE "\n" ); |
| 219 | drm_printf(p: &p, f: "module: " KBUILD_MODNAME "\n" ); |
| 220 | drm_printf(p: &p, f: "time: %ptSp\n" , &coredump->reset_time); |
| 221 | |
| 222 | if (coredump->reset_task_info.task.pid) |
| 223 | drm_printf(p: &p, f: "process_name: %s PID: %d\n" , |
| 224 | coredump->reset_task_info.process_name, |
| 225 | coredump->reset_task_info.task.pid); |
| 226 | |
| 227 | /* SOC Information */ |
| 228 | drm_printf(p: &p, f: "\nSOC Information\n" ); |
| 229 | drm_printf(p: &p, f: "SOC Device id: %d\n" , coredump->adev->pdev->device); |
| 230 | drm_printf(p: &p, f: "SOC PCI Revision id: %d\n" , coredump->adev->pdev->revision); |
| 231 | drm_printf(p: &p, f: "SOC Family: %d\n" , coredump->adev->family); |
| 232 | drm_printf(p: &p, f: "SOC Revision id: %d\n" , coredump->adev->rev_id); |
| 233 | drm_printf(p: &p, f: "SOC External Revision id: %d\n" , coredump->adev->external_rev_id); |
| 234 | |
| 235 | /* Memory Information */ |
| 236 | drm_printf(p: &p, f: "\nSOC Memory Information\n" ); |
| 237 | drm_printf(p: &p, f: "real vram size: %llu\n" , coredump->adev->gmc.real_vram_size); |
| 238 | drm_printf(p: &p, f: "visible vram size: %llu\n" , coredump->adev->gmc.visible_vram_size); |
| 239 | drm_printf(p: &p, f: "gtt size: %llu\n" , coredump->adev->mman.gtt_mgr.manager.size); |
| 240 | |
| 241 | /* GDS Config */ |
| 242 | drm_printf(p: &p, f: "\nGDS Config\n" ); |
| 243 | drm_printf(p: &p, f: "gds: total size: %d\n" , coredump->adev->gds.gds_size); |
| 244 | drm_printf(p: &p, f: "gds: compute partition size: %d\n" , coredump->adev->gds.gds_size); |
| 245 | drm_printf(p: &p, f: "gds: gws per compute partition: %d\n" , coredump->adev->gds.gws_size); |
| 246 | drm_printf(p: &p, f: "gds: os per compute partition: %d\n" , coredump->adev->gds.oa_size); |
| 247 | |
| 248 | /* HWIP Version Information */ |
| 249 | drm_printf(p: &p, f: "\nHW IP Version Information\n" ); |
| 250 | for (int i = 1; i < MAX_HWIP; i++) { |
| 251 | for (int j = 0; j < HWIP_MAX_INSTANCE; j++) { |
| 252 | ver = coredump->adev->ip_versions[i][j]; |
| 253 | if (ver) |
| 254 | drm_printf(p: &p, f: "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n" , |
| 255 | hw_ip_names[i], i, j, |
| 256 | IP_VERSION_MAJ(ver), |
| 257 | IP_VERSION_MIN(ver), |
| 258 | IP_VERSION_REV(ver), |
| 259 | IP_VERSION_VARIANT(ver), |
| 260 | IP_VERSION_SUBREV(ver)); |
| 261 | } |
| 262 | } |
| 263 | |
| 264 | /* IP firmware information */ |
| 265 | drm_printf(p: &p, f: "\nIP Firmwares\n" ); |
| 266 | amdgpu_devcoredump_fw_info(adev: coredump->adev, p: &p); |
| 267 | |
| 268 | if (coredump->ring) { |
| 269 | drm_printf(p: &p, f: "\nRing timed out details\n" ); |
| 270 | drm_printf(p: &p, f: "IP Type: %d Ring Name: %s\n" , |
| 271 | coredump->ring->funcs->type, |
| 272 | coredump->ring->name); |
| 273 | } |
| 274 | |
| 275 | /* Add page fault information */ |
| 276 | fault_info = &coredump->adev->vm_manager.fault_info; |
| 277 | drm_printf(p: &p, f: "\n[%s] Page fault observed\n" , |
| 278 | fault_info->vmhub ? "mmhub" : "gfxhub" ); |
| 279 | drm_printf(p: &p, f: "Faulty page starting at address: 0x%016llx\n" , fault_info->addr); |
| 280 | drm_printf(p: &p, f: "Protection fault status register: 0x%x\n\n" , fault_info->status); |
| 281 | |
| 282 | /* dump the ip state for each ip */ |
| 283 | drm_printf(p: &p, f: "IP Dump\n" ); |
| 284 | for (int i = 0; i < coredump->adev->num_ip_blocks; i++) { |
| 285 | ip_block = &coredump->adev->ip_blocks[i]; |
| 286 | if (ip_block->version->funcs->print_ip_state) { |
| 287 | drm_printf(p: &p, f: "IP: %s\n" , ip_block->version->funcs->name); |
| 288 | ip_block->version->funcs->print_ip_state(ip_block, &p); |
| 289 | drm_printf(p: &p, f: "\n" ); |
| 290 | } |
| 291 | } |
| 292 | |
| 293 | /* Add ring buffer information */ |
| 294 | drm_printf(p: &p, f: "Ring buffer information\n" ); |
| 295 | for (int i = 0; i < coredump->adev->num_rings; i++) { |
| 296 | int j = 0; |
| 297 | struct amdgpu_ring *ring = coredump->adev->rings[i]; |
| 298 | |
| 299 | drm_printf(p: &p, f: "ring name: %s\n" , ring->name); |
| 300 | drm_printf(p: &p, f: "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n" , |
| 301 | amdgpu_ring_get_rptr(ring), |
| 302 | amdgpu_ring_get_wptr(ring), |
| 303 | ring->buf_mask); |
| 304 | drm_printf(p: &p, f: "Ring size in dwords: %d\n" , |
| 305 | ring->ring_size / 4); |
| 306 | drm_printf(p: &p, f: "Ring contents\n" ); |
| 307 | drm_printf(p: &p, f: "Offset \t Value\n" ); |
| 308 | |
| 309 | while (j < ring->ring_size) { |
| 310 | drm_printf(p: &p, f: "0x%x \t 0x%x\n" , j, ring->ring[j / 4]); |
| 311 | j += 4; |
| 312 | } |
| 313 | } |
| 314 | |
| 315 | if (coredump->skip_vram_check) |
| 316 | drm_printf(p: &p, f: "VRAM lost check is skipped!\n" ); |
| 317 | else if (coredump->reset_vram_lost) |
| 318 | drm_printf(p: &p, f: "VRAM is lost due to GPU reset!\n" ); |
| 319 | |
| 320 | return count - iter.remain; |
| 321 | } |
| 322 | |
/*
 * amdgpu_devcoredump_free - devcoredump free callback
 * @data: the coredump info allocated in amdgpu_coredump()
 *
 * Releases the coredump info structure with kfree().
 */
static void amdgpu_devcoredump_free(void *data)
{
	kfree(data);
}
| 327 | |
| 328 | void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, |
| 329 | bool vram_lost, struct amdgpu_job *job) |
| 330 | { |
| 331 | struct drm_device *dev = adev_to_drm(adev); |
| 332 | struct amdgpu_coredump_info *coredump; |
| 333 | struct drm_sched_job *s_job; |
| 334 | |
| 335 | coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); |
| 336 | |
| 337 | if (!coredump) { |
| 338 | DRM_ERROR("%s: failed to allocate memory for coredump\n" , __func__); |
| 339 | return; |
| 340 | } |
| 341 | |
| 342 | coredump->skip_vram_check = skip_vram_check; |
| 343 | coredump->reset_vram_lost = vram_lost; |
| 344 | |
| 345 | if (job && job->pasid) { |
| 346 | struct amdgpu_task_info *ti; |
| 347 | |
| 348 | ti = amdgpu_vm_get_task_info_pasid(adev, pasid: job->pasid); |
| 349 | if (ti) { |
| 350 | coredump->reset_task_info = *ti; |
| 351 | amdgpu_vm_put_task_info(task_info: ti); |
| 352 | } |
| 353 | } |
| 354 | |
| 355 | if (job) { |
| 356 | s_job = &job->base; |
| 357 | coredump->ring = to_amdgpu_ring(s_job->sched); |
| 358 | } |
| 359 | |
| 360 | coredump->adev = adev; |
| 361 | |
| 362 | ktime_get_ts64(ts: &coredump->reset_time); |
| 363 | |
| 364 | dev_coredumpm(dev: dev->dev, THIS_MODULE, data: coredump, datalen: 0, GFP_NOWAIT, |
| 365 | read: amdgpu_devcoredump_read, free: amdgpu_devcoredump_free); |
| 366 | |
| 367 | drm_info(dev, "AMDGPU device coredump file has been created\n" ); |
| 368 | drm_info(dev, "Check your /sys/class/drm/card%d/device/devcoredump/data\n" , |
| 369 | dev->primary->index); |
| 370 | } |
| 371 | #endif |
| 372 | |