1// SPDX-License-Identifier: MIT
2/*
3 * Copyright 2024 Advanced Micro Devices, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 *
23 */
24
25#include <generated/utsrelease.h>
26#include <linux/devcoredump.h>
27#include "amdgpu_dev_coredump.h"
28#include "atom.h"
29
30#ifndef CONFIG_DEV_COREDUMP
/*
 * amdgpu_coredump - stub built when CONFIG_DEV_COREDUMP is not defined
 * @adev: ignored
 * @skip_vram_check: ignored
 * @vram_lost: ignored
 * @job: ignored
 *
 * Same signature as the real implementation further down, but records
 * nothing.
 */
void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
		     bool vram_lost, struct amdgpu_job *job)
{
	/* Intentionally empty: there is no devcoredump backend to feed. */
}
35#else
36
37const char *hw_ip_names[MAX_HWIP] = {
38 [GC_HWIP] = "GC",
39 [HDP_HWIP] = "HDP",
40 [SDMA0_HWIP] = "SDMA0",
41 [SDMA1_HWIP] = "SDMA1",
42 [SDMA2_HWIP] = "SDMA2",
43 [SDMA3_HWIP] = "SDMA3",
44 [SDMA4_HWIP] = "SDMA4",
45 [SDMA5_HWIP] = "SDMA5",
46 [SDMA6_HWIP] = "SDMA6",
47 [SDMA7_HWIP] = "SDMA7",
48 [LSDMA_HWIP] = "LSDMA",
49 [MMHUB_HWIP] = "MMHUB",
50 [ATHUB_HWIP] = "ATHUB",
51 [NBIO_HWIP] = "NBIO",
52 [MP0_HWIP] = "MP0",
53 [MP1_HWIP] = "MP1",
54 [UVD_HWIP] = "UVD/JPEG/VCN",
55 [VCN1_HWIP] = "VCN1",
56 [VCE_HWIP] = "VCE",
57 [VPE_HWIP] = "VPE",
58 [DF_HWIP] = "DF",
59 [DCE_HWIP] = "DCE",
60 [OSSSYS_HWIP] = "OSSSYS",
61 [SMUIO_HWIP] = "SMUIO",
62 [PWR_HWIP] = "PWR",
63 [NBIF_HWIP] = "NBIF",
64 [THM_HWIP] = "THM",
65 [CLK_HWIP] = "CLK",
66 [UMC_HWIP] = "UMC",
67 [RSMU_HWIP] = "RSMU",
68 [XGMI_HWIP] = "XGMI",
69 [DCI_HWIP] = "DCI",
70 [PCIE_HWIP] = "PCIE",
71};
72
73static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev,
74 struct drm_printer *p)
75{
76 uint32_t version;
77 uint32_t feature;
78 uint8_t smu_program, smu_major, smu_minor, smu_debug;
79 struct atom_context *ctx = adev->mode_info.atom_context;
80
81 drm_printf(p, f: "VCE feature version: %u, fw version: 0x%08x\n",
82 adev->vce.fb_version, adev->vce.fw_version);
83 drm_printf(p, f: "UVD feature version: %u, fw version: 0x%08x\n", 0,
84 adev->uvd.fw_version);
85 drm_printf(p, f: "GMC feature version: %u, fw version: 0x%08x\n", 0,
86 adev->gmc.fw_version);
87 drm_printf(p, f: "ME feature version: %u, fw version: 0x%08x\n",
88 adev->gfx.me_feature_version, adev->gfx.me_fw_version);
89 drm_printf(p, f: "PFP feature version: %u, fw version: 0x%08x\n",
90 adev->gfx.pfp_feature_version, adev->gfx.pfp_fw_version);
91 drm_printf(p, f: "CE feature version: %u, fw version: 0x%08x\n",
92 adev->gfx.ce_feature_version, adev->gfx.ce_fw_version);
93 drm_printf(p, f: "RLC feature version: %u, fw version: 0x%08x\n",
94 adev->gfx.rlc_feature_version, adev->gfx.rlc_fw_version);
95
96 drm_printf(p, f: "RLC SRLC feature version: %u, fw version: 0x%08x\n",
97 adev->gfx.rlc_srlc_feature_version,
98 adev->gfx.rlc_srlc_fw_version);
99 drm_printf(p, f: "RLC SRLG feature version: %u, fw version: 0x%08x\n",
100 adev->gfx.rlc_srlg_feature_version,
101 adev->gfx.rlc_srlg_fw_version);
102 drm_printf(p, f: "RLC SRLS feature version: %u, fw version: 0x%08x\n",
103 adev->gfx.rlc_srls_feature_version,
104 adev->gfx.rlc_srls_fw_version);
105 drm_printf(p, f: "RLCP feature version: %u, fw version: 0x%08x\n",
106 adev->gfx.rlcp_ucode_feature_version,
107 adev->gfx.rlcp_ucode_version);
108 drm_printf(p, f: "RLCV feature version: %u, fw version: 0x%08x\n",
109 adev->gfx.rlcv_ucode_feature_version,
110 adev->gfx.rlcv_ucode_version);
111 drm_printf(p, f: "MEC feature version: %u, fw version: 0x%08x\n",
112 adev->gfx.mec_feature_version, adev->gfx.mec_fw_version);
113
114 if (adev->gfx.mec2_fw)
115 drm_printf(p, f: "MEC2 feature version: %u, fw version: 0x%08x\n",
116 adev->gfx.mec2_feature_version,
117 adev->gfx.mec2_fw_version);
118
119 drm_printf(p, f: "IMU feature version: %u, fw version: 0x%08x\n", 0,
120 adev->gfx.imu_fw_version);
121 drm_printf(p, f: "PSP SOS feature version: %u, fw version: 0x%08x\n",
122 adev->psp.sos.feature_version, adev->psp.sos.fw_version);
123 drm_printf(p, f: "PSP ASD feature version: %u, fw version: 0x%08x\n",
124 adev->psp.asd_context.bin_desc.feature_version,
125 adev->psp.asd_context.bin_desc.fw_version);
126
127 drm_printf(p, f: "TA XGMI feature version: 0x%08x, fw version: 0x%08x\n",
128 adev->psp.xgmi_context.context.bin_desc.feature_version,
129 adev->psp.xgmi_context.context.bin_desc.fw_version);
130 drm_printf(p, f: "TA RAS feature version: 0x%08x, fw version: 0x%08x\n",
131 adev->psp.ras_context.context.bin_desc.feature_version,
132 adev->psp.ras_context.context.bin_desc.fw_version);
133 drm_printf(p, f: "TA HDCP feature version: 0x%08x, fw version: 0x%08x\n",
134 adev->psp.hdcp_context.context.bin_desc.feature_version,
135 adev->psp.hdcp_context.context.bin_desc.fw_version);
136 drm_printf(p, f: "TA DTM feature version: 0x%08x, fw version: 0x%08x\n",
137 adev->psp.dtm_context.context.bin_desc.feature_version,
138 adev->psp.dtm_context.context.bin_desc.fw_version);
139 drm_printf(p, f: "TA RAP feature version: 0x%08x, fw version: 0x%08x\n",
140 adev->psp.rap_context.context.bin_desc.feature_version,
141 adev->psp.rap_context.context.bin_desc.fw_version);
142 drm_printf(p,
143 f: "TA SECURE DISPLAY feature version: 0x%08x, fw version: 0x%08x\n",
144 adev->psp.securedisplay_context.context.bin_desc.feature_version,
145 adev->psp.securedisplay_context.context.bin_desc.fw_version);
146
147 /* SMC firmware */
148 version = adev->pm.fw_version;
149
150 smu_program = (version >> 24) & 0xff;
151 smu_major = (version >> 16) & 0xff;
152 smu_minor = (version >> 8) & 0xff;
153 smu_debug = (version >> 0) & 0xff;
154 drm_printf(p,
155 f: "SMC feature version: %u, program: %d, fw version: 0x%08x (%d.%d.%d)\n",
156 0, smu_program, version, smu_major, smu_minor, smu_debug);
157
158 /* SDMA firmware */
159 for (int i = 0; i < adev->sdma.num_instances; i++) {
160 drm_printf(p,
161 f: "SDMA%d feature version: %u, firmware version: 0x%08x\n",
162 i, adev->sdma.instance[i].feature_version,
163 adev->sdma.instance[i].fw_version);
164 }
165
166 drm_printf(p, f: "VCN feature version: %u, fw version: 0x%08x\n", 0,
167 adev->vcn.fw_version);
168 drm_printf(p, f: "DMCU feature version: %u, fw version: 0x%08x\n", 0,
169 adev->dm.dmcu_fw_version);
170 drm_printf(p, f: "DMCUB feature version: %u, fw version: 0x%08x\n", 0,
171 adev->dm.dmcub_fw_version);
172 drm_printf(p, f: "PSP TOC feature version: %u, fw version: 0x%08x\n",
173 adev->psp.toc.feature_version, adev->psp.toc.fw_version);
174
175 version = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK;
176 feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
177 AMDGPU_MES_FEAT_VERSION_SHIFT;
178 drm_printf(p, f: "MES_KIQ feature version: %u, fw version: 0x%08x\n",
179 feature, version);
180
181 version = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
182 feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
183 AMDGPU_MES_FEAT_VERSION_SHIFT;
184 drm_printf(p, f: "MES feature version: %u, fw version: 0x%08x\n", feature,
185 version);
186
187 drm_printf(p, f: "VPE feature version: %u, fw version: 0x%08x\n",
188 adev->vpe.feature_version, adev->vpe.fw_version);
189
190 drm_printf(p, f: "\nVBIOS Information\n");
191 drm_printf(p, f: "vbios name : %s\n", ctx->name);
192 drm_printf(p, f: "vbios pn : %s\n", ctx->vbios_pn);
193 drm_printf(p, f: "vbios version : %d\n", ctx->version);
194 drm_printf(p, f: "vbios ver_str : %s\n", ctx->vbios_ver_str);
195 drm_printf(p, f: "vbios date : %s\n", ctx->date);
196}
197
198static ssize_t
199amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
200 void *data, size_t datalen)
201{
202 struct drm_printer p;
203 struct amdgpu_coredump_info *coredump = data;
204 struct drm_print_iterator iter;
205 struct amdgpu_vm_fault_info *fault_info;
206 struct amdgpu_ip_block *ip_block;
207 int ver;
208
209 iter.data = buffer;
210 iter.offset = 0;
211 iter.start = offset;
212 iter.remain = count;
213
214 p = drm_coredump_printer(iter: &iter);
215
216 drm_printf(p: &p, f: "**** AMDGPU Device Coredump ****\n");
217 drm_printf(p: &p, f: "version: " AMDGPU_COREDUMP_VERSION "\n");
218 drm_printf(p: &p, f: "kernel: " UTS_RELEASE "\n");
219 drm_printf(p: &p, f: "module: " KBUILD_MODNAME "\n");
220 drm_printf(p: &p, f: "time: %ptSp\n", &coredump->reset_time);
221
222 if (coredump->reset_task_info.task.pid)
223 drm_printf(p: &p, f: "process_name: %s PID: %d\n",
224 coredump->reset_task_info.process_name,
225 coredump->reset_task_info.task.pid);
226
227 /* SOC Information */
228 drm_printf(p: &p, f: "\nSOC Information\n");
229 drm_printf(p: &p, f: "SOC Device id: %d\n", coredump->adev->pdev->device);
230 drm_printf(p: &p, f: "SOC PCI Revision id: %d\n", coredump->adev->pdev->revision);
231 drm_printf(p: &p, f: "SOC Family: %d\n", coredump->adev->family);
232 drm_printf(p: &p, f: "SOC Revision id: %d\n", coredump->adev->rev_id);
233 drm_printf(p: &p, f: "SOC External Revision id: %d\n", coredump->adev->external_rev_id);
234
235 /* Memory Information */
236 drm_printf(p: &p, f: "\nSOC Memory Information\n");
237 drm_printf(p: &p, f: "real vram size: %llu\n", coredump->adev->gmc.real_vram_size);
238 drm_printf(p: &p, f: "visible vram size: %llu\n", coredump->adev->gmc.visible_vram_size);
239 drm_printf(p: &p, f: "gtt size: %llu\n", coredump->adev->mman.gtt_mgr.manager.size);
240
241 /* GDS Config */
242 drm_printf(p: &p, f: "\nGDS Config\n");
243 drm_printf(p: &p, f: "gds: total size: %d\n", coredump->adev->gds.gds_size);
244 drm_printf(p: &p, f: "gds: compute partition size: %d\n", coredump->adev->gds.gds_size);
245 drm_printf(p: &p, f: "gds: gws per compute partition: %d\n", coredump->adev->gds.gws_size);
246 drm_printf(p: &p, f: "gds: os per compute partition: %d\n", coredump->adev->gds.oa_size);
247
248 /* HWIP Version Information */
249 drm_printf(p: &p, f: "\nHW IP Version Information\n");
250 for (int i = 1; i < MAX_HWIP; i++) {
251 for (int j = 0; j < HWIP_MAX_INSTANCE; j++) {
252 ver = coredump->adev->ip_versions[i][j];
253 if (ver)
254 drm_printf(p: &p, f: "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n",
255 hw_ip_names[i], i, j,
256 IP_VERSION_MAJ(ver),
257 IP_VERSION_MIN(ver),
258 IP_VERSION_REV(ver),
259 IP_VERSION_VARIANT(ver),
260 IP_VERSION_SUBREV(ver));
261 }
262 }
263
264 /* IP firmware information */
265 drm_printf(p: &p, f: "\nIP Firmwares\n");
266 amdgpu_devcoredump_fw_info(adev: coredump->adev, p: &p);
267
268 if (coredump->ring) {
269 drm_printf(p: &p, f: "\nRing timed out details\n");
270 drm_printf(p: &p, f: "IP Type: %d Ring Name: %s\n",
271 coredump->ring->funcs->type,
272 coredump->ring->name);
273 }
274
275 /* Add page fault information */
276 fault_info = &coredump->adev->vm_manager.fault_info;
277 drm_printf(p: &p, f: "\n[%s] Page fault observed\n",
278 fault_info->vmhub ? "mmhub" : "gfxhub");
279 drm_printf(p: &p, f: "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
280 drm_printf(p: &p, f: "Protection fault status register: 0x%x\n\n", fault_info->status);
281
282 /* dump the ip state for each ip */
283 drm_printf(p: &p, f: "IP Dump\n");
284 for (int i = 0; i < coredump->adev->num_ip_blocks; i++) {
285 ip_block = &coredump->adev->ip_blocks[i];
286 if (ip_block->version->funcs->print_ip_state) {
287 drm_printf(p: &p, f: "IP: %s\n", ip_block->version->funcs->name);
288 ip_block->version->funcs->print_ip_state(ip_block, &p);
289 drm_printf(p: &p, f: "\n");
290 }
291 }
292
293 /* Add ring buffer information */
294 drm_printf(p: &p, f: "Ring buffer information\n");
295 for (int i = 0; i < coredump->adev->num_rings; i++) {
296 int j = 0;
297 struct amdgpu_ring *ring = coredump->adev->rings[i];
298
299 drm_printf(p: &p, f: "ring name: %s\n", ring->name);
300 drm_printf(p: &p, f: "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n",
301 amdgpu_ring_get_rptr(ring),
302 amdgpu_ring_get_wptr(ring),
303 ring->buf_mask);
304 drm_printf(p: &p, f: "Ring size in dwords: %d\n",
305 ring->ring_size / 4);
306 drm_printf(p: &p, f: "Ring contents\n");
307 drm_printf(p: &p, f: "Offset \t Value\n");
308
309 while (j < ring->ring_size) {
310 drm_printf(p: &p, f: "0x%x \t 0x%x\n", j, ring->ring[j / 4]);
311 j += 4;
312 }
313 }
314
315 if (coredump->skip_vram_check)
316 drm_printf(p: &p, f: "VRAM lost check is skipped!\n");
317 else if (coredump->reset_vram_lost)
318 drm_printf(p: &p, f: "VRAM is lost due to GPU reset!\n");
319
320 return count - iter.remain;
321}
322
/*
 * amdgpu_devcoredump_free - free callback registered with dev_coredumpm()
 * @data: the coredump descriptor that amdgpu_coredump() allocated
 */
static void amdgpu_devcoredump_free(void *data)
{
	struct amdgpu_coredump_info *coredump = data;

	kfree(coredump);
}
327
328void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
329 bool vram_lost, struct amdgpu_job *job)
330{
331 struct drm_device *dev = adev_to_drm(adev);
332 struct amdgpu_coredump_info *coredump;
333 struct drm_sched_job *s_job;
334
335 coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
336
337 if (!coredump) {
338 DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
339 return;
340 }
341
342 coredump->skip_vram_check = skip_vram_check;
343 coredump->reset_vram_lost = vram_lost;
344
345 if (job && job->pasid) {
346 struct amdgpu_task_info *ti;
347
348 ti = amdgpu_vm_get_task_info_pasid(adev, pasid: job->pasid);
349 if (ti) {
350 coredump->reset_task_info = *ti;
351 amdgpu_vm_put_task_info(task_info: ti);
352 }
353 }
354
355 if (job) {
356 s_job = &job->base;
357 coredump->ring = to_amdgpu_ring(s_job->sched);
358 }
359
360 coredump->adev = adev;
361
362 ktime_get_ts64(ts: &coredump->reset_time);
363
364 dev_coredumpm(dev: dev->dev, THIS_MODULE, data: coredump, datalen: 0, GFP_NOWAIT,
365 read: amdgpu_devcoredump_read, free: amdgpu_devcoredump_free);
366
367 drm_info(dev, "AMDGPU device coredump file has been created\n");
368 drm_info(dev, "Check your /sys/class/drm/card%d/device/devcoredump/data\n",
369 dev->primary->index);
370}
371#endif
372

/* Source: linux/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c */