/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "umc_v12_0.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"
#include "umc/umc_12_0_0_offset.h"
#include "umc/umc_12_0_0_sh_mask.h"
#include "mp/mp_13_0_6_sh_mask.h"

#define MAX_ECC_NUM_PER_RETIREMENT 32
#define DELAYED_TIME_FOR_GPU_RESET 1000  //ms

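/*
 * Map a logical (node, umc, channel) triple to a register offset: the
 * (umc_inst, ch_inst) pair is flattened into a linear channel index and
 * re-split four channels per UMC instance for register addressing, then
 * the per-node and cross-node strides are added on top.
 */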
static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
					    uint32_t node_inst,
					    uint32_t umc_inst,
					    uint32_t ch_inst)
{
	uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst;
	uint64_t cross_node_offset = (node_inst == 0) ? 0 : UMC_V12_0_CROSS_NODE_OFFSET;

	umc_inst = index / 4;
	ch_inst = index % 4;

	return adev->umc.channel_offs * ch_inst + UMC_V12_0_INST_DIST * umc_inst +
		UMC_V12_0_NODE_DIST * node_inst + cross_node_offset;
}

static int umc_v12_0_reset_error_count_per_channel(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	uint64_t odecc_err_cnt_addr;
	uint64_t umc_reg_offset =
		get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);

	odecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt);

	/* clear error count */
	WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V12_0_CE_CNT_INIT);

	return 0;
}

static void umc_v12_0_reset_error_count(struct amdgpu_device *adev)
{
	amdgpu_umc_loop_channels(adev,
		umc_v12_0_reset_error_count_per_channel, NULL);
}

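/*
 * A deferred (poison) error: the bank is valid and either the Deferred or
 * the Poison bit is set. This only applies when the device supports RAS
 * poison mode; otherwise such a bank falls through to the UE/CE checks.
 */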
bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
{
	dev_dbg(adev->dev,
		"MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, PCC:%llu, UC:%llu, TCC:%llu\n",
		mc_umc_status,
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Poison),
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred),
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
	);

	return (amdgpu_ras_is_poison_mode_supported(adev) &&
		(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
		((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1) ||
		(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Poison) == 1)));
}

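/*
 * Note: both classifiers below reject deferred errors first, so a poisoned
 * bank is never double-counted as an uncorrectable or correctable error.
 */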
bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
{
	if (umc_v12_0_is_deferred_error(adev, mc_umc_status))
		return false;

	return ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
		(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1));
}

bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
{
	if (umc_v12_0_is_deferred_error(adev, mc_umc_status))
		return false;

	return (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
		(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1 ||
		(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 &&
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 0) ||
		/* Identify data parity error in replay mode */
		((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 0x5 ||
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 0xb) &&
		!(umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)))));
}

static void umc_v12_0_query_error_count_per_type(struct amdgpu_device *adev,
						uint64_t umc_reg_offset,
						unsigned long *error_count,
						check_error_type_func error_type_func)
{
	uint64_t mc_umc_status;
	uint64_t mc_umc_status_addr;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	/* Check MCUMC_STATUS */
	mc_umc_status =
		RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);

	if (error_type_func(adev, mc_umc_status))
		*error_count += 1;
}

static int umc_v12_0_query_error_count(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	struct ras_err_data *err_data = (struct ras_err_data *)data;
	unsigned long ue_count = 0, ce_count = 0, de_count = 0;

	/* NOTE: node_inst is converted by adev->umc.active_mask and the range is [0-3],
	 * which can be used as die ID directly */
	struct amdgpu_smuio_mcm_config_info mcm_info = {
		.socket_id = adev->smuio.funcs->get_socket_id(adev),
		.die_id = node_inst,
	};

	uint64_t umc_reg_offset =
		get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);

	umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
					&ce_count, umc_v12_0_is_correctable_error);
	umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
					&ue_count, umc_v12_0_is_uncorrectable_error);
	umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
					&de_count, umc_v12_0_is_deferred_error);

	amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
	amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
	amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, de_count);

	return 0;
}

static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev,
					void *ras_error_status)
{
	amdgpu_umc_loop_channels(adev,
		umc_v12_0_query_error_count, ras_error_status);

	umc_v12_0_reset_error_count(adev);
}

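/*
 * Cache the physical-address bits that are toggled when one bad row is
 * expanded into all of the pages retired with it. The bit positions depend
 * on the HBM generation and the current NPS partition mode; retire_unit is
 * the number of pages (1 << bit_num) covered by a single retirement.
 */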
static void umc_v12_0_get_retire_flip_bits(struct amdgpu_device *adev)
{
	enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
	uint32_t vram_type = adev->gmc.vram_type;
	struct amdgpu_umc_flip_bits *flip_bits = &(adev->umc.flip_bits);

	if (adev->gmc.gmc_funcs->query_mem_partition_mode)
		nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);

	/* default setting */
	flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_C2_BIT;
	flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_C3_BIT;
	flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_C4_BIT;
	flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R13_BIT;
	flip_bits->flip_row_bit = 13;
	flip_bits->bit_num = 4;
	flip_bits->r13_in_pa = UMC_V12_0_PA_R13_BIT;

	if (nps == AMDGPU_NPS2_PARTITION_MODE) {
		flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_CH5_BIT;
		flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_C2_BIT;
		flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_B1_BIT;
		flip_bits->r13_in_pa = UMC_V12_0_PA_R12_BIT;
	} else if (nps == AMDGPU_NPS4_PARTITION_MODE) {
		flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_CH4_BIT;
		flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_CH5_BIT;
		flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_B0_BIT;
		flip_bits->r13_in_pa = UMC_V12_0_PA_R11_BIT;
	}

	switch (vram_type) {
	case AMDGPU_VRAM_TYPE_HBM:
		/* other nps modes are taken as nps1 */
		if (nps == AMDGPU_NPS2_PARTITION_MODE)
			flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R12_BIT;
		else if (nps == AMDGPU_NPS4_PARTITION_MODE)
			flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R11_BIT;

		break;
	case AMDGPU_VRAM_TYPE_HBM3E:
		flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R12_BIT;
		flip_bits->flip_row_bit = 12;

		if (nps == AMDGPU_NPS2_PARTITION_MODE)
			flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R11_BIT;
		else if (nps == AMDGPU_NPS4_PARTITION_MODE)
			flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R10_BIT;

		break;
	default:
		dev_warn(adev->dev,
			"Unknown HBM type, set RAS retire flip bits to the value in NPS1 mode.\n");
		break;
	}

	adev->umc.retire_unit = 0x1 << flip_bits->bit_num;
}

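/*
 * Translate an MCA error address into a SoC physical address through the
 * PSP RAS TA, then walk every combination of the retire flip bits to emit
 * all pages that are retired together. When addr_in is NULL, the caller
 * must supply an already-converted physical address via addr_out and only
 * the flip-bit expansion is performed.
 */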
static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
					struct ras_err_data *err_data,
					struct ta_ras_query_address_input *addr_in,
					struct ta_ras_query_address_output *addr_out,
					bool dump_addr)
{
	uint32_t col, col_lower, row, row_lower, row_high, bank;
	uint32_t channel_index = 0, umc_inst = 0;
	uint32_t i, bit_num, retire_unit, *flip_bits;
	uint64_t soc_pa, column, err_addr;
	struct ta_ras_query_address_output addr_out_tmp;
	struct ta_ras_query_address_output *paddr_out;
	int ret = 0;

	if (!addr_out)
		paddr_out = &addr_out_tmp;
	else
		paddr_out = addr_out;

	err_addr = bank = 0;
	if (addr_in) {
		err_addr = addr_in->ma.err_addr;
		addr_in->addr_type = TA_RAS_MCA_TO_PA;
		ret = psp_ras_query_address(&adev->psp, addr_in, paddr_out);
		if (ret) {
			dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx",
				err_addr);

			goto out;
		}

		bank = paddr_out->pa.bank;
		/* no need to care about umc inst if addr_in is NULL */
		umc_inst = addr_in->ma.umc_inst;
	}

	flip_bits = adev->umc.flip_bits.flip_bits_in_pa;
	bit_num = adev->umc.flip_bits.bit_num;
	retire_unit = adev->umc.retire_unit;

	soc_pa = paddr_out->pa.pa;
	channel_index = paddr_out->pa.channel_idx;
	/* clear loop bits in soc physical address */
	for (i = 0; i < bit_num; i++)
		soc_pa &= ~BIT_ULL(flip_bits[i]);

	paddr_out->pa.pa = soc_pa;
	/* get column bit 0 and 1 in mca address */
	col_lower = (err_addr >> 1) & 0x3ULL;
	/* extra row bit will be handled later */
	row_lower = (err_addr >> UMC_V12_0_MA_R0_BIT) & 0x1fffULL;
	row_lower &= ~BIT_ULL(adev->umc.flip_bits.flip_row_bit);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 5, 0)) {
		row_high = (soc_pa >> adev->umc.flip_bits.r13_in_pa) & 0x3ULL;
		/* it's 2.25GB in each channel, from MCA address to PA
		 * [R14 R13] is converted if the two-bit value is 0x3,
		 * get them from PA instead of MCA address.
		 */
		row_lower |= (row_high << 13);
	}

	if (!err_data && !dump_addr)
		goto out;

	/* loop for all possibilities of retired bits */
	for (column = 0; column < retire_unit; column++) {
		soc_pa = paddr_out->pa.pa;
		for (i = 0; i < bit_num; i++)
			soc_pa |= (((column >> i) & 0x1ULL) << flip_bits[i]);

		col = ((column & 0x7) << 2) | col_lower;
		/* handle extra row bit */
		if (bit_num == RETIRE_FLIP_BITS_NUM)
			row = ((column >> 3) << adev->umc.flip_bits.flip_row_bit) |
			      row_lower;

		if (dump_addr)
			dev_info(adev->dev,
				"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
				soc_pa, row, col, bank, channel_index);

		if (err_data)
			amdgpu_umc_fill_error_record(err_data, err_addr,
				soc_pa, channel_index, umc_inst);
	}

out:
	return ret;
}

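/*
 * Per-channel callback of the error-address loop: if an uncorrectable or
 * deferred error is latched in MCUMC_STATUST0, decode MCUMC_ADDRT0 and
 * convert it into retired pages. The status register is cleared on exit so
 * that the next query starts from a clean state.
 */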
static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	struct ras_err_data *err_data = (struct ras_err_data *)data;
	struct ta_ras_query_address_input addr_in;
	uint64_t mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr;
	uint64_t mc_umc_addrt0;
	uint64_t umc_reg_offset =
		get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	mc_umc_status = RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);

	if (mc_umc_status == 0)
		return 0;

	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);

		return 0;
	}

	/* calculate error address if ue error is detected */
	if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
	    umc_v12_0_is_deferred_error(adev, mc_umc_status)) {
		mc_umc_addrt0 =
			SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

		err_addr = RREG64_PCIE_EXT((mc_umc_addrt0 + umc_reg_offset) * 4);

		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		if (!adev->aid_mask &&
		    adev->smuio.funcs &&
		    adev->smuio.funcs->get_socket_id)
			addr_in.ma.socket_id = adev->smuio.funcs->get_socket_id(adev);
		else
			addr_in.ma.socket_id = 0;

		addr_in.ma.err_addr = err_addr;
		addr_in.ma.ch_inst = ch_inst;
		addr_in.ma.umc_inst = umc_inst;
		addr_in.ma.node_inst = node_inst;

		umc_v12_0_convert_error_address(adev, err_data, &addr_in, NULL, true);
	}

	/* clear umc status */
	WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);

	return 0;
}

static void umc_v12_0_query_ras_error_address(struct amdgpu_device *adev,
					void *ras_error_status)
{
	amdgpu_umc_loop_channels(adev,
		umc_v12_0_query_error_address, ras_error_status);
}

static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	uint32_t odecc_cnt_sel;
	uint64_t odecc_cnt_sel_addr, odecc_err_cnt_addr;
	uint64_t umc_reg_offset =
		get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);

	odecc_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccCntSel);
	odecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt);

	odecc_cnt_sel = RREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4);

	/* set ce error interrupt type to APIC based interrupt */
	odecc_cnt_sel = REG_SET_FIELD(odecc_cnt_sel, UMCCH0_OdEccCntSel,
					OdEccErrInt, 0x1);
	WREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4, odecc_cnt_sel);

	/* set error count to initial value */
	WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V12_0_CE_CNT_INIT);

	return 0;
}

static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
			enum amdgpu_mca_error_type type, void *ras_error_status)
{
	uint64_t mc_umc_status = *(uint64_t *)ras_error_status;

	switch (type) {
	case AMDGPU_MCA_ERROR_TYPE_UE:
		return umc_v12_0_is_uncorrectable_error(adev, mc_umc_status);
	case AMDGPU_MCA_ERROR_TYPE_CE:
		return umc_v12_0_is_correctable_error(adev, mc_umc_status);
	case AMDGPU_MCA_ERROR_TYPE_DE:
		return umc_v12_0_is_deferred_error(adev, mc_umc_status);
	default:
		return false;
	}
}

static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev)
{
	amdgpu_umc_loop_channels(adev,
		umc_v12_0_err_cnt_init_per_channel, NULL);
}

static bool umc_v12_0_query_ras_poison_mode(struct amdgpu_device *adev)
{
	/*
	 * Force return true, because regUMCCH0_EccCtrl
	 * is not accessible from host side
	 */
	return true;
}

const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = {
	.query_ras_error_count = umc_v12_0_query_ras_error_count,
	.query_ras_error_address = umc_v12_0_query_ras_error_address,
};

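/*
 * ACA bank parser: classify the bank by its status register, mirror the
 * ECC status into the UMC bookkeeping, and log the error count. A deferred
 * error with extended error code 0 reports one error per retirement unit
 * of recorded bad addresses; otherwise the MISC0 counter (or 1) is used.
 */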
static int umc_v12_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
				     enum aca_smu_type type, void *data)
{
	struct amdgpu_device *adev = handle->adev;
	struct aca_bank_info info;
	enum aca_error_type err_type;
	u64 status, count;
	u32 ext_error_code;
	int ret;

	status = bank->regs[ACA_REG_IDX_STATUS];
	if (umc_v12_0_is_deferred_error(adev, status))
		err_type = ACA_ERROR_TYPE_DEFERRED;
	else if (umc_v12_0_is_uncorrectable_error(adev, status))
		err_type = ACA_ERROR_TYPE_UE;
	else if (umc_v12_0_is_correctable_error(adev, status))
		err_type = ACA_ERROR_TYPE_CE;
	else
		return 0;
	bank->aca_err_type = err_type;

	ret = aca_bank_info_decode(bank, &info);
	if (ret)
		return ret;

	amdgpu_umc_update_ecc_status(adev,
		bank->regs[ACA_REG_IDX_STATUS],
		bank->regs[ACA_REG_IDX_IPID],
		bank->regs[ACA_REG_IDX_ADDR]);

	ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);
	if (umc_v12_0_is_deferred_error(adev, status))
		count = ext_error_code == 0 ?
			adev->umc.err_addr_cnt / adev->umc.retire_unit : 1ULL;
	else
		count = ext_error_code == 0 ?
			ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]) : 1ULL;

	return aca_error_cache_log_bank_error(handle, &info, err_type, count);
}

static const struct aca_bank_ops umc_v12_0_aca_bank_ops = {
	.aca_bank_parser = umc_v12_0_aca_bank_parser,
};

const struct aca_info umc_v12_0_aca_info = {
	.hwip = ACA_HWIP_TYPE_UMC,
	.mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK | ACA_ERROR_DEFERRED_MASK,
	.bank_ops = &umc_v12_0_aca_bank_ops,
};

static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
	int ret;

	ret = amdgpu_umc_ras_late_init(adev, ras_block);
	if (ret)
		return ret;

	ret = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__UMC,
				  &umc_v12_0_aca_info, NULL);
	if (ret)
		return ret;

	return 0;
}

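/*
 * Record a deferred error reported via the SMU: check that the bank really
 * decodes to a UMC instance, convert the MCA address to a physical address,
 * log it in the deferred-error radix tree and reserve every page of the
 * retirement unit. A duplicate address (-EEXIST) only bumps the queried
 * counter.
 */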
static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
			uint64_t status, uint64_t ipid, uint64_t addr)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	uint16_t hwid, mcatype;
	uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
	uint64_t err_addr, pa_addr = 0;
	struct ras_ecc_err *ecc_err;
	struct ta_ras_query_address_output addr_out;
	uint32_t shift_bit = adev->umc.flip_bits.flip_bits_in_pa[2];
	int count, ret, i;

	hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
	mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);

	/* A bank that does not decode to UMC is a poison consumption
	 * event reported by the SMU for another IP block.
	 */
	if (hwid != MCA_UMC_HWID_V12_0 || mcatype != MCA_UMC_MCATYPE_V12_0) {
		con->umc_ecc_log.consumption_q_count++;
		return 0;
	}

	if (!status)
		return 0;

	if (!umc_v12_0_is_deferred_error(adev, status))
		return 0;

	err_addr = REG_GET_FIELD(addr,
			MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

	dev_dbg(adev->dev,
		"UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, err_addr:0x%llx\n",
		ipid,
		MCA_IPID_2_SOCKET_ID(ipid),
		MCA_IPID_2_DIE_ID(ipid),
		MCA_IPID_2_UMC_INST(ipid),
		MCA_IPID_2_UMC_CH(ipid),
		err_addr);

	ret = amdgpu_umc_mca_to_addr(adev,
			err_addr, MCA_IPID_2_UMC_CH(ipid),
			MCA_IPID_2_UMC_INST(ipid), MCA_IPID_2_DIE_ID(ipid),
			MCA_IPID_2_SOCKET_ID(ipid), &addr_out, true);
	if (ret)
		return ret;

	ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL);
	if (!ecc_err)
		return -ENOMEM;

	pa_addr = addr_out.pa.pa;
	ecc_err->status = status;
	ecc_err->ipid = ipid;
	ecc_err->addr = addr;
	ecc_err->pa_pfn = pa_addr >> AMDGPU_GPU_PAGE_SHIFT;
	ecc_err->channel_idx = addr_out.pa.channel_idx;

	/* If converted pa_pfn is 0, use pa C4 pfn. */
	if (!ecc_err->pa_pfn)
		ecc_err->pa_pfn = BIT_ULL(shift_bit) >> AMDGPU_GPU_PAGE_SHIFT;

	ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
	if (ret) {
		if (ret == -EEXIST)
			con->umc_ecc_log.de_queried_count++;
		else
			dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);

		kfree(ecc_err);
		return ret;
	}

	con->umc_ecc_log.de_queried_count++;

	memset(page_pfn, 0, sizeof(page_pfn));
	count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
			pa_addr,
			page_pfn, ARRAY_SIZE(page_pfn));
	if (count <= 0) {
		dev_warn(adev->dev, "Fail to convert error address! count:%d\n", count);
		return 0;
	}

	/* Reserve memory */
	for (i = 0; i < count; i++)
		amdgpu_ras_reserve_page(adev, page_pfn[i]);

	/* The problem case is as follows:
	 * 1. GPU A triggers a gpu ras reset, and GPU A drives
	 *    GPU B to also perform a gpu ras reset.
	 * 2. After gpu B ras reset started, gpu B queried a DE
	 *    data. Since the DE data was queried in the ras reset
	 *    thread instead of the page retirement thread, bad
	 *    page retirement work would not be triggered. Then
	 *    even if all gpu resets are completed, the bad pages
	 *    will be cached in RAM until GPU B's bad page retirement
	 *    work is triggered again and then saved to eeprom.
	 * Trigger delayed work to save the bad pages to eeprom in time
	 * after gpu ras reset is completed.
	 */
	if (amdgpu_ras_in_recovery(adev))
		schedule_delayed_work(&con->page_retirement_dwork,
				msecs_to_jiffies(DELAYED_TIME_FOR_GPU_RESET));

	return 0;
}

static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
				struct ras_ecc_err *ecc_err, void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
	int ret = 0, i, count;

	if (!err_data || !ecc_err)
		return -EINVAL;

	memset(page_pfn, 0, sizeof(page_pfn));
	count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
			ecc_err->pa_pfn << AMDGPU_GPU_PAGE_SHIFT,
			page_pfn, ARRAY_SIZE(page_pfn));

	for (i = 0; i < count; i++) {
		ret = amdgpu_umc_fill_error_record(err_data,
				ecc_err->addr,
				page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
				ecc_err->channel_idx,
				MCA_IPID_2_UMC_INST(ecc_err->ipid));
		if (ret)
			break;
	}

	err_data->de_count++;

	return ret;
}

static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
					void *ras_error_status)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
	struct radix_tree_root *ecc_tree;
	int new_detected, ret, i;

	ecc_tree = &con->umc_ecc_log.de_page_tree;

	mutex_lock(&con->umc_ecc_log.lock);
	new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
			0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);
	for (i = 0; i < new_detected; i++) {
		if (!entries[i])
			continue;

		ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status);
		if (ret) {
			dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret);
			break;
		}
		radix_tree_tag_clear(ecc_tree,
				entries[i]->pa_pfn, UMC_ECC_NEW_DETECTED_TAG);
	}
	mutex_unlock(&con->umc_ecc_log.lock);
}

static uint32_t umc_v12_0_get_die_id(struct amdgpu_device *adev,
		uint64_t mca_addr, uint64_t retired_page)
{
	uint32_t die = 0;

	/* we only calculate die id for nps1 mode right now */
	die += ((((retired_page >> 12) & 0x1ULL) ^
		((retired_page >> 20) & 0x1ULL) ^
		((retired_page >> 27) & 0x1ULL) ^
		((retired_page >> 34) & 0x1ULL) ^
		((retired_page >> 41) & 0x1ULL)) << 0);

	/* the original PA_C4 and PA_R13 may be cleared in retired_page, so
	 * get them from mca_addr.
	 */
	die += ((((retired_page >> 13) & 0x1ULL) ^
		((mca_addr >> 5) & 0x1ULL) ^
		((retired_page >> 28) & 0x1ULL) ^
		((mca_addr >> 23) & 0x1ULL) ^
		((retired_page >> 42) & 0x1ULL)) << 1);
	die &= 3;

	return die;
}

static void umc_v12_0_mca_ipid_parse(struct amdgpu_device *adev, uint64_t ipid,
		uint32_t *did, uint32_t *ch, uint32_t *umc_inst, uint32_t *sid)
{
	if (did)
		*did = MCA_IPID_2_DIE_ID(ipid);
	if (ch)
		*ch = MCA_IPID_2_UMC_CH(ipid);
	if (umc_inst)
		*umc_inst = MCA_IPID_2_UMC_INST(ipid);
	if (sid)
		*sid = MCA_IPID_2_SOCKET_ID(ipid);
}

struct amdgpu_umc_ras umc_v12_0_ras = {
	.ras_block = {
		.hw_ops = &umc_v12_0_ras_hw_ops,
		.ras_late_init = umc_v12_0_ras_late_init,
	},
	.err_cnt_init = umc_v12_0_err_cnt_init,
	.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
	.ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr,
	.check_ecc_err_status = umc_v12_0_check_ecc_err_status,
	.update_ecc_status = umc_v12_0_update_ecc_status,
	.convert_ras_err_addr = umc_v12_0_convert_error_address,
	.get_die_id_from_pa = umc_v12_0_get_die_id,
	.get_retire_flip_bits = umc_v12_0_get_retire_flip_bits,
	.mca_ipid_parse = umc_v12_0_mca_ipid_parse,
};