amdgpu_ras_eeprom.c source code [linux/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c]

1	/*
2	* Copyright 2019 Advanced Micro Devices, Inc.
3	*
4	* Permission is hereby granted, free of charge, to any person obtaining a
5	* copy of this software and associated documentation files (the "Software"),
6	* to deal in the Software without restriction, including without limitation
7	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8	* and/or sell copies of the Software, and to permit persons to whom the
9	* Software is furnished to do so, subject to the following conditions:
10	*
11	* The above copyright notice and this permission notice shall be included in
12	* all copies or substantial portions of the Software.
13	*
14	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17	* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18	* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19	* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20	* OTHER DEALINGS IN THE SOFTWARE.
21	*
22	*/
23
24	#include "amdgpu_ras_eeprom.h"
25	#include "amdgpu.h"
26	#include "amdgpu_ras.h"
27	#include <linux/bits.h>
28	#include "atom.h"
29	#include "amdgpu_eeprom.h"
30	#include "amdgpu_atomfirmware.h"
31	#include <linux/debugfs.h>
32	#include <linux/uaccess.h>
33
34	#include "amdgpu_reset.h"
35	#include "amdgpu_ras_mgr.h"
36
/*
 * These are memory addresses as would be seen by one or more EEPROM
 * chips strung on the I2C bus, usually by manipulating pins 1-3 of a
 * set of EEPROM devices. They form a continuous memory space.
 *
 * The I2C device address includes the device type identifier, 1010b,
 * which is a reserved value and indicates that this is an I2C EEPROM
 * device. It also includes the top 3 bits of the 19 bit EEPROM memory
 * address, namely bits 18, 17, and 16. This makes up the 7 bit
 * address sent on the I2C bus with bit 0 being the direction bit,
 * which is not represented here, and sent by the hardware directly.
 *
 * For instance,
 *   50h = 1010000b => device type identifier 1010b, bits 18:16 = 000b, address 0.
 *   54h = 1010100b => --"--, bits 18:16 = 100b, address 40000h.
 *   56h = 1010110b => --"--, bits 18:16 = 110b, address 60000h.
 * Depending on the size of the I2C EEPROM device(s), bits 18:16 may
 * address memory in a device or a device on the I2C bus, depending on
 * the status of pins 1-3. See top of amdgpu_eeprom.c.
 *
 * The RAS table lives either at address 0 or address 40000h of EEPROM.
 */
#define EEPROM_I2C_MADDR_0      0x0
#define EEPROM_I2C_MADDR_4      0x40000

/*
 * The 2 macros below represent the actual size in bytes that
 * those entities occupy in the EEPROM memory.
 * RAS_TABLE_RECORD_SIZE is different than sizeof(eeprom_table_record) which
 * uses uint64 to store 6-byte fields such as retired_page.
 */
#define RAS_TABLE_HEADER_SIZE   20
#define RAS_TABLE_RECORD_SIZE   24

/* Table hdr is 'AMDR' */
#define RAS_TABLE_HDR_VAL       0x414d4452

/* Bad GPU tag 'BADG' */
#define RAS_TABLE_HDR_BAD       0x42414447

/*
 * EEPROM Table structure v1
 * ---------------------------------
 * |                               |
 * |     EEPROM TABLE HEADER       |
 * |      ( size 20 Bytes )        |
 * |                               |
 * ---------------------------------
 * |                               |
 * |    BAD PAGE RECORD AREA       |
 * |                               |
 * ---------------------------------
 */

/* Assume 2-Mbit size EEPROM and take up the whole space. */
#define RAS_TBL_SIZE_BYTES      (256 * 1024)
#define RAS_TABLE_START         0
#define RAS_HDR_START           RAS_TABLE_START
#define RAS_RECORD_START        (RAS_HDR_START + RAS_TABLE_HEADER_SIZE)
#define RAS_MAX_RECORD_COUNT    ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
				 / RAS_TABLE_RECORD_SIZE)

/*
 * EEPROM Table structure v2.1
 * ---------------------------------
 * |                               |
 * |     EEPROM TABLE HEADER       |
 * |      ( size 20 Bytes )        |
 * |                               |
 * ---------------------------------
 * |                               |
 * |     EEPROM TABLE RAS INFO     |
 * | (available info size 4 Bytes) |
 * |  ( reserved size 252 Bytes )  |
 * |                               |
 * ---------------------------------
 * |                               |
 * |     BAD PAGE RECORD AREA      |
 * |                               |
 * ---------------------------------
 */

/* EEPROM Table V2_1 */
#define RAS_TABLE_V2_1_INFO_SIZE       256
#define RAS_TABLE_V2_1_INFO_START      RAS_TABLE_HEADER_SIZE
#define RAS_RECORD_START_V2_1          (RAS_HDR_START + RAS_TABLE_HEADER_SIZE + \
					RAS_TABLE_V2_1_INFO_SIZE)
#define RAS_MAX_RECORD_COUNT_V2_1      ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE - \
					RAS_TABLE_V2_1_INFO_SIZE) \
					/ RAS_TABLE_RECORD_SIZE)

#define RAS_SMU_MESSAGE_TIMEOUT_MS 1000 /* 1s */

/*
 * Given a zero-based index of an EEPROM RAS record, yields the EEPROM
 * offset off of RAS_TABLE_START. That is, this is something you can
 * add to control->i2c_address, and then tell I2C layer to read
 * from/write to there. _N is the so called absolute index,
 * because it starts right after the table header.
 */
#define RAS_INDEX_TO_OFFSET(_C, _N) ((_C)->ras_record_offset + \
				     (_N) * RAS_TABLE_RECORD_SIZE)

/* Inverse of RAS_INDEX_TO_OFFSET(): EEPROM offset -> absolute record index. */
#define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \
				      (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE)

/*
 * Given a 0-based relative record index, 0, 1, 2, ..., etc., off
 * of "fri", return the absolute record index off of the end of
 * the table header.
 */
#define RAS_RI_TO_AI(_C, _I) (((_I) + (_C)->ras_fri) % \
			      (_C)->ras_max_record_count)

/* Number of records implied by the tbl_size field of a v1 table header. */
#define RAS_NUM_RECS(_tbl_hdr) (((_tbl_hdr)->tbl_size - \
				 RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE)

/* Same as RAS_NUM_RECS(), but also discounts the v2.1 RAS info area. */
#define RAS_NUM_RECS_V2_1(_tbl_hdr) (((_tbl_hdr)->tbl_size - \
				       RAS_TABLE_HEADER_SIZE - \
				       RAS_TABLE_V2_1_INFO_SIZE) / RAS_TABLE_RECORD_SIZE)

/* Map an embedded eeprom_control back to the amdgpu_device that owns it. */
#define to_amdgpu_device(x) ((container_of(x, struct amdgpu_ras, eeprom_control))->adev)
156
157	static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
158	{
159	switch (amdgpu_ip_version(adev, ip: MP1_HWIP, inst: `0`)) {
160	case IP_VERSION(`11`, `0`, `2`): / VEGA20 and ARCTURUS /
161	case IP_VERSION(`11`, `0`, `7`): / Sienna cichlid /
162	case IP_VERSION(`13`, `0`, `0`):
163	case IP_VERSION(`13`, `0`, `2`): / Aldebaran /
164	case IP_VERSION(`13`, `0`, `10`):
165	return true;
166	case IP_VERSION(`13`, `0`, `6`):
167	case IP_VERSION(`13`, `0`, `12`):
168	case IP_VERSION(`13`, `0`, `14`):
169	return (adev->gmc.is_app_apu) ? false : true;
170	default:
171	return false;
172	}
173	}
174
175	static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
176	struct amdgpu_ras_eeprom_control *control)
177	{
178	struct atom_context *atom_ctx = adev->mode_info.atom_context;
179	u8 i2c_addr;
180
181	if (!control)
182	return false;
183
184	if (adev->bios && amdgpu_atomfirmware_ras_rom_addr(adev, i2c_address: &i2c_addr)) {
185	/ The address given by VBIOS is an 8-bit, wire-format*
186	* address, i.e. the most significant byte.
187	*
188	* Normalize it to a 19-bit EEPROM address. Remove the
189	* device type identifier and make it a 7-bit address;
190	* then make it a 19-bit EEPROM address. See top of
191	* amdgpu_eeprom.c.
192	*/
193	i2c_addr = (i2c_addr & `0x0F`) >> `1`;
194	control->i2c_address = ((u32) i2c_addr) << `16`;
195
196	return true;
197	}
198
199	switch (amdgpu_ip_version(adev, ip: MP1_HWIP, inst: `0`)) {
200	case IP_VERSION(`11`, `0`, `2`):
201	/ VEGA20 and ARCTURUS /
202	if (adev->asic_type == CHIP_VEGA20)
203	control->i2c_address = EEPROM_I2C_MADDR_0;
204	else if (strnstr(atom_ctx->vbios_pn,
205	"D342",
206	sizeof(atom_ctx->vbios_pn)))
207	control->i2c_address = EEPROM_I2C_MADDR_0;
208	else
209	control->i2c_address = EEPROM_I2C_MADDR_4;
210	return true;
211	case IP_VERSION(`11`, `0`, `7`):
212	control->i2c_address = EEPROM_I2C_MADDR_0;
213	return true;
214	case IP_VERSION(`13`, `0`, `2`):
215	if (strnstr(atom_ctx->vbios_pn, "D673",
216	sizeof(atom_ctx->vbios_pn)))
217	control->i2c_address = EEPROM_I2C_MADDR_4;
218	else
219	control->i2c_address = EEPROM_I2C_MADDR_0;
220	return true;
221	case IP_VERSION(`13`, `0`, `0`):
222	if (strnstr(atom_ctx->vbios_pn, "D707",
223	sizeof(atom_ctx->vbios_pn)))
224	control->i2c_address = EEPROM_I2C_MADDR_0;
225	else
226	control->i2c_address = EEPROM_I2C_MADDR_4;
227	return true;
228	case IP_VERSION(`13`, `0`, `6`):
229	case IP_VERSION(`13`, `0`, `10`):
230	case IP_VERSION(`13`, `0`, `12`):
231	case IP_VERSION(`13`, `0`, `14`):
232	control->i2c_address = EEPROM_I2C_MADDR_4;
233	return true;
234	default:
235	return false;
236	}
237	}
238
239	static void
240	__encode_table_header_to_buf(struct amdgpu_ras_eeprom_table_header *hdr,
241	unsigned char *buf)
242	{
243	u32 pp = (uint32_t )buf;
244
245	pp[`0`] = cpu_to_le32(hdr->header);
246	pp[`1`] = cpu_to_le32(hdr->version);
247	pp[`2`] = cpu_to_le32(hdr->first_rec_offset);
248	pp[`3`] = cpu_to_le32(hdr->tbl_size);
249	pp[`4`] = cpu_to_le32(hdr->checksum);
250	}
251
252	static void
253	__decode_table_header_from_buf(struct amdgpu_ras_eeprom_table_header *hdr,
254	unsigned char *buf)
255	{
256	u32 pp = (uint32_t )buf;
257
258	hdr->header = le32_to_cpu(pp[`0`]);
259	hdr->version = le32_to_cpu(pp[`1`]);
260	hdr->first_rec_offset = le32_to_cpu(pp[`2`]);
261	hdr->tbl_size = le32_to_cpu(pp[`3`]);
262	hdr->checksum = le32_to_cpu(pp[`4`]);
263	}
264
265	static int __write_table_header(struct amdgpu_ras_eeprom_control *control)
266	{
267	u8 buf[RAS_TABLE_HEADER_SIZE];
268	struct amdgpu_device *adev = to_amdgpu_device(control);
269	int res;
270
271	memset(buf, `0`, sizeof(buf));
272	__encode_table_header_to_buf(hdr: &control->tbl_hdr, buf);
273
274	/ i2c may be unstable in gpu reset /
275	down_read(sem: &adev->reset_domain->sem);
276	res = amdgpu_eeprom_write(i2c_adap: adev->pm.ras_eeprom_i2c_bus,
277	eeprom_addr: control->i2c_address +
278	control->ras_header_offset,
279	eeprom_buf: buf, RAS_TABLE_HEADER_SIZE);
280	up_read(sem: &adev->reset_domain->sem);
281
282	if (res < `0`) {
283	dev_err(adev->dev, "Failed to write EEPROM table header:%d",
284	res);
285	} else if (res < RAS_TABLE_HEADER_SIZE) {
286	dev_err(adev->dev, "Short write:%d out of %d\n", res,
287	RAS_TABLE_HEADER_SIZE);
288	res = -EIO;
289	} else {
290	res = `0`;
291	}
292
293	return res;
294	}
295
296	static void
297	__encode_table_ras_info_to_buf(struct amdgpu_ras_eeprom_table_ras_info *rai,
298	unsigned char *buf)
299	{
300	u32 pp = (uint32_t )buf;
301	u32 tmp;
302
303	tmp = ((uint32_t)(rai->rma_status) & `0xFF`) \|
304	(((uint32_t)(rai->health_percent) << `8`) & `0xFF00`) \|
305	(((uint32_t)(rai->ecc_page_threshold) << `16`) & `0xFFFF0000`);
306	pp[`0`] = cpu_to_le32(tmp);
307	}
308
309	static void
310	__decode_table_ras_info_from_buf(struct amdgpu_ras_eeprom_table_ras_info *rai,
311	unsigned char *buf)
312	{
313	u32 pp = (uint32_t )buf;
314	u32 tmp;
315
316	tmp = le32_to_cpu(pp[`0`]);
317	rai->rma_status = tmp & `0xFF`;
318	rai->health_percent = (tmp >> `8`) & `0xFF`;
319	rai->ecc_page_threshold = (tmp >> `16`) & `0xFFFF`;
320	}
321
322	static int __write_table_ras_info(struct amdgpu_ras_eeprom_control *control)
323	{
324	struct amdgpu_device *adev = to_amdgpu_device(control);
325	u8 *buf;
326	int res;
327
328	buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL);
329	if (!buf) {
330	dev_err(adev->dev,
331	"Failed to alloc buf to write table ras info\n");
332	return -ENOMEM;
333	}
334
335	__encode_table_ras_info_to_buf(rai: &control->tbl_rai, buf);
336
337	/ i2c may be unstable in gpu reset /
338	down_read(sem: &adev->reset_domain->sem);
339	res = amdgpu_eeprom_write(i2c_adap: adev->pm.ras_eeprom_i2c_bus,
340	eeprom_addr: control->i2c_address +
341	control->ras_info_offset,
342	eeprom_buf: buf, RAS_TABLE_V2_1_INFO_SIZE);
343	up_read(sem: &adev->reset_domain->sem);
344
345	if (res < `0`) {
346	dev_err(adev->dev, "Failed to write EEPROM table ras info:%d",
347	res);
348	} else if (res < RAS_TABLE_V2_1_INFO_SIZE) {
349	dev_err(adev->dev, "Short write:%d out of %d\n", res,
350	RAS_TABLE_V2_1_INFO_SIZE);
351	res = -EIO;
352	} else {
353	res = `0`;
354	}
355
356	kfree(objp: buf);
357
358	return res;
359	}
360
361	static u8 __calc_hdr_byte_sum(const struct amdgpu_ras_eeprom_control *control)
362	{
363	int ii;
364	u8 *pp, csum;
365	size_t sz;
366
367	/ Header checksum, skip checksum field in the calculation /
368	sz = sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum);
369	pp = (u8 *) &control->tbl_hdr;
370	csum = `0`;
371	for (ii = `0`; ii < sz; ii++, pp++)
372	csum += *pp;
373
374	return csum;
375	}
376
377	static u8 __calc_ras_info_byte_sum(const struct amdgpu_ras_eeprom_control *control)
378	{
379	int ii;
380	u8 *pp, csum;
381	size_t sz;
382
383	sz = sizeof(control->tbl_rai);
384	pp = (u8 *) &control->tbl_rai;
385	csum = `0`;
386	for (ii = `0`; ii < sz; ii++, pp++)
387	csum += *pp;
388
389	return csum;
390	}
391
392	static int amdgpu_ras_eeprom_correct_header_tag(
393	struct amdgpu_ras_eeprom_control *control,
394	uint32_t header)
395	{
396	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
397	u8 *hh;
398	int res;
399	u8 csum;
400
401	csum = -hdr->checksum;
402
403	hh = (void *) &hdr->header;
404	csum -= (hh[`0`] + hh[`1`] + hh[`2`] + hh[`3`]);
405	hh = (void *) &header;
406	csum += hh[`0`] + hh[`1`] + hh[`2`] + hh[`3`];
407	csum = -csum;
408	mutex_lock(&control->ras_tbl_mutex);
409	hdr->header = header;
410	hdr->checksum = csum;
411	res = __write_table_header(control);
412	mutex_unlock(lock: &control->ras_tbl_mutex);
413
414	return res;
415	}
416
417	static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control *control)
418	{
419	struct amdgpu_device *adev = to_amdgpu_device(control);
420	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
421
422	switch (amdgpu_ip_version(adev, ip: UMC_HWIP, inst: `0`)) {
423	case IP_VERSION(`8`, `10`, `0`):
424	hdr->version = RAS_TABLE_VER_V2_1;
425	return;
426	case IP_VERSION(`12`, `0`, `0`):
427	case IP_VERSION(`12`, `5`, `0`):
428	hdr->version = RAS_TABLE_VER_V3;
429	return;
430	default:
431	hdr->version = RAS_TABLE_VER_V1;
432	return;
433	}
434	}
435
436	/**
437	* amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table
438	* @control: pointer to control structure
439	*
440	* Reset the contents of the header of the RAS EEPROM table.
441	* Return 0 on success, -errno on error.
442	*/
443	int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
444	{
445	struct amdgpu_device *adev = to_amdgpu_device(control);
446	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
447	struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
448	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
449	u32 erase_res = `0`;
450	u8 csum;
451	int res;
452
453	mutex_lock(&control->ras_tbl_mutex);
454
455	if (!amdgpu_ras_smu_eeprom_supported(adev)) {
456	hdr->header = RAS_TABLE_HDR_VAL;
457	amdgpu_ras_set_eeprom_table_version(control);
458
459	if (hdr->version >= RAS_TABLE_VER_V2_1) {
460	hdr->first_rec_offset = RAS_RECORD_START_V2_1;
461	hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
462	RAS_TABLE_V2_1_INFO_SIZE;
463	rai->rma_status = GPU_HEALTH_USABLE;
464
465	control->ras_record_offset = RAS_RECORD_START_V2_1;
466	control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
467	/**
468	* GPU health represented as a percentage.
469	* 0 means worst health, 100 means fully health.
470	*/
471	rai->health_percent = `100`;
472	/ ecc_page_threshold = 0 means disable bad page retirement /
473	rai->ecc_page_threshold = con->bad_page_cnt_threshold;
474	} else {
475	hdr->first_rec_offset = RAS_RECORD_START;
476	hdr->tbl_size = RAS_TABLE_HEADER_SIZE;
477
478	control->ras_record_offset = RAS_RECORD_START;
479	control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
480	}
481
482	csum = __calc_hdr_byte_sum(control);
483	if (hdr->version >= RAS_TABLE_VER_V2_1)
484	csum += __calc_ras_info_byte_sum(control);
485	csum = -csum;
486	hdr->checksum = csum;
487	res = __write_table_header(control);
488	if (!res && hdr->version > RAS_TABLE_VER_V1)
489	res = __write_table_ras_info(control);
490	} else {
491	res = amdgpu_ras_smu_erase_ras_table(adev, result: &erase_res);
492	if (res \|\| erase_res) {
493	dev_warn(adev->dev, "RAS EEPROM reset failed, res:%d result:%d",
494	res, erase_res);
495	if (!res)
496	res = -EIO;
497	}
498	}
499
500	control->ras_num_recs = `0`;
501	control->ras_num_bad_pages = `0`;
502	control->ras_num_mca_recs = `0`;
503	control->ras_num_pa_recs = `0`;
504	control->ras_fri = `0`;
505
506	amdgpu_dpm_send_hbm_bad_pages_num(adev, size: control->ras_num_bad_pages);
507
508	control->bad_channel_bitmap = `0`;
509	amdgpu_dpm_send_hbm_bad_channel_flag(adev, size: control->bad_channel_bitmap);
510	con->update_channel_flag = false;
511
512	amdgpu_ras_debugfs_set_ret_size(control);
513
514	mutex_unlock(lock: &control->ras_tbl_mutex);
515
516	return res;
517	}
518
519	static void
520	__encode_table_record_to_buf(struct amdgpu_ras_eeprom_control *control,
521	struct eeprom_table_record *record,
522	unsigned char *buf)
523	{
524	__le64 tmp = `0`;
525	int i = `0`;
526
527	/ Next are all record fields according to EEPROM page spec in LE foramt /
528	buf[i++] = record->err_type;
529
530	buf[i++] = record->bank;
531
532	tmp = cpu_to_le64(record->ts);
533	memcpy(buf + i, &tmp, `8`);
534	i += `8`;
535
536	tmp = cpu_to_le64((record->offset & `0xffffffffffff`));
537	memcpy(buf + i, &tmp, `6`);
538	i += `6`;
539
540	buf[i++] = record->mem_channel;
541	buf[i++] = record->mcumc_id;
542
543	tmp = cpu_to_le64((record->retired_page & `0xffffffffffff`));
544	memcpy(buf + i, &tmp, `6`);
545	}
546
547	static void
548	__decode_table_record_from_buf(struct amdgpu_ras_eeprom_control *control,
549	struct eeprom_table_record *record,
550	unsigned char *buf)
551	{
552	__le64 tmp = `0`;
553	int i = `0`;
554
555	/ Next are all record fields according to EEPROM page spec in LE foramt /
556	record->err_type = buf[i++];
557
558	record->bank = buf[i++];
559
560	memcpy(&tmp, buf + i, `8`);
561	record->ts = le64_to_cpu(tmp);
562	i += `8`;
563
564	memcpy(&tmp, buf + i, `6`);
565	record->offset = (le64_to_cpu(tmp) & `0xffffffffffff`);
566	i += `6`;
567
568	record->mem_channel = buf[i++];
569	record->mcumc_id = buf[i++];
570
571	memcpy(&tmp, buf + i, `6`);
572	record->retired_page = (le64_to_cpu(tmp) & `0xffffffffffff`);
573	}
574
575	bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
576	{
577	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
578
579	if (amdgpu_uniras_enabled(adev))
580	return amdgpu_ras_mgr_check_eeprom_safety_watermark(adev);
581
582	if (!__is_ras_eeprom_supported(adev) \|\|
583	!amdgpu_bad_page_threshold)
584	return false;
585
586	/ skip check eeprom table for VEGA20 Gaming /
587	if (!con)
588	return false;
589	else
590	if (!(con->features & BIT(AMDGPU_RAS_BLOCK__UMC)))
591	return false;
592
593	if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
594	if (con->eeprom_control.ras_num_bad_pages > con->bad_page_cnt_threshold)
595	dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
596	con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
597	if ((amdgpu_bad_page_threshold == -`1`) \|\|
598	(amdgpu_bad_page_threshold == -`2`)) {
599	dev_warn(adev->dev,
600	"Please consult AMD Service Action Guide (SAG) for appropriate service procedures.\n");
601	return false;
602	} else {
603	dev_warn(adev->dev,
604	"Please consider adjusting the customized threshold.\n");
605	return true;
606	}
607	}
608
609	return false;
610	}
611
612	/**
613	* __amdgpu_ras_eeprom_write -- write indexed from buffer to EEPROM
614	* @control: pointer to control structure
615	* @buf: pointer to buffer containing data to write
616	* @fri: start writing at this index
617	* @num: number of records to write
618	*
619	* The caller must hold the table mutex in @control.
620	* Return 0 on success, -errno otherwise.
621	*/
622	static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
623	u8 buf, const* u32 fri, const u32 num)
624	{
625	struct amdgpu_device *adev = to_amdgpu_device(control);
626	u32 buf_size;
627	int res;
628
629	/ i2c may be unstable in gpu reset /
630	down_read(sem: &adev->reset_domain->sem);
631	buf_size = num * RAS_TABLE_RECORD_SIZE;
632	res = amdgpu_eeprom_write(i2c_adap: adev->pm.ras_eeprom_i2c_bus,
633	eeprom_addr: control->i2c_address +
634	RAS_INDEX_TO_OFFSET(control, fri),
635	eeprom_buf: buf, bytes: buf_size);
636	up_read(sem: &adev->reset_domain->sem);
637	if (res < `0`) {
638	dev_err(adev->dev, "Writing %d EEPROM table records error:%d",
639	num, res);
640	} else if (res < buf_size) {
641	/ Short write, return error.*
642	*/
643	dev_err(adev->dev, "Wrote %d records out of %d",
644	res / RAS_TABLE_RECORD_SIZE, num);
645	res = -EIO;
646	} else {
647	res = `0`;
648	}
649
650	return res;
651	}
652
653	static int
654	amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
655	struct eeprom_table_record *record,
656	const u32 num)
657	{
658	struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
659	struct amdgpu_device *adev = to_amdgpu_device(control);
660	u32 a, b, i;
661	u8 buf, pp;
662	int res;
663
664	buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
665	if (!buf)
666	return -ENOMEM;
667
668	/ Encode all of them in one go.*
669	*/
670	pp = buf;
671	for (i = `0`; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
672	__encode_table_record_to_buf(control, record: &record[i], buf: pp);
673
674	/ update bad channel bitmap /
675	if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) &&
676	!(control->bad_channel_bitmap & (`1` << record[i].mem_channel))) {
677	control->bad_channel_bitmap \|= `1` << record[i].mem_channel;
678	con->update_channel_flag = true;
679	}
680	}
681
682	/ a, first record index to write into.*
683	* b, last record index to write into.
684	* a = first index to read (fri) + number of records in the table,
685	* b = a + @num - 1.
686	* Let N = control->ras_max_num_record_count, then we have,
687	* case 0: 0 <= a <= b < N,
688	* just append @num records starting at a;
689	* case 1: 0 <= a < N <= b,
690	* append (N - a) records starting at a, and
691	* append the remainder, b % N + 1, starting at 0.
692	* case 2: 0 <= fri < N <= a <= b, then modulo N we get two subcases,
693	* case 2a: 0 <= a <= b < N
694	* append num records starting at a; and fix fri if b overwrote it,
695	* and since a <= b, if b overwrote it then a must've also,
696	* and if b didn't overwrite it, then a didn't also.
697	* case 2b: 0 <= b < a < N
698	* write num records starting at a, which wraps around 0=N
699	* and overwrite fri unconditionally. Now from case 2a,
700	* this means that b eclipsed fri to overwrite it and wrap
701	* around 0 again, i.e. b = 2N+r pre modulo N, so we unconditionally
702	* set fri = b + 1 (mod N).
703	* Now, since fri is updated in every case, except the trivial case 0,
704	* the number of records present in the table after writing, is,
705	* num_recs - 1 = b - fri (mod N), and we take the positive value,
706	* by adding an arbitrary multiple of N before taking the modulo N
707	* as shown below.
708	*/
709	a = control->ras_fri + control->ras_num_recs;
710	b = a + num - `1`;
711	if (b < control->ras_max_record_count) {
712	res = __amdgpu_ras_eeprom_write(control, buf, fri: a, num);
713	} else if (a < control->ras_max_record_count) {
714	u32 g0, g1;
715
716	g0 = control->ras_max_record_count - a;
717	g1 = b % control->ras_max_record_count + `1`;
718	res = __amdgpu_ras_eeprom_write(control, buf, fri: a, num: g0);
719	if (res)
720	goto Out;
721	res = __amdgpu_ras_eeprom_write(control,
722	buf: buf + g0 * RAS_TABLE_RECORD_SIZE,
723	fri: `0`, num: g1);
724	if (res)
725	goto Out;
726	if (g1 > control->ras_fri)
727	control->ras_fri = g1 % control->ras_max_record_count;
728	} else {
729	a %= control->ras_max_record_count;
730	b %= control->ras_max_record_count;
731
732	if (a <= b) {
733	/ Note that, b - a + 1 = num. /
734	res = __amdgpu_ras_eeprom_write(control, buf, fri: a, num);
735	if (res)
736	goto Out;
737	if (b >= control->ras_fri)
738	control->ras_fri = (b + `1`) % control->ras_max_record_count;
739	} else {
740	u32 g0, g1;
741
742	/ b < a, which means, we write from*
743	* a to the end of the table, and from
744	* the start of the table to b.
745	*/
746	g0 = control->ras_max_record_count - a;
747	g1 = b + `1`;
748	res = __amdgpu_ras_eeprom_write(control, buf, fri: a, num: g0);
749	if (res)
750	goto Out;
751	res = __amdgpu_ras_eeprom_write(control,
752	buf: buf + g0 * RAS_TABLE_RECORD_SIZE,
753	fri: `0`, num: g1);
754	if (res)
755	goto Out;
756	control->ras_fri = g1 % control->ras_max_record_count;
757	}
758	}
759	control->ras_num_recs = `1` + (control->ras_max_record_count + b
760	- control->ras_fri)
761	% control->ras_max_record_count;
762
763	/old asics only save pa to eeprom like before/
764	if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, `0`)) < `12`)
765	control->ras_num_pa_recs += num;
766	else
767	control->ras_num_mca_recs += num;
768
769	control->ras_num_bad_pages = con->bad_page_num;
770	Out:
771	kfree(objp: buf);
772	return res;
773	}
774
775	static int
776	amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
777	{
778	struct amdgpu_device *adev = to_amdgpu_device(control);
779	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
780	u8 buf, pp, csum;
781	u32 buf_size;
782	int res;
783
784	/ Modify the header if it exceeds.*
785	*/
786	if (amdgpu_bad_page_threshold != `0` &&
787	control->ras_num_bad_pages > ras->bad_page_cnt_threshold) {
788	dev_warn(adev->dev,
789	"Saved bad pages %d reaches threshold value %d\n",
790	control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
791
792	if (adev->cper.enabled && !amdgpu_uniras_enabled(adev) &&
793	amdgpu_cper_generate_bp_threshold_record(adev))
794	dev_warn(adev->dev, "fail to generate bad page threshold cper records\n");
795
796	if ((amdgpu_bad_page_threshold != -`1`) &&
797	(amdgpu_bad_page_threshold != -`2`)) {
798	control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
799	if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
800	control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
801	control->tbl_rai.health_percent = `0`;
802	}
803	ras->is_rma = true;
804	}
805
806	/ ignore the -ENOTSUPP return value /
807	amdgpu_dpm_send_rma_reason(adev);
808	}
809
810	if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
811	control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
812	RAS_TABLE_V2_1_INFO_SIZE +
813	control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
814	else
815	control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
816	control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
817	control->tbl_hdr.checksum = `0`;
818
819	buf_size = control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
820	buf = kcalloc(control->ras_num_recs, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
821	if (!buf) {
822	dev_err(adev->dev,
823	"allocating memory for table of size %d bytes failed\n",
824	control->tbl_hdr.tbl_size);
825	res = -ENOMEM;
826	goto Out;
827	}
828
829	down_read(sem: &adev->reset_domain->sem);
830	res = amdgpu_eeprom_read(i2c_adap: adev->pm.ras_eeprom_i2c_bus,
831	eeprom_addr: control->i2c_address +
832	control->ras_record_offset,
833	eeprom_buf: buf, bytes: buf_size);
834	up_read(sem: &adev->reset_domain->sem);
835	if (res < `0`) {
836	dev_err(adev->dev, "EEPROM failed reading records:%d\n", res);
837	goto Out;
838	} else if (res < buf_size) {
839	dev_err(adev->dev, "EEPROM read %d out of %d bytes\n", res,
840	buf_size);
841	res = -EIO;
842	goto Out;
843	}
844
845	/**
846	* bad page records have been stored in eeprom,
847	* now calculate gpu health percent
848	*/
849	if (amdgpu_bad_page_threshold != `0` &&
850	control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
851	control->ras_num_bad_pages <= ras->bad_page_cnt_threshold)
852	control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
853	control->ras_num_bad_pages) * `100`) /
854	ras->bad_page_cnt_threshold;
855
856	/ Recalc the checksum.*
857	*/
858	csum = `0`;
859	for (pp = buf; pp < buf + buf_size; pp++)
860	csum += *pp;
861
862	csum += __calc_hdr_byte_sum(control);
863	if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
864	csum += __calc_ras_info_byte_sum(control);
865	/ avoid sign extension when assigning to "checksum" /
866	csum = -csum;
867	control->tbl_hdr.checksum = csum;
868	res = __write_table_header(control);
869	if (!res && control->tbl_hdr.version > RAS_TABLE_VER_V1)
870	res = __write_table_ras_info(control);
871	Out:
872	kfree(objp: buf);
873	return res;
874	}
875
876	int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control)
877	{
878	struct amdgpu_device *adev = to_amdgpu_device(control);
879	int ret, retry = `20`;
880
881	if (!amdgpu_ras_smu_eeprom_supported(adev))
882	return `0`;
883
884	control->ras_num_recs_old = control->ras_num_recs;
885
886	do {
887	/ 1000ms timeout is long enough, smu_get_badpage_count won't*
888	* return -EBUSY before timeout.
889	*/
890	ret = amdgpu_ras_smu_get_badpage_count(adev,
891	count: &(control->ras_num_recs), RAS_SMU_MESSAGE_TIMEOUT_MS);
892	if (!ret &&
893	(control->ras_num_recs_old == control->ras_num_recs)) {
894	/ record number update in PMFW needs some time,*
895	* smu_get_badpage_count may return immediately without
896	* count update, sleep for a while and retry again.
897	*/
898	msleep(msecs: `50`);
899	retry--;
900	} else {
901	break;
902	}
903	} while (retry);
904
905	/ no update of record number is not a real failure,*
906	* don't print warning here
907	*/
908	if (!ret && (control->ras_num_recs_old == control->ras_num_recs))
909	ret = -EINVAL;
910
911	return ret;
912	}
913
914	static int amdgpu_ras_smu_eeprom_append(struct amdgpu_ras_eeprom_control *control)
915	{
916	struct amdgpu_device *adev = to_amdgpu_device(control);
917	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
918
919	if (!amdgpu_ras_smu_eeprom_supported(adev) \|\| !con)
920	return `0`;
921
922	control->ras_num_bad_pages = con->bad_page_num;
923
924	if (amdgpu_bad_page_threshold != `0` &&
925	control->ras_num_bad_pages > con->bad_page_cnt_threshold) {
926	dev_warn(adev->dev,
927	"Saved bad pages %d reaches threshold value %d\n",
928	control->ras_num_bad_pages, con->bad_page_cnt_threshold);
929
930	if (adev->cper.enabled && amdgpu_cper_generate_bp_threshold_record(adev))
931	dev_warn(adev->dev, "fail to generate bad page threshold cper records\n");
932
933	if ((amdgpu_bad_page_threshold != -`1`) &&
934	(amdgpu_bad_page_threshold != -`2`))
935	con->is_rma = true;
936	}
937
938	return `0`;
939	}
940
941	/**
942	* amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table
943	* @control: pointer to control structure
944	* @record: array of records to append
945	* @num: number of records in @record array
946	*
947	* Append @num records to the table, calculate the checksum and write
948	* the table back to EEPROM. The maximum number of records that
949	* can be appended is between 1 and control->ras_max_record_count,
950	* regardless of how many records are already stored in the table.
951	*
952	* Return 0 on success or if EEPROM is not supported, -errno on error.
953	*/
954	int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
955	struct eeprom_table_record *record,
956	const u32 num)
957	{
958	struct amdgpu_device *adev = to_amdgpu_device(control);
959	int res, i;
960	uint64_t nps = AMDGPU_NPS1_PARTITION_MODE;
961
962	if (!__is_ras_eeprom_supported(adev))
963	return `0`;
964
965	if (amdgpu_ras_smu_eeprom_supported(adev))
966	return amdgpu_ras_smu_eeprom_append(control);
967
968	if (num == `0`) {
969	dev_err(adev->dev, "will not append 0 records\n");
970	return -EINVAL;
971	} else if (num > control->ras_max_record_count) {
972	dev_err(adev->dev,
973	"cannot append %d records than the size of table %d\n",
974	num, control->ras_max_record_count);
975	return -EINVAL;
976	}
977
978	if (adev->gmc.gmc_funcs->query_mem_partition_mode)
979	nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
980
981	/ set the new channel index flag /
982	for (i = `0`; i < num; i++)
983	record[i].retired_page \|= (nps << UMC_NPS_SHIFT);
984
985	mutex_lock(&control->ras_tbl_mutex);
986
987	res = amdgpu_ras_eeprom_append_table(control, record, num);
988	if (!res)
989	res = amdgpu_ras_eeprom_update_header(control);
990	if (!res)
991	amdgpu_ras_debugfs_set_ret_size(control);
992
993	mutex_unlock(lock: &control->ras_tbl_mutex);
994
995	/ clear channel index flag, the flag is only saved on eeprom /
996	for (i = `0`; i < num; i++)
997	record[i].retired_page &= ~(nps << UMC_NPS_SHIFT);
998
999	return res;
1000	}
1001
1002	/**
1003	* __amdgpu_ras_eeprom_read -- read indexed from EEPROM into buffer
1004	* @control: pointer to control structure
1005	* @buf: pointer to buffer to read into
1006	* @fri: first record index, start reading at this index, absolute index
1007	* @num: number of records to read
1008	*
1009	* The caller must hold the table mutex in @control.
1010	* Return 0 on success, -errno otherwise.
1011	*/
1012	static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
1013	u8 buf, const* u32 fri, const u32 num)
1014	{
1015	struct amdgpu_device *adev = to_amdgpu_device(control);
1016	u32 buf_size;
1017	int res;
1018
1019	/ i2c may be unstable in gpu reset /
1020	down_read(sem: &adev->reset_domain->sem);
1021	buf_size = num * RAS_TABLE_RECORD_SIZE;
1022	res = amdgpu_eeprom_read(i2c_adap: adev->pm.ras_eeprom_i2c_bus,
1023	eeprom_addr: control->i2c_address +
1024	RAS_INDEX_TO_OFFSET(control, fri),
1025	eeprom_buf: buf, bytes: buf_size);
1026	up_read(sem: &adev->reset_domain->sem);
1027	if (res < `0`) {
1028	dev_err(adev->dev, "Reading %d EEPROM table records error:%d",
1029	num, res);
1030	} else if (res < buf_size) {
1031	/ Short read, return error.*
1032	*/
1033	dev_err(adev->dev, "Read %d records out of %d",
1034	res / RAS_TABLE_RECORD_SIZE, num);
1035	res = -EIO;
1036	} else {
1037	res = `0`;
1038	}
1039
1040	return res;
1041	}
1042
1043	int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
1044	struct eeprom_table_record *record, u32 rec_idx,
1045	const u32 num)
1046	{
1047	struct amdgpu_device *adev = to_amdgpu_device(control);
1048	uint64_t ts, end_idx;
1049	int i, ret;
1050	u64 mca, ipid;
1051
1052	if (!amdgpu_ras_smu_eeprom_supported(adev))
1053	return `0`;
1054
1055	if (!adev->umc.ras \|\| !adev->umc.ras->mca_ipid_parse)
1056	return -EOPNOTSUPP;
1057
1058	end_idx = rec_idx + num;
1059	for (i = rec_idx; i < end_idx; i++) {
1060	ret = amdgpu_ras_smu_get_badpage_mca_addr(adev, index: i, mca_addr: &mca);
1061	if (ret)
1062	return ret;
1063
1064	ret = amdgpu_ras_smu_get_badpage_ipid(adev, index: i, ipid: &ipid);
1065	if (ret)
1066	return ret;
1067
1068	ret = amdgpu_ras_smu_get_timestamp(adev, index: i, timestamp: &ts);
1069	if (ret)
1070	return ret;
1071
1072	record[i - rec_idx].address = mca;
1073	/ retired_page (pa) is unused now /
1074	record[i - rec_idx].retired_page = `0x1ULL`;
1075	record[i - rec_idx].ts = ts;
1076	record[i - rec_idx].err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
1077
1078	adev->umc.ras->mca_ipid_parse(adev, ipid,
1079	(uint32_t *)&(record[i - rec_idx].cu),
1080	(uint32_t *)&(record[i - rec_idx].mem_channel),
1081	(uint32_t *)&(record[i - rec_idx].mcumc_id), NULL);
1082	}
1083
1084	return `0`;
1085	}
1086
1087	/**
1088	* amdgpu_ras_eeprom_read -- read EEPROM
1089	* @control: pointer to control structure
1090	* @record: array of records to read into
1091	* @num: number of records in @record
1092	*
1093	* Reads num records from the RAS table in EEPROM and
1094	* writes the data into @record array.
1095	*
1096	* Returns 0 on success, -errno on error.
1097	*/
1098	int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
1099	struct eeprom_table_record *record,
1100	const u32 num)
1101	{
1102	struct amdgpu_device *adev = to_amdgpu_device(control);
1103	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1104	int i, res;
1105	u8 buf, pp;
1106	u32 g0, g1;
1107
1108	if (amdgpu_ras_smu_eeprom_supported(adev))
1109	return amdgpu_ras_eeprom_read_idx(control, record, rec_idx: `0`, num);
1110
1111	if (!__is_ras_eeprom_supported(adev))
1112	return `0`;
1113
1114	if (num == `0`) {
1115	dev_err(adev->dev, "will not read 0 records\n");
1116	return -EINVAL;
1117	} else if (num > control->ras_num_recs) {
1118	dev_err(adev->dev, "too many records to read:%d available:%d\n",
1119	num, control->ras_num_recs);
1120	return -EINVAL;
1121	}
1122
1123	buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
1124	if (!buf)
1125	return -ENOMEM;
1126
1127	/ Determine how many records to read, from the first record*
1128	* index, fri, to the end of the table, and from the beginning
1129	* of the table, such that the total number of records is
1130	* @num, and we handle wrap around when fri > 0 and
1131	* fri + num > RAS_MAX_RECORD_COUNT.
1132	*
1133	* First we compute the index of the last element
1134	* which would be fetched from each region,
1135	* g0 is in [fri, fri + num - 1], and
1136	* g1 is in [0, RAS_MAX_RECORD_COUNT - 1].
1137	* Then, if g0 < RAS_MAX_RECORD_COUNT, the index of
1138	* the last element to fetch, we set g0 to _the number_
1139	* of elements to fetch, @num, since we know that the last
1140	* indexed to be fetched does not exceed the table.
1141	*
1142	* If, however, g0 >= RAS_MAX_RECORD_COUNT, then
1143	* we set g0 to the number of elements to read
1144	* until the end of the table, and g1 to the number of
1145	* elements to read from the beginning of the table.
1146	*/
1147	g0 = control->ras_fri + num - `1`;
1148	g1 = g0 % control->ras_max_record_count;
1149	if (g0 < control->ras_max_record_count) {
1150	g0 = num;
1151	g1 = `0`;
1152	} else {
1153	g0 = control->ras_max_record_count - control->ras_fri;
1154	g1 += `1`;
1155	}
1156
1157	mutex_lock(&control->ras_tbl_mutex);
1158	res = __amdgpu_ras_eeprom_read(control, buf, fri: control->ras_fri, num: g0);
1159	if (res)
1160	goto Out;
1161	if (g1) {
1162	res = __amdgpu_ras_eeprom_read(control,
1163	buf: buf + g0 * RAS_TABLE_RECORD_SIZE,
1164	fri: `0`, num: g1);
1165	if (res)
1166	goto Out;
1167	}
1168
1169	res = `0`;
1170
1171	/ Read up everything? Then transform.*
1172	*/
1173	pp = buf;
1174	for (i = `0`; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
1175	__decode_table_record_from_buf(control, record: &record[i], buf: pp);
1176
1177	/ update bad channel bitmap /
1178	if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) &&
1179	!(control->bad_channel_bitmap & (`1` << record[i].mem_channel))) {
1180	control->bad_channel_bitmap \|= `1` << record[i].mem_channel;
1181	con->update_channel_flag = true;
1182	}
1183	}
1184	Out:
1185	kfree(objp: buf);
1186	mutex_unlock(lock: &control->ras_tbl_mutex);
1187
1188	return res;
1189	}
1190
1191	uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control)
1192	{
1193	/ get available eeprom table version first before eeprom table init /
1194	amdgpu_ras_set_eeprom_table_version(control);
1195
1196	if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
1197	return RAS_MAX_RECORD_COUNT_V2_1;
1198	else
1199	return RAS_MAX_RECORD_COUNT;
1200	}
1201
1202	static ssize_t
1203	amdgpu_ras_debugfs_eeprom_size_read(struct file f, char* __user *buf,
1204	size_t size, loff_t *pos)
1205	{
1206	struct amdgpu_device adev = (struct* amdgpu_device *)file_inode(f)->i_private;
1207	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1208	struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
1209	u8 data[`50`];
1210	int res;
1211
1212	if (!size)
1213	return size;
1214
1215	if (!ras \|\| !control) {
1216	res = snprintf(buf: data, size: sizeof(data), fmt: "Not supported\n");
1217	} else {
1218	res = snprintf(buf: data, size: sizeof(data), fmt: "%d bytes or %d records\n",
1219	RAS_TBL_SIZE_BYTES, control->ras_max_record_count);
1220	}
1221
1222	if (*pos >= res)
1223	return `0`;
1224
1225	res -= *pos;
1226	res = min_t(size_t, res, size);
1227
1228	if (copy_to_user(to: buf, from: &data[*pos], n: res))
1229	return -EFAULT;
1230
1231	*pos += res;
1232
1233	return res;
1234	}
1235
1236	const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops = {
1237	.owner = THIS_MODULE,
1238	.read = amdgpu_ras_debugfs_eeprom_size_read,
1239	.write = NULL,
1240	.llseek = default_llseek,
1241	};
1242
/*
 * Text layout of the debugfs dump of the EEPROM table.
 *
 * tbl_hdr_fmt_size and rec_hdr_fmt_size are the exact number of
 * characters, newline included, that tbl_hdr_fmt and rec_hdr_fmt
 * produce for one line: each term in the sums is one column width or
 * one separating space. The reader copies that many bytes per line,
 * so every conversion below is padded with spaces up to its column
 * width, and the title strings are right-aligned on the same columns.
 */
static const char *tbl_hdr_str = " Signature    Version  FirstOffs       Size   Checksum\n";
static const char *tbl_hdr_fmt = "0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n";
#define tbl_hdr_fmt_size (5 * (2+8) + 4 + 1)
static const char *rec_hdr_str = "Index  Offset ErrType Bank/CU          TimeStamp      Offs/Addr MemChl MCUMCID    RetiredPage\n";
static const char *rec_hdr_fmt = "%5d 0x%05X %7s    0x%02X 0x%016llX 0x%012llX   0x%02X    0x%02X 0x%012llX\n";
#define rec_hdr_fmt_size (5 + 1 + 7 + 1 + 7 + 1 + 7 + 1 + 18 + 1 + 14 + 1 + 6 + 1 + 7 + 1 + 14 + 1)
1249
1250	static const char *record_err_type_str[AMDGPU_RAS_EEPROM_ERR_COUNT] = {
1251	"ignore",
1252	"re",
1253	"ue",
1254	};
1255
1256	static loff_t amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control *control)
1257	{
1258	return strlen(tbl_hdr_str) + tbl_hdr_fmt_size +
1259	strlen(rec_hdr_str) + rec_hdr_fmt_size * control->ras_num_recs;
1260	}
1261
1262	void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control)
1263	{
1264	struct amdgpu_ras ras = container_of(control, struct* amdgpu_ras,
1265	eeprom_control);
1266	struct dentry *de = ras->de_ras_eeprom_table;
1267
1268	if (de)
1269	d_inode(dentry: de)->i_size = amdgpu_ras_debugfs_table_size(control);
1270	}
1271
1272	static ssize_t amdgpu_ras_debugfs_table_read(struct file f, char* __user *buf,
1273	size_t size, loff_t *pos)
1274	{
1275	struct amdgpu_device adev = (struct* amdgpu_device *)file_inode(f)->i_private;
1276	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1277	struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control;
1278	const size_t orig_size = size;
1279	int res = -EFAULT;
1280	size_t data_len;
1281
1282	/ pmfw manages eeprom data by itself /
1283	if (amdgpu_ras_smu_eeprom_supported(adev))
1284	return `0`;
1285
1286	mutex_lock(&control->ras_tbl_mutex);
1287
1288	/ We want pos - data_len > 0, which means there's
1289	* bytes to be printed from data.
1290	*/
1291	data_len = strlen(tbl_hdr_str);
1292	if (*pos < data_len) {
1293	data_len -= *pos;
1294	data_len = min_t(size_t, data_len, size);
1295	if (copy_to_user(to: buf, from: &tbl_hdr_str[*pos], n: data_len))
1296	goto Out;
1297	buf += data_len;
1298	size -= data_len;
1299	*pos += data_len;
1300	}
1301
1302	data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size;
1303	if (*pos < data_len && size > `0`) {
1304	u8 data[tbl_hdr_fmt_size + `1`];
1305	loff_t lpos;
1306
1307	snprintf(buf: data, size: sizeof(data), fmt: tbl_hdr_fmt,
1308	control->tbl_hdr.header,
1309	control->tbl_hdr.version,
1310	control->tbl_hdr.first_rec_offset,
1311	control->tbl_hdr.tbl_size,
1312	control->tbl_hdr.checksum);
1313
1314	data_len -= *pos;
1315	data_len = min_t(size_t, data_len, size);
1316	lpos = *pos - strlen(tbl_hdr_str);
1317	if (copy_to_user(to: buf, from: &data[lpos], n: data_len))
1318	goto Out;
1319	buf += data_len;
1320	size -= data_len;
1321	*pos += data_len;
1322	}
1323
1324	data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size + strlen(rec_hdr_str);
1325	if (*pos < data_len && size > `0`) {
1326	loff_t lpos;
1327
1328	data_len -= *pos;
1329	data_len = min_t(size_t, data_len, size);
1330	lpos = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size;
1331	if (copy_to_user(to: buf, from: &rec_hdr_str[lpos], n: data_len))
1332	goto Out;
1333	buf += data_len;
1334	size -= data_len;
1335	*pos += data_len;
1336	}
1337
1338	data_len = amdgpu_ras_debugfs_table_size(control);
1339	if (*pos < data_len && size > `0`) {
1340	u8 dare[RAS_TABLE_RECORD_SIZE];
1341	u8 data[rec_hdr_fmt_size + `1`];
1342	struct eeprom_table_record record;
1343	int s, r;
1344
1345	/ Find the starting record index*
1346	*/
1347	s = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
1348	strlen(rec_hdr_str);
1349	s = s / rec_hdr_fmt_size;
1350	r = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
1351	strlen(rec_hdr_str);
1352	r = r % rec_hdr_fmt_size;
1353
1354	for ( ; size > `0` && s < control->ras_num_recs; s++) {
1355	u32 ai = RAS_RI_TO_AI(control, s);
1356	/ Read a single record*
1357	*/
1358	res = __amdgpu_ras_eeprom_read(control, buf: dare, fri: ai, num: `1`);
1359	if (res)
1360	goto Out;
1361	__decode_table_record_from_buf(control, record: &record, buf: dare);
1362	snprintf(buf: data, size: sizeof(data), fmt: rec_hdr_fmt,
1363	s,
1364	RAS_INDEX_TO_OFFSET(control, ai),
1365	record_err_type_str[record.err_type],
1366	record.bank,
1367	record.ts,
1368	record.offset,
1369	record.mem_channel,
1370	record.mcumc_id,
1371	record.retired_page);
1372
1373	data_len = min_t(size_t, rec_hdr_fmt_size - r, size);
1374	if (copy_to_user(to: buf, from: &data[r], n: data_len)) {
1375	res = -EFAULT;
1376	goto Out;
1377	}
1378	buf += data_len;
1379	size -= data_len;
1380	*pos += data_len;
1381	r = `0`;
1382	}
1383	}
1384	res = `0`;
1385	Out:
1386	mutex_unlock(lock: &control->ras_tbl_mutex);
1387	return res < `0` ? res : orig_size - size;
1388	}
1389
1390	static ssize_t
1391	amdgpu_ras_debugfs_eeprom_table_read(struct file f, char* __user *buf,
1392	size_t size, loff_t *pos)
1393	{
1394	struct amdgpu_device adev = (struct* amdgpu_device *)file_inode(f)->i_private;
1395	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1396	struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
1397	u8 data[`81`];
1398	int res;
1399
1400	if (!size)
1401	return size;
1402
1403	if (!ras \|\| !control) {
1404	res = snprintf(buf: data, size: sizeof(data), fmt: "Not supported\n");
1405	if (*pos >= res)
1406	return `0`;
1407
1408	res -= *pos;
1409	res = min_t(size_t, res, size);
1410
1411	if (copy_to_user(to: buf, from: &data[*pos], n: res))
1412	return -EFAULT;
1413
1414	*pos += res;
1415
1416	return res;
1417	} else {
1418	return amdgpu_ras_debugfs_table_read(f, buf, size, pos);
1419	}
1420	}
1421
1422	const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops = {
1423	.owner = THIS_MODULE,
1424	.read = amdgpu_ras_debugfs_eeprom_table_read,
1425	.write = NULL,
1426	.llseek = default_llseek,
1427	};
1428
1429	/**
1430	* __verify_ras_table_checksum -- verify the RAS EEPROM table checksum
1431	* @control: pointer to control structure
1432	*
1433	* Check the checksum of the stored in EEPROM RAS table.
1434	*
1435	* Return 0 if the checksum is correct,
1436	* positive if it is not correct, and
1437	* -errno on I/O error.
1438	*/
1439	static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control)
1440	{
1441	struct amdgpu_device *adev = to_amdgpu_device(control);
1442	int buf_size, res;
1443	u8 csum, buf, pp;
1444
1445	if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
1446	buf_size = RAS_TABLE_HEADER_SIZE +
1447	RAS_TABLE_V2_1_INFO_SIZE +
1448	control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
1449	else
1450	buf_size = RAS_TABLE_HEADER_SIZE +
1451	control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
1452
1453	buf = kzalloc(buf_size, GFP_KERNEL);
1454	if (!buf) {
1455	dev_err(adev->dev,
1456	"Out of memory checking RAS table checksum.\n");
1457	return -ENOMEM;
1458	}
1459
1460	res = amdgpu_eeprom_read(i2c_adap: adev->pm.ras_eeprom_i2c_bus,
1461	eeprom_addr: control->i2c_address +
1462	control->ras_header_offset,
1463	eeprom_buf: buf, bytes: buf_size);
1464	if (res < buf_size) {
1465	dev_err(adev->dev, "Partial read for checksum, res:%d\n", res);
1466	/ On partial reads, return -EIO.*
1467	*/
1468	if (res >= `0`)
1469	res = -EIO;
1470	goto Out;
1471	}
1472
1473	csum = `0`;
1474	for (pp = buf; pp < buf + buf_size; pp++)
1475	csum += *pp;
1476	Out:
1477	kfree(objp: buf);
1478	return res < `0` ? res : csum;
1479	}
1480
1481	static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control)
1482	{
1483	struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
1484	struct amdgpu_device *adev = to_amdgpu_device(control);
1485	unsigned char *buf;
1486	int res;
1487
1488	buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL);
1489	if (!buf) {
1490	dev_err(adev->dev,
1491	"Failed to alloc buf to read EEPROM table ras info\n");
1492	return -ENOMEM;
1493	}
1494
1495	/**
1496	* EEPROM table V2_1 supports ras info,
1497	* read EEPROM table ras info
1498	*/
1499	res = amdgpu_eeprom_read(i2c_adap: adev->pm.ras_eeprom_i2c_bus,
1500	eeprom_addr: control->i2c_address + control->ras_info_offset,
1501	eeprom_buf: buf, RAS_TABLE_V2_1_INFO_SIZE);
1502	if (res < RAS_TABLE_V2_1_INFO_SIZE) {
1503	dev_err(adev->dev,
1504	"Failed to read EEPROM table ras info, res:%d", res);
1505	res = res >= `0` ? -EIO : res;
1506	goto Out;
1507	}
1508
1509	__decode_table_ras_info_from_buf(rai, buf);
1510
1511	Out:
1512	kfree(objp: buf);
1513	return res == RAS_TABLE_V2_1_INFO_SIZE ? `0` : res;
1514	}
1515
1516	static int amdgpu_ras_smu_eeprom_init(struct amdgpu_ras_eeprom_control *control)
1517	{
1518	struct amdgpu_device *adev = to_amdgpu_device(control);
1519	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
1520	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1521	uint64_t local_time;
1522	int res;
1523
1524	ras->is_rma = false;
1525
1526	if (!__is_ras_eeprom_supported(adev))
1527	return `0`;
1528	mutex_init(&control->ras_tbl_mutex);
1529
1530	res = amdgpu_ras_smu_get_table_version(adev, table_version: &(hdr->version));
1531	if (res)
1532	return res;
1533
1534	res = amdgpu_ras_smu_get_badpage_count(adev,
1535	count: &(control->ras_num_recs), timeout: `100`);
1536	if (res)
1537	return res;
1538
1539	local_time = (uint64_t)ktime_get_real_seconds();
1540	res = amdgpu_ras_smu_set_timestamp(adev, timestamp: local_time);
1541	if (res)
1542	return res;
1543
1544	control->ras_max_record_count = `4000`;
1545
1546	control->ras_num_mca_recs = `0`;
1547	control->ras_num_pa_recs = `0`;
1548
1549	return `0`;
1550	}
1551
1552	int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
1553	{
1554	struct amdgpu_device *adev = to_amdgpu_device(control);
1555	unsigned char buf[RAS_TABLE_HEADER_SIZE] = { `0` };
1556	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
1557	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1558	int res;
1559
1560	if (amdgpu_ras_smu_eeprom_supported(adev))
1561	return amdgpu_ras_smu_eeprom_init(control);
1562
1563	ras->is_rma = false;
1564
1565	if (!__is_ras_eeprom_supported(adev))
1566	return `0`;
1567
1568	/ Verify i2c adapter is initialized /
1569	if (!adev->pm.ras_eeprom_i2c_bus \|\| !adev->pm.ras_eeprom_i2c_bus->algo)
1570	return -ENOENT;
1571
1572	if (!__get_eeprom_i2c_addr(adev, control))
1573	return -EINVAL;
1574
1575	control->ras_header_offset = RAS_HDR_START;
1576	control->ras_info_offset = RAS_TABLE_V2_1_INFO_START;
1577	mutex_init(&control->ras_tbl_mutex);
1578
1579	/ Read the table header from EEPROM address /
1580	res = amdgpu_eeprom_read(i2c_adap: adev->pm.ras_eeprom_i2c_bus,
1581	eeprom_addr: control->i2c_address + control->ras_header_offset,
1582	eeprom_buf: buf, RAS_TABLE_HEADER_SIZE);
1583	if (res < RAS_TABLE_HEADER_SIZE) {
1584	dev_err(adev->dev, "Failed to read EEPROM table header, res:%d",
1585	res);
1586	return res >= `0` ? -EIO : res;
1587	}
1588
1589	__decode_table_header_from_buf(hdr, buf);
1590
1591	if (hdr->header != RAS_TABLE_HDR_VAL &&
1592	hdr->header != RAS_TABLE_HDR_BAD) {
1593	dev_info(adev->dev, "Creating a new EEPROM table");
1594	return amdgpu_ras_eeprom_reset_table(control);
1595	}
1596
1597	switch (hdr->version) {
1598	case RAS_TABLE_VER_V2_1:
1599	case RAS_TABLE_VER_V3:
1600	control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
1601	control->ras_record_offset = RAS_RECORD_START_V2_1;
1602	control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
1603	break;
1604	case RAS_TABLE_VER_V1:
1605	control->ras_num_recs = RAS_NUM_RECS(hdr);
1606	control->ras_record_offset = RAS_RECORD_START;
1607	control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
1608	break;
1609	default:
1610	dev_err(adev->dev,
1611	"RAS header invalid, unsupported version: %u",
1612	hdr->version);
1613	return -EINVAL;
1614	}
1615
1616	if (control->ras_num_recs > control->ras_max_record_count) {
1617	dev_err(adev->dev,
1618	"RAS header invalid, records in header: %u max allowed :%u",
1619	control->ras_num_recs, control->ras_max_record_count);
1620	return -EINVAL;
1621	}
1622
1623	control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
1624	control->ras_num_mca_recs = `0`;
1625	control->ras_num_pa_recs = `0`;
1626	return `0`;
1627	}
1628
1629	static int amdgpu_ras_smu_eeprom_check(struct amdgpu_ras_eeprom_control *control)
1630	{
1631	struct amdgpu_device *adev = to_amdgpu_device(control);
1632	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1633
1634	if (!__is_ras_eeprom_supported(adev))
1635	return `0`;
1636
1637	control->ras_num_bad_pages = ras->bad_page_num;
1638
1639	if ((ras->bad_page_cnt_threshold < control->ras_num_bad_pages) &&
1640	amdgpu_bad_page_threshold != `0`) {
1641	dev_warn(adev->dev,
1642	"RAS records:%d exceed threshold:%d\n",
1643	control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
1644	if ((amdgpu_bad_page_threshold == -`1`) \|\|
1645	(amdgpu_bad_page_threshold == -`2`)) {
1646	dev_warn(adev->dev,
1647	"Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
1648	} else {
1649	ras->is_rma = true;
1650	dev_warn(adev->dev,
1651	"User defined threshold is set, runtime service will be halt when threshold is reached\n");
1652	}
1653
1654	return `0`;
1655	}
1656
1657	dev_dbg(adev->dev,
1658	"Found existing EEPROM table with %d records",
1659	control->ras_num_bad_pages);
1660
1661	/ Warn if we are at 90% of the threshold or above*
1662	*/
1663	if (`10` * control->ras_num_bad_pages >= `9` * ras->bad_page_cnt_threshold)
1664	dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
1665	control->ras_num_bad_pages,
1666	ras->bad_page_cnt_threshold);
1667	return `0`;
1668	}
1669
1670	int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
1671	{
1672	struct amdgpu_device *adev = to_amdgpu_device(control);
1673	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
1674	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1675	int res = `0`;
1676
1677	if (amdgpu_ras_smu_eeprom_supported(adev))
1678	return amdgpu_ras_smu_eeprom_check(control);
1679
1680	if (!__is_ras_eeprom_supported(adev))
1681	return `0`;
1682
1683	/ Verify i2c adapter is initialized /
1684	if (!adev->pm.ras_eeprom_i2c_bus \|\| !adev->pm.ras_eeprom_i2c_bus->algo)
1685	return -ENOENT;
1686
1687	if (!__get_eeprom_i2c_addr(adev, control))
1688	return -EINVAL;
1689
1690	control->ras_num_bad_pages = ras->bad_page_num;
1691
1692	if (hdr->header == RAS_TABLE_HDR_VAL) {
1693	dev_dbg(adev->dev,
1694	"Found existing EEPROM table with %d records",
1695	control->ras_num_bad_pages);
1696
1697	if (hdr->version >= RAS_TABLE_VER_V2_1) {
1698	res = __read_table_ras_info(control);
1699	if (res)
1700	return res;
1701	}
1702
1703	res = __verify_ras_table_checksum(control);
1704	if (res)
1705	dev_err(adev->dev,
1706	"RAS table incorrect checksum or error:%d\n",
1707	res);
1708
1709	/ Warn if we are at 90% of the threshold or above*
1710	*/
1711	if (`10` * control->ras_num_bad_pages >= `9` * ras->bad_page_cnt_threshold)
1712	dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
1713	control->ras_num_bad_pages,
1714	ras->bad_page_cnt_threshold);
1715	} else if (hdr->header == RAS_TABLE_HDR_BAD &&
1716	amdgpu_bad_page_threshold != `0`) {
1717	if (hdr->version >= RAS_TABLE_VER_V2_1) {
1718	res = __read_table_ras_info(control);
1719	if (res)
1720	return res;
1721	}
1722
1723	res = __verify_ras_table_checksum(control);
1724	if (res) {
1725	dev_err(adev->dev,
1726	"RAS Table incorrect checksum or error:%d\n",
1727	res);
1728	return -EINVAL;
1729	}
1730	if (ras->bad_page_cnt_threshold >= control->ras_num_bad_pages) {
1731	/ This means that, the threshold was increased since*
1732	* the last time the system was booted, and now,
1733	* ras->bad_page_cnt_threshold - control->num_recs > 0,
1734	* so that at least one more record can be saved,
1735	* before the page count threshold is reached.
1736	*/
1737	dev_info(adev->dev,
1738	"records:%d threshold:%d, resetting "
1739	"RAS table header signature",
1740	control->ras_num_bad_pages,
1741	ras->bad_page_cnt_threshold);
1742	res = amdgpu_ras_eeprom_correct_header_tag(control,
1743	RAS_TABLE_HDR_VAL);
1744	} else {
1745	dev_warn(adev->dev,
1746	"RAS records:%d exceed threshold:%d\n",
1747	control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
1748	if ((amdgpu_bad_page_threshold == -`1`) \|\|
1749	(amdgpu_bad_page_threshold == -`2`)) {
1750	res = `0`;
1751	dev_warn(adev->dev,
1752	"Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
1753	} else {
1754	ras->is_rma = true;
1755	dev_warn(adev->dev,
1756	"User defined threshold is set, runtime service will be halt when threshold is reached\n");
1757	}
1758	}
1759	}
1760
1761	return res < `0` ? res : `0`;
1762	}
1763
1764	void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev)
1765	{
1766	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1767	struct amdgpu_ras_eeprom_control *control;
1768	int res;
1769
1770	if (!__is_ras_eeprom_supported(adev) \|\| !ras \|\|
1771	amdgpu_ras_smu_eeprom_supported(adev))
1772	return;
1773	control = &ras->eeprom_control;
1774	if (!control->is_eeprom_valid)
1775	return;
1776	res = __verify_ras_table_checksum(control);
1777	if (res) {
1778	dev_warn(adev->dev,
1779	"RAS table incorrect checksum or error:%d, try to recover\n",
1780	res);
1781	if (!amdgpu_ras_eeprom_reset_table(control))
1782	if (!amdgpu_ras_save_bad_pages(adev, NULL))
1783	if (!__verify_ras_table_checksum(control)) {
1784	dev_info(adev->dev, "RAS table recovery succeed\n");
1785	return;
1786	}
1787	dev_err(adev->dev, "RAS table recovery failed\n");
1788	control->is_eeprom_valid = false;
1789	}
1790	return;
1791	}
1792
1793	static const struct ras_smu_drv amdgpu_ras_get_smu_ras_drv(struct* amdgpu_device *adev)
1794	{
1795	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1796
1797	if (!ras)
1798	return NULL;
1799
1800	return ras->ras_smu_drv;
1801	}
1802
1803	static uint64_t amdgpu_ras_smu_get_feature_flags(struct amdgpu_device *adev)
1804	{
1805	const struct ras_smu_drv *ras_smu_drv = amdgpu_ras_get_smu_ras_drv(adev);
1806	uint64_t flags = `0ULL`;
1807
1808	if (!ras_smu_drv)
1809	goto out;
1810
1811	if (ras_smu_drv->ras_smu_feature_flags)
1812	ras_smu_drv->ras_smu_feature_flags(adev, &flags);
1813
1814	out:
1815	return flags;
1816	}
1817
1818	bool amdgpu_ras_smu_eeprom_supported(struct amdgpu_device *adev)
1819	{
1820	const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1821	uint64_t flags = `0ULL`;
1822
1823	if (!__is_ras_eeprom_supported(adev) \|\| !smu_ras_drv)
1824	return false;
1825
1826	if (!smu_ras_drv->smu_eeprom_funcs)
1827	return false;
1828
1829	flags = amdgpu_ras_smu_get_feature_flags(adev);
1830
1831	return !!(flags & RAS_SMU_FEATURE_BIT__RAS_EEPROM);
1832	}
1833
1834	int amdgpu_ras_smu_get_table_version(struct amdgpu_device *adev,
1835	uint32_t *table_version)
1836	{
1837	const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1838
1839	if (!amdgpu_ras_smu_eeprom_supported(adev))
1840	return -EOPNOTSUPP;
1841
1842	if (smu_ras_drv->smu_eeprom_funcs->get_ras_table_version)
1843	return smu_ras_drv->smu_eeprom_funcs->get_ras_table_version(adev,
1844	table_version);
1845	return -EOPNOTSUPP;
1846	}
1847
1848	int amdgpu_ras_smu_get_badpage_count(struct amdgpu_device *adev,
1849	uint32_t *count, uint32_t timeout)
1850	{
1851	const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1852
1853	if (!amdgpu_ras_smu_eeprom_supported(adev))
1854	return -EOPNOTSUPP;
1855
1856	if (smu_ras_drv->smu_eeprom_funcs->get_badpage_count)
1857	return smu_ras_drv->smu_eeprom_funcs->get_badpage_count(adev,
1858	count, timeout);
1859	return -EOPNOTSUPP;
1860	}
1861
1862	int amdgpu_ras_smu_get_badpage_mca_addr(struct amdgpu_device *adev,
1863	uint16_t index, uint64_t *mca_addr)
1864	{
1865	const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1866
1867	if (!amdgpu_ras_smu_eeprom_supported(adev))
1868	return -EOPNOTSUPP;
1869
1870	if (smu_ras_drv->smu_eeprom_funcs->get_badpage_mca_addr)
1871	return smu_ras_drv->smu_eeprom_funcs->get_badpage_mca_addr(adev,
1872	index, mca_addr);
1873	return -EOPNOTSUPP;
1874	}
1875
1876	int amdgpu_ras_smu_set_timestamp(struct amdgpu_device *adev,
1877	uint64_t timestamp)
1878	{
1879	const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1880
1881	if (!amdgpu_ras_smu_eeprom_supported(adev))
1882	return -EOPNOTSUPP;
1883
1884	if (smu_ras_drv->smu_eeprom_funcs->set_timestamp)
1885	return smu_ras_drv->smu_eeprom_funcs->set_timestamp(adev,
1886	timestamp);
1887	return -EOPNOTSUPP;
1888	}
1889
1890	int amdgpu_ras_smu_get_timestamp(struct amdgpu_device *adev,
1891	uint16_t index, uint64_t *timestamp)
1892	{
1893	const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1894
1895	if (!amdgpu_ras_smu_eeprom_supported(adev))
1896	return -EOPNOTSUPP;
1897
1898	if (smu_ras_drv->smu_eeprom_funcs->get_timestamp)
1899	return smu_ras_drv->smu_eeprom_funcs->get_timestamp(adev,
1900	index, timestamp);
1901	return -EOPNOTSUPP;
1902	}
1903
1904	int amdgpu_ras_smu_get_badpage_ipid(struct amdgpu_device *adev,
1905	uint16_t index, uint64_t *ipid)
1906	{
1907	const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1908
1909	if (!amdgpu_ras_smu_eeprom_supported(adev))
1910	return -EOPNOTSUPP;
1911
1912	if (smu_ras_drv->smu_eeprom_funcs->get_badpage_ipid)
1913	return smu_ras_drv->smu_eeprom_funcs->get_badpage_ipid(adev,
1914	index, ipid);
1915	return -EOPNOTSUPP;
1916	}
1917
1918	int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev,
1919	uint32_t *result)
1920	{
1921	const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1922
1923	if (!amdgpu_ras_smu_eeprom_supported(adev))
1924	return -EOPNOTSUPP;
1925
1926	if (smu_ras_drv->smu_eeprom_funcs->erase_ras_table)
1927	return smu_ras_drv->smu_eeprom_funcs->erase_ras_table(adev,
1928	result);
1929	return -EOPNOTSUPP;
1930	}
1931

source code of linux/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c