1/*
2 * Copyright 2019 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 *
22 */
23
24#ifndef _AMDGPU_RAS_EEPROM_H
25#define _AMDGPU_RAS_EEPROM_H
26
27#include <linux/i2c.h>
28
/*
 * Version codes of the RAS EEPROM table.
 * NOTE(review): the values look like "major in bits 31:16, minor in
 * bits 15:12" (V2_1 == 0x00021000) -- confirm before relying on it.
 */
#define RAS_TABLE_VER_V1	0x00010000
#define RAS_TABLE_VER_V2_1	0x00021000
#define RAS_TABLE_VER_V3	0x00030000

/* Only used through pointers in this header, so a forward declaration is enough. */
struct amdgpu_device;
34
/*
 * GPU health states. Both values are assigned explicitly; the value 1
 * is not defined by this enum.
 */
enum amdgpu_ras_gpu_health_status {
	GPU_HEALTH_USABLE                = 0,
	GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
};
39
/*
 * Error type stored in a table record (see the err_type member of
 * struct eeprom_table_record). Values are consecutive starting at 0.
 */
enum amdgpu_ras_eeprom_err_type {
	AMDGPU_RAS_EEPROM_ERR_NA = 0,
	AMDGPU_RAS_EEPROM_ERR_RECOVERABLE,
	AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE,
	/* Keep last: equals the number of enumerators above it. */
	AMDGPU_RAS_EEPROM_ERR_COUNT,
};
46
/*
 * Header of the RAS EEPROM table: five consecutive 32-bit words,
 * declared packed.
 * NOTE(review): the per-field notes are read off the field names only;
 * confirm them against the code that fills in and validates the header.
 */
struct amdgpu_ras_eeprom_table_header {
	uint32_t header;		/* word 0: header identifier */
	uint32_t version;		/* word 1: presumably a RAS_TABLE_VER_* code */
	uint32_t first_rec_offset;	/* word 2: offset of the first record */
	uint32_t tbl_size;		/* word 3: table size */
	uint32_t checksum;		/* word 4: table checksum */
} __packed;
54
55struct amdgpu_ras_eeprom_table_ras_info {
56 u8 rma_status;
57 u8 health_percent;
58 u16 ecc_page_threshold;
59 u32 padding[64 - 1];
60} __packed;
61
62struct amdgpu_ras_eeprom_control {
63 struct amdgpu_ras_eeprom_table_header tbl_hdr;
64
65 struct amdgpu_ras_eeprom_table_ras_info tbl_rai;
66
67 /* Base I2C EEPPROM 19-bit memory address,
68 * where the table is located. For more information,
69 * see top of amdgpu_eeprom.c.
70 */
71 u32 i2c_address;
72
73 /* The byte offset off of @i2c_address
74 * where the table header is found,
75 * and where the records start--always
76 * right after the header.
77 */
78 u32 ras_header_offset;
79 u32 ras_info_offset;
80 u32 ras_record_offset;
81
82 /* Number of records in the table.
83 */
84 u32 ras_num_recs;
85 u32 ras_num_recs_old;
86
87 /* the bad page number is ras_num_recs or
88 * ras_num_recs * umc.retire_unit
89 */
90 u32 ras_num_bad_pages;
91
92 /* Number of records store mca address */
93 u32 ras_num_mca_recs;
94
95 /* Number of records store physical address */
96 u32 ras_num_pa_recs;
97
98 /* First record index to read, 0-based.
99 * Range is [0, num_recs-1]. This is
100 * an absolute index, starting right after
101 * the table header.
102 */
103 u32 ras_fri;
104
105 /* Maximum possible number of records
106 * we could store, i.e. the maximum capacity
107 * of the table.
108 */
109 u32 ras_max_record_count;
110
111 /* Protect table access via this mutex.
112 */
113 struct mutex ras_tbl_mutex;
114
115 /* Record channel info which occurred bad pages
116 */
117 u32 bad_channel_bitmap;
118
119 bool is_eeprom_valid;
120};
121
122/*
123 * Represents single table record. Packed to be easily serialized into byte
124 * stream.
125 */
126struct eeprom_table_record {
127
128 union {
129 uint64_t address;
130 uint64_t offset;
131 };
132
133 uint64_t retired_page;
134 uint64_t ts;
135
136 enum amdgpu_ras_eeprom_err_type err_type;
137
138 union {
139 unsigned char bank;
140 unsigned char cu;
141 };
142
143 unsigned char mem_channel;
144 unsigned char mcumc_id;
145} __packed;
146
147int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
148
149int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
150
151bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev);
152
153int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
154 struct eeprom_table_record *records, const u32 num);
155
156int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
157 struct eeprom_table_record *records, const u32 num);
158
159uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control);
160
161void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control);
162
163int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control);
164
165void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev);
166
167bool amdgpu_ras_smu_eeprom_supported(struct amdgpu_device *adev);
168
169int amdgpu_ras_smu_get_table_version(struct amdgpu_device *adev,
170 uint32_t *table_version);
171
172int amdgpu_ras_smu_get_badpage_count(struct amdgpu_device *adev,
173 uint32_t *count, uint32_t timeout);
174
175int amdgpu_ras_smu_get_badpage_mca_addr(struct amdgpu_device *adev,
176 uint16_t index, uint64_t *mca_addr);
177
178int amdgpu_ras_smu_set_timestamp(struct amdgpu_device *adev,
179 uint64_t timestamp);
180
181int amdgpu_ras_smu_get_timestamp(struct amdgpu_device *adev,
182 uint16_t index, uint64_t *timestamp);
183
184int amdgpu_ras_smu_get_badpage_ipid(struct amdgpu_device *adev,
185 uint16_t index, uint64_t *ipid);
186
187int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev,
188 uint32_t *result);
189
190int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
191 struct eeprom_table_record *record, u32 rec_idx,
192 const u32 num);
193
194int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control);
195
196extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
197extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
198
199#endif // _AMDGPU_RAS_EEPROM_H
200

/* Source: linux/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h */