/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "umc_v12_0.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu.h"
#include "umc/umc_12_0_0_offset.h"
#include "umc/umc_12_0_0_sh_mask.h"
#include "mp/mp_13_0_6_sh_mask.h"

#define MAX_ECC_NUM_PER_RETIREMENT 32
#define DELAYED_TIME_FOR_GPU_RESET 1000  //ms

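/*
 * Map a logical (node, umc, channel) triple to a register offset: the
 * (umc_inst, ch_inst) pair is flattened into a linear channel index and
 * re-split four channels per UMC instance for register addressing, then
 * the per-node and cross-node strides are added on top.
 */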
static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
					    uint32_t node_inst,
					    uint32_t umc_inst,
					    uint32_t ch_inst)
{
	uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst;
	uint64_t cross_node_offset = (node_inst == 0) ? 0 : UMC_V12_0_CROSS_NODE_OFFSET;

	umc_inst = index / 4;
	ch_inst = index % 4;

	return adev->umc.channel_offs * ch_inst + UMC_V12_0_INST_DIST * umc_inst +
		UMC_V12_0_NODE_DIST * node_inst + cross_node_offset;
}

static int umc_v12_0_reset_error_count_per_channel(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	uint64_t odecc_err_cnt_addr;
	uint64_t umc_reg_offset =
		get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);

	odecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt);

	/* clear error count */
	WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V12_0_CE_CNT_INIT);

	return 0;
}

static void umc_v12_0_reset_error_count(struct amdgpu_device *adev)
{
	amdgpu_umc_loop_channels(adev,
		umc_v12_0_reset_error_count_per_channel, NULL);
}

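/*
 * A deferred (poison) error: the bank is valid and either the Deferred or
 * the Poison bit is set. This only applies when the device supports RAS
 * poison mode; otherwise such a bank falls through to the UE/CE checks.
 */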
bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
{
	dev_dbg(adev->dev,
		"MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, PCC:%llu, UC:%llu, TCC:%llu\n",
		mc_umc_status,
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Poison),
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred),
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
	);

	return (amdgpu_ras_is_poison_mode_supported(adev) &&
		(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
		((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1) ||
		(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Poison) == 1)));
}

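/*
 * Note: both classifiers below reject deferred errors first, so a poisoned
 * bank is never double-counted as an uncorrectable or correctable error.
 */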
bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
{
	if (umc_v12_0_is_deferred_error(adev, mc_umc_status))
		return false;

	return ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
		(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1));
}

bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
{
	if (umc_v12_0_is_deferred_error(adev, mc_umc_status))
		return false;

	return (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
		(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1 ||
		(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 &&
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 0) ||
		/* Identify data parity error in replay mode */
		((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 0x5 ||
		REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 0xb) &&
		!(umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)))));
}

static void umc_v12_0_query_error_count_per_type(struct amdgpu_device *adev,
						uint64_t umc_reg_offset,
						unsigned long *error_count,
						check_error_type_func error_type_func)
{
	uint64_t mc_umc_status;
	uint64_t mc_umc_status_addr;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	/* Check MCUMC_STATUS */
	mc_umc_status =
		RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);

	if (error_type_func(adev, mc_umc_status))
		*error_count += 1;
}

static int umc_v12_0_query_error_count(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	struct ras_err_data *err_data = (struct ras_err_data *)data;
	unsigned long ue_count = 0, ce_count = 0, de_count = 0;

	/* NOTE: node_inst is converted by adev->umc.active_mask and the range is [0-3],
	 * which can be used as die ID directly */
	struct amdgpu_smuio_mcm_config_info mcm_info = {
		.socket_id = adev->smuio.funcs->get_socket_id(adev),
		.die_id = node_inst,
	};

	uint64_t umc_reg_offset =
		get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);

	umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
					&ce_count, umc_v12_0_is_correctable_error);
	umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
					&ue_count, umc_v12_0_is_uncorrectable_error);
	umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
					&de_count, umc_v12_0_is_deferred_error);

	amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
	amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
	amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, de_count);

	return 0;
}

static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev,
					void *ras_error_status)
{
	amdgpu_umc_loop_channels(adev,
		umc_v12_0_query_error_count, ras_error_status);

	umc_v12_0_reset_error_count(adev);
}

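/*
 * Cache the physical-address bits that are toggled when one bad row is
 * expanded into all of the pages retired with it. The bit positions depend
 * on the HBM generation and the current NPS partition mode; retire_unit is
 * the number of pages (1 << bit_num) covered by a single retirement.
 */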
static void umc_v12_0_get_retire_flip_bits(struct amdgpu_device *adev)
{
	enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
	uint32_t vram_type = adev->gmc.vram_type;
	struct amdgpu_umc_flip_bits *flip_bits = &(adev->umc.flip_bits);

	if (adev->gmc.gmc_funcs->query_mem_partition_mode)
		nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);

	/* default setting */
	flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_C2_BIT;
	flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_C3_BIT;
	flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_C4_BIT;
	flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R13_BIT;
	flip_bits->flip_row_bit = 13;
	flip_bits->bit_num = 4;
	flip_bits->r13_in_pa = UMC_V12_0_PA_R13_BIT;

	if (nps == AMDGPU_NPS2_PARTITION_MODE) {
		flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_CH5_BIT;
		flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_C2_BIT;
		flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_B1_BIT;
		flip_bits->r13_in_pa = UMC_V12_0_PA_R12_BIT;
	} else if (nps == AMDGPU_NPS4_PARTITION_MODE) {
		flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_CH4_BIT;
		flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_CH5_BIT;
		flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_B0_BIT;
		flip_bits->r13_in_pa = UMC_V12_0_PA_R11_BIT;
	}

	switch (vram_type) {
	case AMDGPU_VRAM_TYPE_HBM:
		/* other nps modes are taken as nps1 */
		if (nps == AMDGPU_NPS2_PARTITION_MODE)
			flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R12_BIT;
		else if (nps == AMDGPU_NPS4_PARTITION_MODE)
			flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R11_BIT;

		break;
	case AMDGPU_VRAM_TYPE_HBM3E:
		flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R12_BIT;
		flip_bits->flip_row_bit = 12;

		if (nps == AMDGPU_NPS2_PARTITION_MODE)
			flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R11_BIT;
		else if (nps == AMDGPU_NPS4_PARTITION_MODE)
			flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R10_BIT;

		break;
	default:
		dev_warn(adev->dev,
			"Unknown HBM type, set RAS retire flip bits to the value in NPS1 mode.\n");
		break;
	}

	adev->umc.retire_unit = 0x1 << flip_bits->bit_num;
}

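/*
 * Translate an MCA error address into a SoC physical address through the
 * PSP RAS TA, then walk every combination of the retire flip bits to emit
 * all pages that are retired together. When addr_in is NULL, the caller
 * must supply an already-converted physical address via addr_out and only
 * the flip-bit expansion is performed.
 */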
static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
					struct ras_err_data *err_data,
					struct ta_ras_query_address_input *addr_in,
					struct ta_ras_query_address_output *addr_out,
					bool dump_addr)
{
	uint32_t col, col_lower, row, row_lower, row_high, bank;
	uint32_t channel_index = 0, umc_inst = 0;
	uint32_t i, bit_num, retire_unit, *flip_bits;
	uint64_t soc_pa, column, err_addr;
	struct ta_ras_query_address_output addr_out_tmp;
	struct ta_ras_query_address_output *paddr_out;
	int ret = 0;

	if (!addr_out)
		paddr_out = &addr_out_tmp;
	else
		paddr_out = addr_out;

	err_addr = bank = 0;
	if (addr_in) {
		err_addr = addr_in->ma.err_addr;
		addr_in->addr_type = TA_RAS_MCA_TO_PA;
		ret = psp_ras_query_address(&adev->psp, addr_in, paddr_out);
		if (ret) {
			dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx",
				err_addr);

			goto out;
		}

		bank = paddr_out->pa.bank;
		/* no need to care about umc inst if addr_in is NULL */
		umc_inst = addr_in->ma.umc_inst;
	}

	flip_bits = adev->umc.flip_bits.flip_bits_in_pa;
	bit_num = adev->umc.flip_bits.bit_num;
	retire_unit = adev->umc.retire_unit;

	soc_pa = paddr_out->pa.pa;
	channel_index = paddr_out->pa.channel_idx;
	/* clear loop bits in soc physical address */
	for (i = 0; i < bit_num; i++)
		soc_pa &= ~BIT_ULL(flip_bits[i]);

	paddr_out->pa.pa = soc_pa;
	/* get column bit 0 and 1 in mca address */
	col_lower = (err_addr >> 1) & 0x3ULL;
	/* extra row bit will be handled later */
	row_lower = (err_addr >> UMC_V12_0_MA_R0_BIT) & 0x1fffULL;
	row_lower &= ~BIT_ULL(adev->umc.flip_bits.flip_row_bit);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 5, 0)) {
		row_high = (soc_pa >> adev->umc.flip_bits.r13_in_pa) & 0x3ULL;
		/* it's 2.25GB in each channel, from MCA address to PA
		 * [R14 R13] is converted if the two-bit value is 0x3,
		 * get them from PA instead of MCA address.
		 */
		row_lower |= (row_high << 13);
	}

	if (!err_data && !dump_addr)
		goto out;

	/* loop for all possibilities of retired bits */
	for (column = 0; column < retire_unit; column++) {
		soc_pa = paddr_out->pa.pa;
		for (i = 0; i < bit_num; i++)
			soc_pa |= (((column >> i) & 0x1ULL) << flip_bits[i]);

		col = ((column & 0x7) << 2) | col_lower;
		/* handle extra row bit */
		if (bit_num == RETIRE_FLIP_BITS_NUM)
			row = ((column >> 3) << adev->umc.flip_bits.flip_row_bit) |
			      row_lower;

		if (dump_addr)
			dev_info(adev->dev,
				"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
				soc_pa, row, col, bank, channel_index);

		if (err_data)
			amdgpu_umc_fill_error_record(err_data, err_addr,
				soc_pa, channel_index, umc_inst);
	}

out:
	return ret;
}

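/*
 * Per-channel callback of the error-address loop: if an uncorrectable or
 * deferred error is latched in MCUMC_STATUST0, decode MCUMC_ADDRT0 and
 * convert it into retired pages. The status register is cleared on exit so
 * that the next query starts from a clean state.
 */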
static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	struct ras_err_data *err_data = (struct ras_err_data *)data;
	struct ta_ras_query_address_input addr_in;
	uint64_t mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr;
	uint64_t mc_umc_addrt0;
	uint64_t umc_reg_offset =
		get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	mc_umc_status = RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);

	if (mc_umc_status == 0)
		return 0;

	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);

		return 0;
	}

	/* calculate error address if ue error is detected */
	if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
	    umc_v12_0_is_deferred_error(adev, mc_umc_status)) {
		mc_umc_addrt0 =
			SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

		err_addr = RREG64_PCIE_EXT((mc_umc_addrt0 + umc_reg_offset) * 4);

		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		if (!adev->aid_mask &&
		    adev->smuio.funcs &&
		    adev->smuio.funcs->get_socket_id)
			addr_in.ma.socket_id = adev->smuio.funcs->get_socket_id(adev);
		else
			addr_in.ma.socket_id = 0;

		addr_in.ma.err_addr = err_addr;
		addr_in.ma.ch_inst = ch_inst;
		addr_in.ma.umc_inst = umc_inst;
		addr_in.ma.node_inst = node_inst;

		umc_v12_0_convert_error_address(adev, err_data, &addr_in, NULL, true);
	}

	/* clear umc status */
	WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);

	return 0;
}

static void umc_v12_0_query_ras_error_address(struct amdgpu_device *adev,
					void *ras_error_status)
{
	amdgpu_umc_loop_channels(adev,
		umc_v12_0_query_error_address, ras_error_status);
}

static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
					uint32_t node_inst, uint32_t umc_inst,
					uint32_t ch_inst, void *data)
{
	uint32_t odecc_cnt_sel;
	uint64_t odecc_cnt_sel_addr, odecc_err_cnt_addr;
	uint64_t umc_reg_offset =
		get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);

	odecc_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccCntSel);
	odecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt);

	odecc_cnt_sel = RREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4);

	/* set ce error interrupt type to APIC based interrupt */
	odecc_cnt_sel = REG_SET_FIELD(odecc_cnt_sel, UMCCH0_OdEccCntSel,
					OdEccErrInt, 0x1);
	WREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4, odecc_cnt_sel);

	/* set error count to initial value */
	WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V12_0_CE_CNT_INIT);

	return 0;
}

static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
			enum amdgpu_mca_error_type type, void *ras_error_status)
{
	uint64_t mc_umc_status = *(uint64_t *)ras_error_status;

	switch (type) {
	case AMDGPU_MCA_ERROR_TYPE_UE:
		return umc_v12_0_is_uncorrectable_error(adev, mc_umc_status);
	case AMDGPU_MCA_ERROR_TYPE_CE:
		return umc_v12_0_is_correctable_error(adev, mc_umc_status);
	case AMDGPU_MCA_ERROR_TYPE_DE:
		return umc_v12_0_is_deferred_error(adev, mc_umc_status);
	default:
		return false;
	}
}

static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev)
{
	amdgpu_umc_loop_channels(adev,
		umc_v12_0_err_cnt_init_per_channel, NULL);
}

static bool umc_v12_0_query_ras_poison_mode(struct amdgpu_device *adev)
{
	/*
	 * Force return true, because regUMCCH0_EccCtrl
	 * is not accessible from host side
	 */
	return true;
}

const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = {
	.query_ras_error_count = umc_v12_0_query_ras_error_count,
	.query_ras_error_address = umc_v12_0_query_ras_error_address,
};

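/*
 * ACA bank parser: classify the bank by its status register, mirror the
 * ECC status into the UMC bookkeeping, and log the error count. A deferred
 * error with extended error code 0 reports one error per retirement unit
 * of recorded bad addresses; otherwise the MISC0 counter (or 1) is used.
 */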
static int umc_v12_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
				     enum aca_smu_type type, void *data)
{
	struct amdgpu_device *adev = handle->adev;
	struct aca_bank_info info;
	enum aca_error_type err_type;
	u64 status, count;
	u32 ext_error_code;
	int ret;

	status = bank->regs[ACA_REG_IDX_STATUS];
	if (umc_v12_0_is_deferred_error(adev, status))
		err_type = ACA_ERROR_TYPE_DEFERRED;
	else if (umc_v12_0_is_uncorrectable_error(adev, status))
		err_type = ACA_ERROR_TYPE_UE;
	else if (umc_v12_0_is_correctable_error(adev, status))
		err_type = ACA_ERROR_TYPE_CE;
	else
		return 0;
	bank->aca_err_type = err_type;

	ret = aca_bank_info_decode(bank, &info);
	if (ret)
		return ret;

	amdgpu_umc_update_ecc_status(adev,
		bank->regs[ACA_REG_IDX_STATUS],
		bank->regs[ACA_REG_IDX_IPID],
		bank->regs[ACA_REG_IDX_ADDR]);

	ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);
	if (umc_v12_0_is_deferred_error(adev, status))
		count = ext_error_code == 0 ?
			adev->umc.err_addr_cnt / adev->umc.retire_unit : 1ULL;
	else
		count = ext_error_code == 0 ?
			ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]) : 1ULL;

	return aca_error_cache_log_bank_error(handle, &info, err_type, count);
}

static const struct aca_bank_ops umc_v12_0_aca_bank_ops = {
	.aca_bank_parser = umc_v12_0_aca_bank_parser,
};

const struct aca_info umc_v12_0_aca_info = {
	.hwip = ACA_HWIP_TYPE_UMC,
	.mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK | ACA_ERROR_DEFERRED_MASK,
	.bank_ops = &umc_v12_0_aca_bank_ops,
};

static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
	int ret;

	ret = amdgpu_umc_ras_late_init(adev, ras_block);
	if (ret)
		return ret;

	ret = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__UMC,
				  &umc_v12_0_aca_info, NULL);
	if (ret)
		return ret;

	return 0;
}

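/*
 * Record a deferred error reported via the SMU: check that the bank really
 * decodes to a UMC instance, convert the MCA address to a physical address,
 * log it in the deferred-error radix tree and reserve every page of the
 * retirement unit. A duplicate address (-EEXIST) only bumps the queried
 * counter.
 */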
static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
			uint64_t status, uint64_t ipid, uint64_t addr)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	uint16_t hwid, mcatype;
	uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
	uint64_t err_addr, pa_addr = 0;
	struct ras_ecc_err *ecc_err;
	struct ta_ras_query_address_output addr_out;
	uint32_t shift_bit = adev->umc.flip_bits.flip_bits_in_pa[2];
	int count, ret, i;

	hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
	mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);

	/* A bank that does not decode to UMC is a poison consumption
	 * event reported by the SMU for another IP block.
	 */
	if (hwid != MCA_UMC_HWID_V12_0 || mcatype != MCA_UMC_MCATYPE_V12_0) {
		con->umc_ecc_log.consumption_q_count++;
		return 0;
	}

	if (!status)
		return 0;

	if (!umc_v12_0_is_deferred_error(adev, status))
		return 0;

	err_addr = REG_GET_FIELD(addr,
			MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

	dev_dbg(adev->dev,
		"UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, err_addr:0x%llx\n",
		ipid,
		MCA_IPID_2_SOCKET_ID(ipid),
		MCA_IPID_2_DIE_ID(ipid),
		MCA_IPID_2_UMC_INST(ipid),
		MCA_IPID_2_UMC_CH(ipid),
		err_addr);

	ret = amdgpu_umc_mca_to_addr(adev,
			err_addr, MCA_IPID_2_UMC_CH(ipid),
			MCA_IPID_2_UMC_INST(ipid), MCA_IPID_2_DIE_ID(ipid),
			MCA_IPID_2_SOCKET_ID(ipid), &addr_out, true);
	if (ret)
		return ret;

	ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL);
	if (!ecc_err)
		return -ENOMEM;

	pa_addr = addr_out.pa.pa;
	ecc_err->status = status;
	ecc_err->ipid = ipid;
	ecc_err->addr = addr;
	ecc_err->pa_pfn = pa_addr >> AMDGPU_GPU_PAGE_SHIFT;
	ecc_err->channel_idx = addr_out.pa.channel_idx;

	/* If converted pa_pfn is 0, use pa C4 pfn. */
	if (!ecc_err->pa_pfn)
		ecc_err->pa_pfn = BIT_ULL(shift_bit) >> AMDGPU_GPU_PAGE_SHIFT;

	ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
	if (ret) {
		if (ret == -EEXIST)
			con->umc_ecc_log.de_queried_count++;
		else
			dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);

		kfree(ecc_err);
		return ret;
	}

	con->umc_ecc_log.de_queried_count++;

	memset(page_pfn, 0, sizeof(page_pfn));
	count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
			pa_addr,
			page_pfn, ARRAY_SIZE(page_pfn));
	if (count <= 0) {
		dev_warn(adev->dev, "Fail to convert error address! count:%d\n", count);
		return 0;
	}

	/* Reserve memory */
	for (i = 0; i < count; i++)
		amdgpu_ras_reserve_page(adev, page_pfn[i]);

	/* The problem case is as follows:
	 * 1. GPU A triggers a gpu ras reset, and GPU A drives
	 *    GPU B to also perform a gpu ras reset.
	 * 2. After gpu B ras reset started, gpu B queried a DE
	 *    data. Since the DE data was queried in the ras reset
	 *    thread instead of the page retirement thread, bad
	 *    page retirement work would not be triggered. Then
	 *    even if all gpu resets are completed, the bad pages
	 *    will be cached in RAM until GPU B's bad page retirement
	 *    work is triggered again and then saved to eeprom.
	 * Trigger delayed work to save the bad pages to eeprom in time
	 * after gpu ras reset is completed.
	 */
	if (amdgpu_ras_in_recovery(adev))
		schedule_delayed_work(&con->page_retirement_dwork,
				msecs_to_jiffies(DELAYED_TIME_FOR_GPU_RESET));

	return 0;
}

static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
				struct ras_ecc_err *ecc_err, void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
	int ret = 0, i, count;

	if (!err_data || !ecc_err)
		return -EINVAL;

	memset(page_pfn, 0, sizeof(page_pfn));
	count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
			ecc_err->pa_pfn << AMDGPU_GPU_PAGE_SHIFT,
			page_pfn, ARRAY_SIZE(page_pfn));

	for (i = 0; i < count; i++) {
		ret = amdgpu_umc_fill_error_record(err_data,
				ecc_err->addr,
				page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
				ecc_err->channel_idx,
				MCA_IPID_2_UMC_INST(ecc_err->ipid));
		if (ret)
			break;
	}

	err_data->de_count++;

	return ret;
}

static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
					void *ras_error_status)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
	struct radix_tree_root *ecc_tree;
	int new_detected, ret, i;

	ecc_tree = &con->umc_ecc_log.de_page_tree;

	mutex_lock(&con->umc_ecc_log.lock);
	new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
			0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);
	for (i = 0; i < new_detected; i++) {
		if (!entries[i])
			continue;

		ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status);
		if (ret) {
			dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret);
			break;
		}
		radix_tree_tag_clear(ecc_tree,
				entries[i]->pa_pfn, UMC_ECC_NEW_DETECTED_TAG);
	}
	mutex_unlock(&con->umc_ecc_log.lock);
}

static uint32_t umc_v12_0_get_die_id(struct amdgpu_device *adev,
		uint64_t mca_addr, uint64_t retired_page)
{
	uint32_t die = 0;

	/* we only calculate die id for nps1 mode right now */
	die += ((((retired_page >> 12) & 0x1ULL) ^
		((retired_page >> 20) & 0x1ULL) ^
		((retired_page >> 27) & 0x1ULL) ^
		((retired_page >> 34) & 0x1ULL) ^
		((retired_page >> 41) & 0x1ULL)) << 0);

	/* the original PA_C4 and PA_R13 may be cleared in retired_page, so
	 * get them from mca_addr.
	 */
	die += ((((retired_page >> 13) & 0x1ULL) ^
		((mca_addr >> 5) & 0x1ULL) ^
		((retired_page >> 28) & 0x1ULL) ^
		((mca_addr >> 23) & 0x1ULL) ^
		((retired_page >> 42) & 0x1ULL)) << 1);
	die &= 3;

	return die;
}

static void umc_v12_0_mca_ipid_parse(struct amdgpu_device *adev, uint64_t ipid,
		uint32_t *did, uint32_t *ch, uint32_t *umc_inst, uint32_t *sid)
{
	if (did)
		*did = MCA_IPID_2_DIE_ID(ipid);
	if (ch)
		*ch = MCA_IPID_2_UMC_CH(ipid);
	if (umc_inst)
		*umc_inst = MCA_IPID_2_UMC_INST(ipid);
	if (sid)
		*sid = MCA_IPID_2_SOCKET_ID(ipid);
}

struct amdgpu_umc_ras umc_v12_0_ras = {
	.ras_block = {
		.hw_ops = &umc_v12_0_ras_hw_ops,
		.ras_late_init = umc_v12_0_ras_late_init,
	},
	.err_cnt_init = umc_v12_0_err_cnt_init,
	.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
	.ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr,
	.check_ecc_err_status = umc_v12_0_check_ecc_err_status,
	.update_ecc_status = umc_v12_0_update_ecc_status,
	.convert_ras_err_addr = umc_v12_0_convert_error_address,
	.get_die_id_from_pa = umc_v12_0_get_die_id,
	.get_retire_flip_bits = umc_v12_0_get_retire_flip_bits,
	.mca_ipid_parse = umc_v12_0_mca_ipid_parse,
};