| 1 | // SPDX-License-Identifier: MIT |
| 2 | /* |
| 3 | * Copyright 2025 Advanced Micro Devices, Inc. |
| 4 | * |
| 5 | * Permission is hereby granted, free of charge, to any person obtaining a |
| 6 | * copy of this software and associated documentation files (the "Software"), |
| 7 | * to deal in the Software without restriction, including without limitation |
| 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| 9 | * and/or sell copies of the Software, and to permit persons to whom the |
| 10 | * Software is furnished to do so, subject to the following conditions: |
| 11 | * |
| 12 | * The above copyright notice and this permission notice shall be included in |
| 13 | * all copies or substantial portions of the Software. |
| 14 | * |
| 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| 18 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
| 19 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
| 20 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
| 21 | * OTHER DEALINGS IN THE SOFTWARE. |
| 22 | * |
| 23 | */ |
| 24 | #include "ras.h" |
| 25 | #include "ras_aca.h" |
| 26 | #include "ras_core_status.h" |
| 27 | #include "ras_aca_v1_0.h" |
| 28 | |
/*
 * Identifier pair used to recognise which hardware IP reported an ACA bank.
 * Both members are compared against fields extracted from the bank's IPID
 * register in aca_check_bank_hwip().
 */
struct ras_aca_hwip {
	int hwid;	/* matched against ACA_REG_IPID_HARDWAREID() */
	int mcatype;	/* matched against ACA_REG_IPID_MCATYPE() */
};
| 33 | |
| 34 | static struct ras_aca_hwip aca_hwid_mcatypes[ACA_ECC_HWIP_COUNT] = { |
| 35 | [ACA_ECC_HWIP__SMU] = {0x01, 0x01}, |
| 36 | [ACA_ECC_HWIP__PCS_XGMI] = {0x50, 0x00}, |
| 37 | [ACA_ECC_HWIP__UMC] = {0x96, 0x00}, |
| 38 | }; |
| 39 | |
| 40 | static int aca_decode_bank_info(struct aca_block *aca_blk, |
| 41 | struct aca_bank_reg *bank, struct aca_ecc_info *info) |
| 42 | { |
| 43 | u64 ipid; |
| 44 | u32 instidhi, instidlo; |
| 45 | |
| 46 | ipid = bank->regs[ACA_REG_IDX__IPID]; |
| 47 | info->hwid = ACA_REG_IPID_HARDWAREID(ipid); |
| 48 | info->mcatype = ACA_REG_IPID_MCATYPE(ipid); |
| 49 | /* |
| 50 | * Unified DieID Format: SAASS. A:AID, S:Socket. |
| 51 | * Unified DieID[4:4] = InstanceId[0:0] |
| 52 | * Unified DieID[0:3] = InstanceIdHi[0:3] |
| 53 | */ |
| 54 | instidhi = ACA_REG_IPID_INSTANCEIDHI(ipid); |
| 55 | instidlo = ACA_REG_IPID_INSTANCEIDLO(ipid); |
| 56 | info->die_id = ((instidhi >> 2) & 0x03); |
| 57 | info->socket_id = ((instidlo & 0x1) << 2) | (instidhi & 0x03); |
| 58 | |
| 59 | if ((aca_blk->blk_info->hwip == ACA_ECC_HWIP__SMU) && |
| 60 | (aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__GFX)) |
| 61 | info->xcd_id = |
| 62 | ((instidlo & GENMASK_ULL(31, 1)) == mmSMNAID_XCD0_MCA_SMU) ? 0 : 1; |
| 63 | |
| 64 | return 0; |
| 65 | } |
| 66 | |
| 67 | static bool aca_check_bank_hwip(struct aca_bank_reg *bank, enum aca_ecc_hwip type) |
| 68 | { |
| 69 | struct ras_aca_hwip *hwip; |
| 70 | int hwid, mcatype; |
| 71 | u64 ipid; |
| 72 | |
| 73 | if (!bank || (type == ACA_ECC_HWIP__UNKNOWN)) |
| 74 | return false; |
| 75 | |
| 76 | hwip = &aca_hwid_mcatypes[type]; |
| 77 | if (!hwip->hwid) |
| 78 | return false; |
| 79 | |
| 80 | ipid = bank->regs[ACA_REG_IDX__IPID]; |
| 81 | hwid = ACA_REG_IPID_HARDWAREID(ipid); |
| 82 | mcatype = ACA_REG_IPID_MCATYPE(ipid); |
| 83 | |
| 84 | return hwip->hwid == hwid && hwip->mcatype == mcatype; |
| 85 | } |
| 86 | |
| 87 | static bool aca_match_bank_default(struct aca_block *aca_blk, void *data) |
| 88 | { |
| 89 | return aca_check_bank_hwip(bank: (struct aca_bank_reg *)data, type: aca_blk->blk_info->hwip); |
| 90 | } |
| 91 | |
| 92 | static bool aca_match_gfx_bank(struct aca_block *aca_blk, void *data) |
| 93 | { |
| 94 | struct aca_bank_reg *bank = (struct aca_bank_reg *)data; |
| 95 | u32 instlo; |
| 96 | |
| 97 | if (!aca_check_bank_hwip(bank, type: aca_blk->blk_info->hwip)) |
| 98 | return false; |
| 99 | |
| 100 | instlo = ACA_REG_IPID_INSTANCEIDLO(bank->regs[ACA_REG_IDX__IPID]); |
| 101 | instlo &= GENMASK_ULL(31, 1); |
| 102 | switch (instlo) { |
| 103 | case mmSMNAID_XCD0_MCA_SMU: |
| 104 | case mmSMNAID_XCD1_MCA_SMU: |
| 105 | case mmSMNXCD_XCD0_MCA_SMU: |
| 106 | return true; |
| 107 | default: |
| 108 | break; |
| 109 | } |
| 110 | |
| 111 | return false; |
| 112 | } |
| 113 | |
| 114 | static bool aca_match_sdma_bank(struct aca_block *aca_blk, void *data) |
| 115 | { |
| 116 | struct aca_bank_reg *bank = (struct aca_bank_reg *)data; |
| 117 | /* CODE_SDMA0 - CODE_SDMA4, reference to smu driver if header file */ |
| 118 | static int sdma_err_codes[] = { 33, 34, 35, 36 }; |
| 119 | u32 instlo; |
| 120 | int errcode, i; |
| 121 | |
| 122 | if (!aca_check_bank_hwip(bank, type: aca_blk->blk_info->hwip)) |
| 123 | return false; |
| 124 | |
| 125 | instlo = ACA_REG_IPID_INSTANCEIDLO(bank->regs[ACA_REG_IDX__IPID]); |
| 126 | instlo &= GENMASK_ULL(31, 1); |
| 127 | if (instlo != mmSMNAID_AID0_MCA_SMU) |
| 128 | return false; |
| 129 | |
| 130 | errcode = ACA_REG_SYND_ERRORINFORMATION(bank->regs[ACA_REG_IDX__SYND]); |
| 131 | errcode &= 0xff; |
| 132 | |
| 133 | /* Check SDMA error codes */ |
| 134 | for (i = 0; i < ARRAY_SIZE(sdma_err_codes); i++) { |
| 135 | if (errcode == sdma_err_codes[i]) |
| 136 | return true; |
| 137 | } |
| 138 | |
| 139 | return false; |
| 140 | } |
| 141 | |
| 142 | static bool aca_match_mmhub_bank(struct aca_block *aca_blk, void *data) |
| 143 | { |
| 144 | struct aca_bank_reg *bank = (struct aca_bank_reg *)data; |
| 145 | /* reference to smu driver if header file */ |
| 146 | const int mmhub_err_codes[] = { |
| 147 | 0, 1, 2, 3, 4, /* CODE_DAGB0 - 4 */ |
| 148 | 5, 6, 7, 8, 9, /* CODE_EA0 - 4 */ |
| 149 | 10, /* CODE_UTCL2_ROUTER */ |
| 150 | 11, /* CODE_VML2 */ |
| 151 | 12, /* CODE_VML2_WALKER */ |
| 152 | 13, /* CODE_MMCANE */ |
| 153 | }; |
| 154 | u32 instlo; |
| 155 | int errcode, i; |
| 156 | |
| 157 | if (!aca_check_bank_hwip(bank, type: aca_blk->blk_info->hwip)) |
| 158 | return false; |
| 159 | |
| 160 | instlo = ACA_REG_IPID_INSTANCEIDLO(bank->regs[ACA_REG_IDX__IPID]); |
| 161 | instlo &= GENMASK_ULL(31, 1); |
| 162 | if (instlo != mmSMNAID_AID0_MCA_SMU) |
| 163 | return false; |
| 164 | |
| 165 | errcode = ACA_REG_SYND_ERRORINFORMATION(bank->regs[ACA_REG_IDX__SYND]); |
| 166 | errcode &= 0xff; |
| 167 | |
| 168 | /* Check MMHUB error codes */ |
| 169 | for (i = 0; i < ARRAY_SIZE(mmhub_err_codes); i++) { |
| 170 | if (errcode == mmhub_err_codes[i]) |
| 171 | return true; |
| 172 | } |
| 173 | |
| 174 | return false; |
| 175 | } |
| 176 | |
| 177 | static bool aca_check_umc_de(struct ras_core_context *ras_core, uint64_t mc_umc_status) |
| 178 | { |
| 179 | return (ras_core->poison_supported && |
| 180 | ACA_REG_STATUS_VAL(mc_umc_status) && |
| 181 | ACA_REG_STATUS_DEFERRED(mc_umc_status)); |
| 182 | } |
| 183 | |
/*
 * Report whether a UMC STATUS value describes an uncorrectable error.
 *
 * Deferred errors (see aca_check_umc_de()) are never reported as
 * uncorrectable. Otherwise the status must have Val set together with at
 * least one of PCC, UC or TCC.
 */
static bool aca_check_umc_ue(struct ras_core_context *ras_core, uint64_t mc_umc_status)
{
	if (aca_check_umc_de(ras_core, mc_umc_status))
		return false;

	if (!ACA_REG_STATUS_VAL(mc_umc_status))
		return false;

	return ACA_REG_STATUS_PCC(mc_umc_status) ||
	       ACA_REG_STATUS_UC(mc_umc_status) ||
	       ACA_REG_STATUS_TCC(mc_umc_status);
}
| 194 | |
/*
 * Report whether a UMC STATUS value describes a correctable error.
 *
 * Deferred errors (see aca_check_umc_de()) are never reported as
 * correctable, and Val must be set. Given that, the status is correctable
 * when any of these holds:
 *  - CECC is set;
 *  - UECC is set while UC is clear;
 *  - ErrorCodeExt is 0x5 or 0xb and aca_check_umc_ue() rejects the status.
 */
static bool aca_check_umc_ce(struct ras_core_context *ras_core, uint64_t mc_umc_status)
{
	uint64_t ext_code;

	if (aca_check_umc_de(ras_core, mc_umc_status))
		return false;

	if (!ACA_REG_STATUS_VAL(mc_umc_status))
		return false;

	if (ACA_REG_STATUS_CECC(mc_umc_status))
		return true;

	if (ACA_REG_STATUS_UECC(mc_umc_status) &&
	    ACA_REG_STATUS_UC(mc_umc_status) == 0)
		return true;

	/* Identify data parity error in replay mode */
	ext_code = ACA_REG_STATUS_ERRORCODEEXT(mc_umc_status);
	if (ext_code != 0x5 && ext_code != 0xb)
		return false;

	return !aca_check_umc_ue(ras_core, mc_umc_status);
}
| 209 | |
| 210 | static int aca_parse_umc_bank(struct ras_core_context *ras_core, |
| 211 | struct aca_block *ras_blk, void *data, void *buf) |
| 212 | { |
| 213 | struct aca_bank_reg *bank = (struct aca_bank_reg *)data; |
| 214 | struct aca_bank_ecc *ecc = (struct aca_bank_ecc *)buf; |
| 215 | struct aca_ecc_info bank_info; |
| 216 | uint32_t ext_error_code; |
| 217 | uint64_t status0; |
| 218 | |
| 219 | status0 = bank->regs[ACA_REG_IDX__STATUS]; |
| 220 | if (!ACA_REG_STATUS_VAL(status0)) |
| 221 | return 0; |
| 222 | |
| 223 | memset(&bank_info, 0, sizeof(bank_info)); |
| 224 | aca_decode_bank_info(aca_blk: ras_blk, bank, info: &bank_info); |
| 225 | memcpy(&ecc->bank_info, &bank_info, sizeof(bank_info)); |
| 226 | ecc->bank_info.status = bank->regs[ACA_REG_IDX__STATUS]; |
| 227 | ecc->bank_info.ipid = bank->regs[ACA_REG_IDX__IPID]; |
| 228 | ecc->bank_info.addr = bank->regs[ACA_REG_IDX__ADDR]; |
| 229 | |
| 230 | ext_error_code = ACA_REG_STATUS_ERRORCODEEXT(status0); |
| 231 | |
| 232 | if (aca_check_umc_de(ras_core, mc_umc_status: status0)) |
| 233 | ecc->de_count = 1; |
| 234 | else if (aca_check_umc_ue(ras_core, mc_umc_status: status0)) |
| 235 | ecc->ue_count = ext_error_code ? |
| 236 | 1 : ACA_REG_MISC0_ERRCNT(bank->regs[ACA_REG_IDX__MISC0]); |
| 237 | else if (aca_check_umc_ce(ras_core, mc_umc_status: status0)) |
| 238 | ecc->ce_count = ext_error_code ? |
| 239 | 1 : ACA_REG_MISC0_ERRCNT(bank->regs[ACA_REG_IDX__MISC0]); |
| 240 | |
| 241 | return 0; |
| 242 | } |
| 243 | |
/*
 * Report whether @status has either its Poison or its Deferred field set.
 * @ras_core is accepted but not consulted.
 */
static bool aca_check_bank_is_de(struct ras_core_context *ras_core,
				 uint64_t status)
{
	if (ACA_REG_STATUS_POISON(status))
		return true;

	return ACA_REG_STATUS_DEFERRED(status) != 0;
}
| 250 | |
| 251 | static int aca_parse_bank_default(struct ras_core_context *ras_core, |
| 252 | struct aca_block *ras_blk, |
| 253 | void *data, void *buf) |
| 254 | { |
| 255 | struct aca_bank_reg *bank = (struct aca_bank_reg *)data; |
| 256 | struct aca_bank_ecc *ecc = (struct aca_bank_ecc *)buf; |
| 257 | struct aca_ecc_info bank_info; |
| 258 | u64 misc0 = bank->regs[ACA_REG_IDX__MISC0]; |
| 259 | u64 status = bank->regs[ACA_REG_IDX__STATUS]; |
| 260 | |
| 261 | memset(&bank_info, 0, sizeof(bank_info)); |
| 262 | aca_decode_bank_info(aca_blk: ras_blk, bank, info: &bank_info); |
| 263 | memcpy(&ecc->bank_info, &bank_info, sizeof(bank_info)); |
| 264 | ecc->bank_info.status = status; |
| 265 | ecc->bank_info.ipid = bank->regs[ACA_REG_IDX__IPID]; |
| 266 | ecc->bank_info.addr = bank->regs[ACA_REG_IDX__ADDR]; |
| 267 | |
| 268 | if (aca_check_bank_is_de(ras_core, status)) { |
| 269 | ecc->de_count = 1; |
| 270 | } else { |
| 271 | if (bank->ecc_type == RAS_ERR_TYPE__UE) |
| 272 | ecc->ue_count = 1; |
| 273 | else if (bank->ecc_type == RAS_ERR_TYPE__CE) |
| 274 | ecc->ce_count = ACA_REG_MISC0_ERRCNT(misc0); |
| 275 | } |
| 276 | |
| 277 | return 0; |
| 278 | } |
| 279 | |
| 280 | static int aca_parse_xgmi_bank(struct ras_core_context *ras_core, |
| 281 | struct aca_block *ras_blk, |
| 282 | void *data, void *buf) |
| 283 | { |
| 284 | struct aca_bank_reg *bank = (struct aca_bank_reg *)data; |
| 285 | struct aca_bank_ecc *ecc = (struct aca_bank_ecc *)buf; |
| 286 | struct aca_ecc_info bank_info; |
| 287 | u64 status, count; |
| 288 | int ext_error_code; |
| 289 | |
| 290 | memset(&bank_info, 0, sizeof(bank_info)); |
| 291 | aca_decode_bank_info(aca_blk: ras_blk, bank, info: &bank_info); |
| 292 | memcpy(&ecc->bank_info, &bank_info, sizeof(bank_info)); |
| 293 | ecc->bank_info.status = bank->regs[ACA_REG_IDX__STATUS]; |
| 294 | ecc->bank_info.ipid = bank->regs[ACA_REG_IDX__IPID]; |
| 295 | ecc->bank_info.addr = bank->regs[ACA_REG_IDX__ADDR]; |
| 296 | |
| 297 | status = bank->regs[ACA_REG_IDX__STATUS]; |
| 298 | ext_error_code = ACA_REG_STATUS_ERRORCODEEXT(status); |
| 299 | |
| 300 | count = ACA_REG_MISC0_ERRCNT(bank->regs[ACA_REG_IDX__MISC0]); |
| 301 | if (bank->ecc_type == RAS_ERR_TYPE__UE) { |
| 302 | if (ext_error_code != 0 && ext_error_code != 9) |
| 303 | count = 0ULL; |
| 304 | ecc->ue_count = count; |
| 305 | } else if (bank->ecc_type == RAS_ERR_TYPE__CE) { |
| 306 | count = ext_error_code == 6 ? count : 0ULL; |
| 307 | ecc->ce_count = count; |
| 308 | } |
| 309 | |
| 310 | return 0; |
| 311 | } |
| 312 | |
| 313 | static const struct aca_block_info aca_v1_0_umc = { |
| 314 | .name = "umc" , |
| 315 | .ras_block_id = RAS_BLOCK_ID__UMC, |
| 316 | .hwip = ACA_ECC_HWIP__UMC, |
| 317 | .mask = ACA_ERROR__UE_MASK | ACA_ERROR__CE_MASK | ACA_ERROR__DE_MASK, |
| 318 | .bank_ops = { |
| 319 | .bank_match = aca_match_bank_default, |
| 320 | .bank_parse = aca_parse_umc_bank, |
| 321 | }, |
| 322 | }; |
| 323 | |
| 324 | static const struct aca_block_info aca_v1_0_gfx = { |
| 325 | .name = "gfx" , |
| 326 | .ras_block_id = RAS_BLOCK_ID__GFX, |
| 327 | .hwip = ACA_ECC_HWIP__SMU, |
| 328 | .mask = ACA_ERROR__UE_MASK | ACA_ERROR__CE_MASK, |
| 329 | .bank_ops = { |
| 330 | .bank_match = aca_match_gfx_bank, |
| 331 | .bank_parse = aca_parse_bank_default, |
| 332 | }, |
| 333 | }; |
| 334 | |
| 335 | static const struct aca_block_info aca_v1_0_sdma = { |
| 336 | .name = "sdma" , |
| 337 | .ras_block_id = RAS_BLOCK_ID__SDMA, |
| 338 | .hwip = ACA_ECC_HWIP__SMU, |
| 339 | .mask = ACA_ERROR__UE_MASK, |
| 340 | .bank_ops = { |
| 341 | .bank_match = aca_match_sdma_bank, |
| 342 | .bank_parse = aca_parse_bank_default, |
| 343 | }, |
| 344 | }; |
| 345 | |
| 346 | static const struct aca_block_info aca_v1_0_mmhub = { |
| 347 | .name = "mmhub" , |
| 348 | .ras_block_id = RAS_BLOCK_ID__MMHUB, |
| 349 | .hwip = ACA_ECC_HWIP__SMU, |
| 350 | .mask = ACA_ERROR__UE_MASK, |
| 351 | .bank_ops = { |
| 352 | .bank_match = aca_match_mmhub_bank, |
| 353 | .bank_parse = aca_parse_bank_default, |
| 354 | }, |
| 355 | }; |
| 356 | |
| 357 | static const struct aca_block_info aca_v1_0_xgmi = { |
| 358 | .name = "xgmi" , |
| 359 | .ras_block_id = RAS_BLOCK_ID__XGMI_WAFL, |
| 360 | .hwip = ACA_ECC_HWIP__PCS_XGMI, |
| 361 | .mask = ACA_ERROR__UE_MASK | ACA_ERROR__CE_MASK, |
| 362 | .bank_ops = { |
| 363 | .bank_match = aca_match_bank_default, |
| 364 | .bank_parse = aca_parse_xgmi_bank, |
| 365 | }, |
| 366 | }; |
| 367 | |
| 368 | static const struct aca_block_info *aca_block_info_v1_0[] = { |
| 369 | &aca_v1_0_umc, |
| 370 | &aca_v1_0_gfx, |
| 371 | &aca_v1_0_sdma, |
| 372 | &aca_v1_0_mmhub, |
| 373 | &aca_v1_0_xgmi, |
| 374 | }; |
| 375 | |
| 376 | const struct ras_aca_ip_func ras_aca_func_v1_0 = { |
| 377 | .block_num = ARRAY_SIZE(aca_block_info_v1_0), |
| 378 | .block_info = aca_block_info_v1_0, |
| 379 | }; |
| 380 | |