| 1 | /* |
| 2 | * Copyright (C) 2021 Advanced Micro Devices, Inc. |
| 3 | * |
| 4 | * Permission is hereby granted, free of charge, to any person obtaining a |
| 5 | * copy of this software and associated documentation files (the "Software"), |
| 6 | * to deal in the Software without restriction, including without limitation |
| 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| 8 | * and/or sell copies of the Software, and to permit persons to whom the |
| 9 | * Software is furnished to do so, subject to the following conditions: |
| 10 | * |
| 11 | * The above copyright notice and this permission notice shall be included |
| 12 | * in all copies or substantial portions of the Software. |
| 13 | * |
| 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
| 15 | * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| 17 | * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN |
| 18 | * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
| 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 20 | */ |
| 21 | #ifndef __AMDGPU_MCA_H__ |
| 22 | #define __AMDGPU_MCA_H__ |
| 23 | |
| 24 | #include "amdgpu_ras.h" |
| 25 | |
/* Number of 64-bit register slots stored per bank in mca_bank_entry.regs[]. */
#define MCA_MAX_REGS_COUNT			(16)
| 27 | |
/*
 * MCA_REG_FIELD - extract the bit-field [h:l] from a 64-bit MCA register value.
 * @x: register value
 * @h: index of the most-significant bit of the field (inclusive)
 * @l: index of the least-significant bit of the field (inclusive)
 *
 * The value is masked with GENMASK_ULL(h, l) and shifted right so that bit @l
 * ends up in bit 0.  @l is parenthesized in the shift so that an argument
 * containing an operator of lower precedence than '>>' (for example a ternary
 * expression) still expands as intended.  @l is expanded more than once, so
 * it must not have side effects.
 */
#define MCA_REG_FIELD(x, h, l)			(((x) & GENMASK_ULL(h, l)) >> (l))

/* Field accessors for the MCA STATUS register; arguments are (high bit, low bit). */
#define MCA_REG__STATUS__VAL(x)			MCA_REG_FIELD(x, 63, 63)
#define MCA_REG__STATUS__OVERFLOW(x)		MCA_REG_FIELD(x, 62, 62)
#define MCA_REG__STATUS__UC(x)			MCA_REG_FIELD(x, 61, 61)
#define MCA_REG__STATUS__EN(x)			MCA_REG_FIELD(x, 60, 60)
#define MCA_REG__STATUS__MISCV(x)		MCA_REG_FIELD(x, 59, 59)
#define MCA_REG__STATUS__ADDRV(x)		MCA_REG_FIELD(x, 58, 58)
#define MCA_REG__STATUS__PCC(x)			MCA_REG_FIELD(x, 57, 57)
#define MCA_REG__STATUS__ERRCOREIDVAL(x)	MCA_REG_FIELD(x, 56, 56)
#define MCA_REG__STATUS__TCC(x)			MCA_REG_FIELD(x, 55, 55)
#define MCA_REG__STATUS__SYNDV(x)		MCA_REG_FIELD(x, 53, 53)
#define MCA_REG__STATUS__CECC(x)		MCA_REG_FIELD(x, 46, 46)
#define MCA_REG__STATUS__UECC(x)		MCA_REG_FIELD(x, 45, 45)
#define MCA_REG__STATUS__DEFERRED(x)		MCA_REG_FIELD(x, 44, 44)
#define MCA_REG__STATUS__POISON(x)		MCA_REG_FIELD(x, 43, 43)
#define MCA_REG__STATUS__SCRUB(x)		MCA_REG_FIELD(x, 40, 40)
#define MCA_REG__STATUS__ERRCOREID(x)		MCA_REG_FIELD(x, 37, 32)
#define MCA_REG__STATUS__ADDRLSB(x)		MCA_REG_FIELD(x, 29, 24)
#define MCA_REG__STATUS__ERRORCODEEXT(x)	MCA_REG_FIELD(x, 21, 16)
#define MCA_REG__STATUS__ERRORCODE(x)		MCA_REG_FIELD(x, 15, 0)

/* Field accessor for the MCA MISC0 register: bits 43:32. */
#define MCA_REG__MISC0__ERRCNT(x)		MCA_REG_FIELD(x, 43, 32)

/* Field accessor for the MCA SYND register: bits 17:0. */
#define MCA_REG__SYND__ERRORINFORMATION(x)	MCA_REG_FIELD(x, 17, 0)
| 52 | |
/*
 * Identifier of the IP block an MCA bank is attributed to.
 *
 * Valid identifiers are numbered consecutively from 0; the implicit numbering
 * is relied upon to derive AMDGPU_MCA_IP_COUNT, so new entries must be added
 * immediately before it.
 */
enum amdgpu_mca_ip {
	/* Not a valid IP; negative so it lies outside [0, AMDGPU_MCA_IP_COUNT). */
	AMDGPU_MCA_IP_UNKNOW = -1,

	AMDGPU_MCA_IP_PSP = 0,
	AMDGPU_MCA_IP_SDMA,
	AMDGPU_MCA_IP_GC,
	AMDGPU_MCA_IP_SMU,
	AMDGPU_MCA_IP_MP5,
	AMDGPU_MCA_IP_UMC,
	AMDGPU_MCA_IP_PCS_XGMI,

	/* Number of valid IP identifiers above; must stay last. */
	AMDGPU_MCA_IP_COUNT,
};
| 64 | |
/*
 * Class of MCA error being queried or logged.
 *
 * The values are also used as array indices/sizes in this header (see
 * struct amdgpu_mca::mca_caches), so their numbering must not change.
 * Expansions of the abbreviations below are inferred from naming elsewhere
 * in this header -- TODO confirm.
 */
enum amdgpu_mca_error_type {
	AMDGPU_MCA_ERROR_TYPE_UE = 0,	/* presumably "uncorrectable error" */
	AMDGPU_MCA_ERROR_TYPE_CE,	/* presumably "correctable error" */
	AMDGPU_MCA_ERROR_TYPE_DE,	/* presumably "deferred error" */
};
| 70 | |
| 71 | struct amdgpu_mca_ras_block { |
| 72 | struct amdgpu_ras_block_object ras_block; |
| 73 | }; |
| 74 | |
/**
 * struct amdgpu_mca_ras - RAS bookkeeping for one MCA sub-block.
 * @ras_if: pointer to the struct ras_common_if associated with this sub-block
 * @ras:    pointer to the MCA RAS block wrapper for this sub-block
 *
 * Both members are plain pointers; ownership and lifetime are defined by the
 * code that fills them in and are not visible from this header.
 */
struct amdgpu_mca_ras {
	struct ras_common_if		*ras_if;
	struct amdgpu_mca_ras_block	*ras;
};
| 79 | |
| 80 | struct mca_bank_set { |
| 81 | int nr_entries; |
| 82 | struct list_head list; |
| 83 | }; |
| 84 | |
| 85 | struct mca_bank_cache { |
| 86 | struct mca_bank_set mca_set; |
| 87 | struct mutex lock; |
| 88 | }; |
| 89 | |
| 90 | struct amdgpu_mca { |
| 91 | struct amdgpu_mca_ras mp0; |
| 92 | struct amdgpu_mca_ras mp1; |
| 93 | struct amdgpu_mca_ras mpio; |
| 94 | const struct amdgpu_mca_smu_funcs *mca_funcs; |
| 95 | struct mca_bank_cache mca_caches[AMDGPU_MCA_ERROR_TYPE_DE]; |
| 96 | atomic_t ue_update_flag; |
| 97 | }; |
| 98 | |
/*
 * Named register indices for an MCA bank (presumably positions within
 * mca_bank_entry.regs[] -- confirm in the implementation).
 *
 * The numbering is sparse: indices 0, 4 and 7..15 have no named constant
 * here.  MCA_REG_IDX_COUNT has the same value as MCA_MAX_REGS_COUNT.
 */
enum mca_reg_idx {
	MCA_REG_IDX_STATUS	= 1,
	MCA_REG_IDX_ADDR	= 2,
	MCA_REG_IDX_MISC0	= 3,
	MCA_REG_IDX_IPID	= 5,
	MCA_REG_IDX_SYND	= 6,
	MCA_REG_IDX_COUNT	= 16,
};
| 107 | |
/**
 * struct mca_bank_info - identification data attached to an MCA bank.
 * @socket_id: socket identifier
 * @aid:       AID identifier (presumably the die the bank belongs to --
 *             TODO confirm)
 * @hwid:      hardware identifier
 * @mcatype:   MCA type identifier
 *
 * How these values are obtained is defined by the implementation and is not
 * visible from this header.
 */
struct mca_bank_info {
	int	socket_id;
	int	aid;
	int	hwid;
	int	mcatype;
};
| 114 | |
| 115 | struct mca_bank_entry { |
| 116 | int idx; |
| 117 | enum amdgpu_mca_error_type type; |
| 118 | enum amdgpu_mca_ip ip; |
| 119 | struct mca_bank_info info; |
| 120 | uint64_t regs[MCA_MAX_REGS_COUNT]; |
| 121 | }; |
| 122 | |
| 123 | struct mca_bank_node { |
| 124 | struct mca_bank_entry entry; |
| 125 | struct list_head node; |
| 126 | }; |
| 127 | |
/**
 * struct amdgpu_mca_smu_funcs - table of MCA limits and callbacks.
 * @max_ue_count: limit associated with UE entries (exact use is defined by
 *                the implementation)
 * @max_ce_count: limit associated with CE entries (exact use is defined by
 *                the implementation)
 * @mca_set_debug_mode: callback taking the device and an enable flag
 * @mca_parse_mca_error_count: callback taking a RAS block, an error type and
 *                a bank entry, with a uint32_t output pointer @count
 * @mca_get_valid_mca_count: callback taking an error type, with a uint32_t
 *                output pointer @count
 * @mca_get_mca_entry: callback taking an error type and an index, with a
 *                bank-entry output pointer @entry
 *
 * All callbacks return int (presumably 0 on success and a negative errno on
 * failure, per the usual kernel convention -- confirm against the providers).
 */
struct amdgpu_mca_smu_funcs {
	int max_ue_count;
	int max_ce_count;

	int (*mca_set_debug_mode)(struct amdgpu_device *adev, bool enable);

	int (*mca_parse_mca_error_count)(struct amdgpu_device *adev,
					 enum amdgpu_ras_block blk,
					 enum amdgpu_mca_error_type type,
					 struct mca_bank_entry *entry,
					 uint32_t *count);

	int (*mca_get_valid_mca_count)(struct amdgpu_device *adev,
				       enum amdgpu_mca_error_type type,
				       uint32_t *count);

	int (*mca_get_mca_entry)(struct amdgpu_device *adev,
				 enum amdgpu_mca_error_type type,
				 int idx,
				 struct mca_bank_entry *entry);
};
| 139 | |
/*
 * Helpers that operate on a single MCA status register, identified by
 * @mc_status_addr.  None of them returns a value; results, where there are
 * any, are delivered through the pointer argument.  Whether an output counter
 * is overwritten or accumulated into is defined by the implementation.
 */

/* Query the correctable error count for @mc_status_addr into @error_count. */
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
		uint64_t mc_status_addr,
		unsigned long *error_count);

/* Query the uncorrectable error count for @mc_status_addr into @error_count. */
void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
		uint64_t mc_status_addr,
		unsigned long *error_count);

/* Reset the error count associated with @mc_status_addr. */
void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
		uint64_t mc_status_addr);

/*
 * Query RAS error counts for @mc_status_addr.  @ras_error_status is an opaque
 * pointer (presumably a struct ras_err_data * -- confirm in the
 * implementation).
 */
void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
		uint64_t mc_status_addr,
		void *ras_error_status);
/*
 * Software-side RAS initialization for the mp0, mp1 and mpio MCA sub-blocks.
 * Each returns int (presumably 0 on success, negative errno on failure --
 * confirm in the implementation).
 */
int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev);
| 157 | |
/*
 * MCA interface built on the struct amdgpu_mca_smu_funcs callback table.
 * Functions returning int presumably follow the 0 / negative-errno
 * convention -- confirm in the implementation.
 */

/* Supply the callback table @mca_funcs for @adev. */
void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev,
			       const struct amdgpu_mca_smu_funcs *mca_funcs);

/* Set up, tear down and reset the per-device MCA state. */
int amdgpu_mca_init(struct amdgpu_device *adev);
void amdgpu_mca_fini(struct amdgpu_device *adev);
int amdgpu_mca_reset(struct amdgpu_device *adev);

/* Enable or disable MCA debug mode according to @enable. */
int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable);

/* Report through @total the error count of class @type for RAS block @blk. */
int amdgpu_mca_smu_get_mca_set_error_count(struct amdgpu_device *adev,
					   enum amdgpu_ras_block blk,
					   enum amdgpu_mca_error_type type,
					   uint32_t *total);

/* Create the MCA debugfs entries for @adev under @root. */
void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev,
				 struct dentry *root);

/*
 * Log errors of class @type for RAS block @blk into @err_data, using the
 * query context @qctx.
 */
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev,
				 enum amdgpu_ras_block blk,
				 enum amdgpu_mca_error_type type,
				 struct ras_err_data *err_data,
				 struct ras_query_context *qctx);
| 168 | |
| 169 | #endif |
| 170 | |