diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 28e603243b6729be6938b5c22fd88aa5379d6f18..bf5a95104ec1153a26832c668eaaeba3eb2f8846 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -333,6 +333,11 @@ struct ecc_info_per_ch { struct umc_ecc_info { struct ecc_info_per_ch ecc[MAX_UMC_CHANNEL_NUM]; + + /* Determine smu ecctable whether support + * record correctable error address + */ + int record_ce_addr_supported; }; struct amdgpu_ras { diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c index 606892dbea1c723b3f4ef40f572e5948a730ab74..bf7524f16b669c767fc432bc2ba17bd0849ac935 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c @@ -119,6 +119,24 @@ static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device *error_count += 1; umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset); + + if (ras->umc_ecc.record_ce_addr_supported) { + uint64_t err_addr, soc_pa; + uint32_t channel_index = + adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst]; + + err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr; + err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); + /* translate umc channel address to soc pa, 3 parts are included */ + soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | + ADDR_OF_256B_BLOCK(channel_index) | + OFFSET_IN_256B_BLOCK(err_addr); + + /* The umc channel bits are not original values, they are hashed */ + SET_CHANNEL_HASH(channel_index, soc_pa); + + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa); + } } } @@ -251,7 +269,9 @@ static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev, uint32_t umc_reg_offset, - unsigned long *error_count) + unsigned long *error_count, + uint32_t ch_inst, + uint32_t umc_inst) { uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; uint32_t ecc_err_cnt, ecc_err_cnt_addr; @@ -295,6 +315,31 @@ static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev, *error_count += 1; umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset); + + { + uint64_t err_addr, soc_pa; + uint32_t mc_umc_addrt0; + uint32_t channel_index; + + mc_umc_addrt0 = + SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0); + + channel_index = + adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst]; + + err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4); + err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); + + /* translate umc channel address to soc pa, 3 parts are included */ + soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | + ADDR_OF_256B_BLOCK(channel_index) | + OFFSET_IN_256B_BLOCK(err_addr); + + /* The umc channel bits are not original values, they are hashed */ + SET_CHANNEL_HASH(channel_index, soc_pa); + + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa); + } } } @@ -395,7 +440,8 @@ static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev, ch_inst); umc_v6_7_query_correctable_error_count(adev, umc_reg_offset, - &(err_data->ce_count)); + &(err_data->ce_count), + ch_inst, umc_inst); umc_v6_7_querry_uncorrectable_error_count(adev, umc_reg_offset, &(err_data->ue_count)); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index bf124bc98b804d4baace560e6e90f706a3ae4a77..bb3c238213239361fab4360aff0369385a4c68f6 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -1884,6 +1884,7 @@ static ssize_t aldebaran_get_ecc_info(struct smu_context *smu, ecc_info_per_channel->mca_ceumc_addr = ecc_table->EccInfo_V2[i].mca_ceumc_addr; } + eccinfo->record_ce_addr_supported = 1; } return ret;