提交 513befa6 编写于 作者: S Stanley.Yang 提交者: Alex Deucher

drm/amdgpu: message smu to update hbm bad page number

Use SMU to update the bad pages rather than directly
accessing the EEPROM from the driver.
Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
上级 7c5f3d7d
...@@ -1984,6 +1984,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ...@@ -1984,6 +1984,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
ret = amdgpu_ras_load_bad_pages(adev); ret = amdgpu_ras_load_bad_pages(adev);
if (ret) if (ret)
goto free; goto free;
if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num)
adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.num_recs);
} }
return 0; return 0;
......
...@@ -94,6 +94,7 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, ...@@ -94,6 +94,7 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry) struct amdgpu_iv_entry *entry)
{ {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
if (adev->umc.ras_funcs && if (adev->umc.ras_funcs &&
...@@ -131,6 +132,9 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, ...@@ -131,6 +132,9 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
amdgpu_ras_add_bad_pages(adev, err_data->err_addr, amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
err_data->err_addr_cnt); err_data->err_addr_cnt);
amdgpu_ras_save_bad_pages(adev); amdgpu_ras_save_bad_pages(adev);
if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num)
adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.num_recs);
} }
amdgpu_ras_reset_gpu(adev); amdgpu_ras_reset_gpu(adev);
......
...@@ -1232,6 +1232,12 @@ struct pptable_funcs { ...@@ -1232,6 +1232,12 @@ struct pptable_funcs {
*/ */
int (*wait_for_event)(struct smu_context *smu, int (*wait_for_event)(struct smu_context *smu,
enum smu_event_type event, uint64_t event_arg); enum smu_event_type event, uint64_t event_arg);
/**
* @send_hbm_bad_pages_num: message SMU to update bad page number
* of SMUBUS table.
*/
int (*send_hbm_bad_pages_num)(struct smu_context *smu, uint32_t size);
}; };
typedef enum { typedef enum {
......
...@@ -1923,6 +1923,20 @@ static int aldebaran_set_mp1_state(struct smu_context *smu, ...@@ -1923,6 +1923,20 @@ static int aldebaran_set_mp1_state(struct smu_context *smu,
} }
} }
/**
 * aldebaran_smu_send_hbm_bad_page_num - message the SMU with the HBM bad page number
 * @smu: SMU context the message is sent through
 * @size: value passed as the parameter of SMU_MSG_SetNumBadHbmPagesRetired
 *
 * Sends SMU_MSG_SetNumBadHbmPagesRetired with @size as its argument so the
 * SMU can update the bad page number on SMUBUS.
 *
 * Return: the result of smu_cmn_send_smc_msg_with_param(); a non-zero
 * result is additionally reported through dev_err().
 */
static int aldebaran_smu_send_hbm_bad_page_num(struct smu_context *smu,
					       uint32_t size)
{
	int err;

	err = smu_cmn_send_smc_msg_with_param(smu,
					      SMU_MSG_SetNumBadHbmPagesRetired,
					      size, NULL);
	if (!err)
		return 0;

	/* The message failed: log it and hand the error back to the caller. */
	dev_err(smu->adev->dev,
		"[%s] failed to message SMU to update HBM bad pages number\n",
		__func__);

	return err;
}
static const struct pptable_funcs aldebaran_ppt_funcs = { static const struct pptable_funcs aldebaran_ppt_funcs = {
/* init dpm */ /* init dpm */
.get_allowed_feature_mask = aldebaran_get_allowed_feature_mask, .get_allowed_feature_mask = aldebaran_get_allowed_feature_mask,
...@@ -1985,6 +1999,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = { ...@@ -1985,6 +1999,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = {
.wait_for_event = smu_v13_0_wait_for_event, .wait_for_event = smu_v13_0_wait_for_event,
.i2c_init = aldebaran_i2c_control_init, .i2c_init = aldebaran_i2c_control_init,
.i2c_fini = aldebaran_i2c_control_fini, .i2c_fini = aldebaran_i2c_control_fini,
.send_hbm_bad_pages_num = aldebaran_smu_send_hbm_bad_page_num,
}; };
void aldebaran_set_ppt_funcs(struct smu_context *smu) void aldebaran_set_ppt_funcs(struct smu_context *smu)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册