提交 78ad00c9 编写于 作者: T Tao Zhou 提交者: Alex Deucher

drm/amdgpu: Hook EEPROM table to RAS

Support loading and saving EEPROM records for RAS, and
move the storing of EEPROM records into bad page reservation.

v2: remove redundant check for con->eh_data
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
上级 9dc23a63
...@@ -1366,6 +1366,69 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, ...@@ -1366,6 +1366,69 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
return ret; return ret;
} }
/*
 * Write the bad page records that are not yet stored in EEPROM.
 *
 * Only the entries of con->eh_data->bps starting at index
 * control->num_recs are written; the function should be protected by
 * recovery_lock.
 *
 * Returns 0 on success or when there is nothing to save (no RAS context,
 * no eh_data, or no new records), -EIO if the EEPROM write fails.
 */
static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_ras_eeprom_control *control;
	int save_count;

	if (!con || !con->eh_data)
		return 0;

	/*
	 * Resolve the EEPROM control block only after the context has been
	 * validated, and use that one block both for the record count and
	 * for the write itself.
	 */
	control = &con->eeprom_control;
	data = con->eh_data;
	save_count = data->count - control->num_recs;

	/* only new entries are saved */
	if (save_count > 0) {
		if (amdgpu_ras_eeprom_process_recods(control,
						     &data->bps[control->num_recs],
						     true,
						     save_count)) {
			DRM_ERROR("Failed to save EEPROM table data!");
			return -EIO;
		}
	}

	return 0;
}
/*
 * Read the bad page records stored in EEPROM into a temporary buffer and
 * hand them to amdgpu_ras_add_bad_pages().
 *
 * Returns 0 when the table holds no records, -ENOMEM if the temporary
 * buffer cannot be allocated, -EIO if the EEPROM read fails, otherwise the
 * result of amdgpu_ras_add_bad_pages().
 */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras_eeprom_control *eeprom =
		&adev->psp.ras.ras->eeprom_control;
	struct eeprom_table_record *records;
	int err;

	/* empty table: nothing to read, so leave the EEPROM alone */
	if (eeprom->num_recs == 0)
		return 0;

	records = kcalloc(eeprom->num_recs, sizeof(*records), GFP_KERNEL);
	if (records == NULL)
		return -ENOMEM;

	if (amdgpu_ras_eeprom_process_recods(eeprom, records, false,
					     eeprom->num_recs) == 0) {
		err = amdgpu_ras_add_bad_pages(adev, records,
					       eeprom->num_recs);
	} else {
		DRM_ERROR("Failed to load EEPROM table records!");
		err = -EIO;
	}

	/* the buffer is released here on both the success and failure paths */
	kfree(records);
	return err;
}
/* called in gpu recovery/init */ /* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{ {
...@@ -1373,7 +1436,7 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) ...@@ -1373,7 +1436,7 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
struct ras_err_handler_data *data; struct ras_err_handler_data *data;
uint64_t bp; uint64_t bp;
struct amdgpu_bo *bo; struct amdgpu_bo *bo;
int i; int i, ret = 0;
if (!con || !con->eh_data) if (!con || !con->eh_data)
return 0; return 0;
...@@ -1393,9 +1456,12 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) ...@@ -1393,9 +1456,12 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
data->bps_bo[i] = bo; data->bps_bo[i] = bo;
data->last_reserved = i + 1; data->last_reserved = i + 1;
} }
/* continue to save bad pages to eeprom even if reserve_vram fails */
ret = amdgpu_ras_save_bad_pages(adev);
out: out:
mutex_unlock(&con->recovery_lock); mutex_unlock(&con->recovery_lock);
return 0; return ret;
} }
/* called when driver unload */ /* called when driver unload */
...@@ -1427,33 +1493,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) ...@@ -1427,33 +1493,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
return 0; return 0;
} }
/*
 * Placeholder for persisting the bad page records: performs no work and
 * always returns 0.
 */
static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
/* TODO
 * write the array to eeprom when SMU disabled.
 */
return 0;
}
/*
 * Forward a bad page record array to amdgpu_ras_add_bad_pages() and return
 * its result.
 *
 * NOTE(review): nothing is read from EEPROM here -- bps is never assigned,
 * so the callee receives a NULL array together with a count of
 * adev->umc.max_ras_err_cnt_per_query. Presumably this relies on
 * amdgpu_ras_add_bad_pages() to reserve space for new bad pages; confirm
 * that it tolerates a NULL source.
 */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
struct eeprom_table_record *bps = NULL;
int ret;
ret = amdgpu_ras_add_bad_pages(adev, bps,
adev->umc.max_ras_err_cnt_per_query);
return ret;
}
static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{ {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data = &con->eh_data; struct ras_err_handler_data **data = &con->eh_data;
int ret;
*data = kmalloc(sizeof(**data), *data = kmalloc(sizeof(**data),
GFP_KERNEL|__GFP_ZERO); GFP_KERNEL|__GFP_ZERO);
...@@ -1465,8 +1509,18 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ...@@ -1465,8 +1509,18 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
atomic_set(&con->in_recovery, 0); atomic_set(&con->in_recovery, 0);
con->adev = adev; con->adev = adev;
amdgpu_ras_load_bad_pages(adev); ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
amdgpu_ras_reserve_bad_pages(adev); if (ret)
return ret;
if (adev->psp.ras.ras->eeprom_control.num_recs) {
ret = amdgpu_ras_load_bad_pages(adev);
if (ret)
return ret;
ret = amdgpu_ras_reserve_bad_pages(adev);
if (ret)
return ret;
}
return 0; return 0;
} }
...@@ -1477,7 +1531,6 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) ...@@ -1477,7 +1531,6 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
struct ras_err_handler_data *data = con->eh_data; struct ras_err_handler_data *data = con->eh_data;
cancel_work_sync(&con->recovery_work); cancel_work_sync(&con->recovery_work);
amdgpu_ras_save_bad_pages(adev);
amdgpu_ras_release_bad_pages(adev); amdgpu_ras_release_bad_pages(adev);
mutex_lock(&con->recovery_lock); mutex_lock(&con->recovery_lock);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册