From 66f879496121c18c541125d04a444c62f3ca82ad Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Tue, 19 Apr 2022 11:04:19 +0800 Subject: [PATCH] drm/amdgpu: add RAS poison consumption handler (v2) Add support for general RAS poison consumption handler. v2: remove callback function for poison consumption. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 34 +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + 2 files changed, 35 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f10afc1ea017..1c86ec9ab139 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1515,6 +1515,38 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) /* ras fs end */ /* ih begin */ +static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj, + struct amdgpu_iv_entry *entry) +{ + bool poison_stat = true, need_reset = true; + struct amdgpu_device *adev = obj->adev; + struct ras_err_data err_data = {0, 0, 0, NULL}; + struct amdgpu_ras_block_object *block_obj = + amdgpu_ras_get_ras_block(adev, obj->head.block, 0); + + if (!adev->gmc.xgmi.connected_to_cpu) + amdgpu_umc_poison_handler(adev, &err_data, false); + + /* both query_poison_status and handle_poison_consumption are optional */ + if (block_obj && block_obj->hw_ops) { + if (block_obj->hw_ops->query_poison_status) { + poison_stat = block_obj->hw_ops->query_poison_status(adev); + if (!poison_stat) + dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", + block_obj->ras_comm.name); + } + + if (poison_stat && block_obj->hw_ops->handle_poison_consumption) { + poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); + need_reset = poison_stat; + } + } + + /* gpu reset is fallback for all failed cases */ + if (need_reset) + amdgpu_ras_reset_gpu(adev); +} + static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj, struct amdgpu_iv_entry *entry) { @@ -1567,6 +1599,8 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) amdgpu_ras_interrupt_poison_creation_handler(obj, &entry); + else + amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry); } else { if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) amdgpu_ras_interrupt_umc_handler(obj, &entry); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 606df8869b89..c4b61785ab5c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -509,6 +509,7 @@ struct amdgpu_ras_block_hw_ops { void (*reset_ras_error_count)(struct amdgpu_device *adev); void (*reset_ras_error_status)(struct amdgpu_device *adev); bool (*query_poison_status)(struct amdgpu_device *adev); + bool (*handle_poison_consumption)(struct amdgpu_device *adev); }; /* work flow -- GitLab