提交 8b0fb0e9 编写于 作者: Y yipechai 提交者: Alex Deucher

drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops

1.Modify gfx block to fit for the unified ras block data and ops.
2.Change amdgpu_gfx_ras_funcs to amdgpu_gfx_ras, and the corresponding variable name remove _funcs suffix.
3.Remove the const flag of gfx ras variable so that gfx ras block can be able to be inserted into amdgpu device ras block link list.
4.Invoke amdgpu_ras_register_ras_block function to register gfx ras block into amdgpu device ras block link list.
5.Remove the redundant code about gfx in amdgpu_ras.c after using the unified ras block.
6.Fill unified ras block .name .block .ras_late_init and .ras_fini for all of gfx versions. If .ras_late_init and .ras_fini had been defined by the selected gfx version, the defined functions will take effect; if not defined, default fill with amdgpu_gfx_ras_late_init and amdgpu_gfx_ras_fini.
Signed-off-by: Nyipechai <YiPeng.Chai@amd.com>
Reviewed-by: NHawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: NJohn Clements <john.clements@amd.com>
Reviewed-by: NTao Zhou <tao.zhou1@amd.com>
Signed-off-by: NAlex Deucher <alexander.deucher@amd.com>
上级 7cab2124
......@@ -622,7 +622,7 @@ int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value)
return r;
}
int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev)
int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, void *ras_info)
{
int r;
struct ras_fs_if fs_info = {
......@@ -695,9 +695,9 @@ int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
*/
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
if (adev->gfx.ras_funcs &&
adev->gfx.ras_funcs->query_ras_error_count)
adev->gfx.ras_funcs->query_ras_error_count(adev, err_data);
if (adev->gfx.ras && adev->gfx.ras->ras_block.hw_ops &&
adev->gfx.ras->ras_block.hw_ops->query_ras_error_count)
adev->gfx.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);
amdgpu_ras_reset_gpu(adev);
}
return AMDGPU_RAS_SUCCESS;
......
......@@ -31,6 +31,7 @@
#include "amdgpu_ring.h"
#include "amdgpu_rlc.h"
#include "soc15.h"
#include "amdgpu_ras.h"
/* GFX current status */
#define AMDGPU_GFX_NORMAL_MODE 0x00000000L
......@@ -198,16 +199,8 @@ struct amdgpu_cu_info {
uint32_t bitmap[4][4];
};
struct amdgpu_gfx_ras_funcs {
int (*ras_late_init)(struct amdgpu_device *adev);
void (*ras_fini)(struct amdgpu_device *adev);
int (*ras_error_inject)(struct amdgpu_device *adev,
void *inject_if);
int (*query_ras_error_count)(struct amdgpu_device *adev,
void *ras_error_status);
void (*reset_ras_error_count)(struct amdgpu_device *adev);
void (*query_ras_error_status)(struct amdgpu_device *adev);
void (*reset_ras_error_status)(struct amdgpu_device *adev);
struct amdgpu_gfx_ras {
struct amdgpu_ras_block_object ras_block;
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
};
......@@ -331,7 +324,7 @@ struct amdgpu_gfx {
/*ras */
struct ras_common_if *ras_if;
const struct amdgpu_gfx_ras_funcs *ras_funcs;
struct amdgpu_gfx_ras *ras;
};
#define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs->get_gpu_clock_counter((adev))
......@@ -393,7 +386,7 @@ bool amdgpu_gfx_is_me_queue_enabled(struct amdgpu_device *adev, int me,
int pipe, int queue);
void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable);
int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value);
int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev);
int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, void *ras_info);
void amdgpu_gfx_ras_fini(struct amdgpu_device *adev);
int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
void *err_data,
......
......@@ -89,6 +89,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
return ras_block_string[ras_block->block];
}
#define ras_block_str(_BLOCK_) (((_BLOCK_) < (sizeof(*ras_block_string)/sizeof(const char*))) ? ras_block_string[_BLOCK_] : "Out Of Range")
#define ras_err_str(i) (ras_error_string[ffs(i)])
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
......@@ -962,6 +964,7 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info)
{
struct amdgpu_ras_block_object* block_obj = NULL;
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
struct ras_err_data err_data = {0, 0, 0, NULL};
int i;
......@@ -969,6 +972,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
if (!obj)
return -EINVAL;
block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__UMC:
amdgpu_ras_get_ecc_info(adev, &err_data);
......@@ -981,13 +986,16 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
}
break;
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.ras_funcs &&
adev->gfx.ras_funcs->query_ras_error_count)
adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data);
if (!block_obj || !block_obj->hw_ops) {
dev_info(adev->dev, "%s doesn't config ras function \n",
get_ras_block_str(&info->head));
return -EINVAL;
}
if (block_obj->hw_ops->query_ras_error_count)
block_obj->hw_ops->query_ras_error_count(adev, &err_data);
if (adev->gfx.ras_funcs &&
adev->gfx.ras_funcs->query_ras_error_status)
adev->gfx.ras_funcs->query_ras_error_status(adev);
if (block_obj->hw_ops->query_ras_error_status)
block_obj->hw_ops->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
if (adev->mmhub.ras_funcs &&
......@@ -1074,18 +1082,23 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
enum amdgpu_ras_block block)
{
struct amdgpu_ras_block_object* block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
if (!amdgpu_ras_is_supported(adev, block))
return -EINVAL;
switch (block) {
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.ras_funcs &&
adev->gfx.ras_funcs->reset_ras_error_count)
adev->gfx.ras_funcs->reset_ras_error_count(adev);
if (!block_obj || !block_obj->hw_ops) {
dev_info(adev->dev, "%s doesn't config ras function \n", ras_block_str(block));
return -EINVAL;
}
if (block_obj->hw_ops->reset_ras_error_count)
block_obj->hw_ops->reset_ras_error_count(adev);
if (adev->gfx.ras_funcs &&
adev->gfx.ras_funcs->reset_ras_error_status)
adev->gfx.ras_funcs->reset_ras_error_status(adev);
if (block_obj->hw_ops->reset_ras_error_status)
block_obj->hw_ops->reset_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
if (adev->mmhub.ras_funcs &&
......@@ -1150,7 +1163,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
.address = info->address,
.value = info->value,
};
int ret = 0;
int ret = -EINVAL;
struct amdgpu_ras_block_object* block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, info->head.sub_block_index);
if (!obj)
return -EINVAL;
......@@ -1164,11 +1178,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.ras_funcs &&
adev->gfx.ras_funcs->ras_error_inject)
ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
else
ret = -EINVAL;
if (!block_obj || !block_obj->hw_ops) {
dev_info(adev->dev, "%s doesn't config ras function \n", get_ras_block_str(&info->head));
return -EINVAL;
}
if (block_obj->hw_ops->ras_error_inject)
ret = block_obj->hw_ops->ras_error_inject(adev, info);
break;
case AMDGPU_RAS_BLOCK__UMC:
case AMDGPU_RAS_BLOCK__SDMA:
......@@ -1800,15 +1816,20 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
struct ras_query_if *info)
{
struct amdgpu_ras_block_object* block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, info->head.sub_block_index);
/*
* Only two block need to query read/write
* RspStatus at current state
*/
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.ras_funcs &&
adev->gfx.ras_funcs->query_ras_error_status)
adev->gfx.ras_funcs->query_ras_error_status(adev);
if (!block_obj || !block_obj->hw_ops) {
dev_info(adev->dev, "%s doesn't config ras function \n", get_ras_block_str(&info->head));
return ;
}
if (block_obj->hw_ops->query_ras_error_status)
block_obj->hw_ops->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
if (adev->mmhub.ras_funcs &&
......
......@@ -882,7 +882,7 @@ static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);
static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status);
static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
void *inject_if);
......@@ -2197,12 +2197,16 @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
.select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,
};
static const struct amdgpu_gfx_ras_funcs gfx_v9_0_ras_funcs = {
.ras_late_init = amdgpu_gfx_ras_late_init,
.ras_fini = amdgpu_gfx_ras_fini,
.ras_error_inject = &gfx_v9_0_ras_error_inject,
.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count,
const struct amdgpu_ras_block_hw_ops gfx_v9_0_ras_ops = {
.ras_error_inject = &gfx_v9_0_ras_error_inject,
.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count,
};
static struct amdgpu_gfx_ras gfx_v9_0_ras = {
.ras_block = {
.hw_ops = &gfx_v9_0_ras_ops,
},
};
static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
......@@ -2231,7 +2235,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
DRM_INFO("fix gfx.config for vega12\n");
break;
case IP_VERSION(9, 4, 0):
adev->gfx.ras_funcs = &gfx_v9_0_ras_funcs;
adev->gfx.ras = &gfx_v9_0_ras;
adev->gfx.config.max_hw_contexts = 8;
adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
......@@ -2258,7 +2262,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
gb_addr_config = RAVEN_GB_ADDR_CONFIG_GOLDEN;
break;
case IP_VERSION(9, 4, 1):
adev->gfx.ras_funcs = &gfx_v9_4_ras_funcs;
adev->gfx.ras = &gfx_v9_4_ras;
adev->gfx.config.max_hw_contexts = 8;
adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
......@@ -2279,7 +2283,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
gb_addr_config |= 0x22010042;
break;
case IP_VERSION(9, 4, 2):
adev->gfx.ras_funcs = &gfx_v9_4_2_ras_funcs;
adev->gfx.ras = &gfx_v9_4_2_ras;
adev->gfx.config.max_hw_contexts = 8;
adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
......@@ -2298,6 +2302,25 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
break;
}
if (adev->gfx.ras) {
err = amdgpu_ras_register_ras_block(adev, &adev->gfx.ras->ras_block);
if (err) {
DRM_ERROR("Failed to register gfx ras block!\n");
return err;
}
strcpy(adev->gfx.ras->ras_block.name,"gfx");
adev->gfx.ras->ras_block.block = AMDGPU_RAS_BLOCK__GFX;
/* If not define special ras_late_init function, use gfx default ras_late_init */
if (!adev->gfx.ras->ras_block.ras_late_init)
adev->gfx.ras->ras_block.ras_late_init = amdgpu_gfx_ras_late_init;
/* If not define special ras_fini function, use gfx default ras_fini */
if (!adev->gfx.ras->ras_block.ras_fini)
adev->gfx.ras->ras_block.ras_fini = amdgpu_gfx_ras_fini;
}
adev->gfx.config.gb_addr_config = gb_addr_config;
adev->gfx.config.gb_addr_config_fields.num_pipes = 1 <<
......@@ -2513,9 +2536,8 @@ static int gfx_v9_0_sw_fini(void *handle)
int i;
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
if (adev->gfx.ras_funcs &&
adev->gfx.ras_funcs->ras_fini)
adev->gfx.ras_funcs->ras_fini(adev);
if (adev->gfx.ras && adev->gfx.ras->ras_block.ras_fini)
adev->gfx.ras->ras_block.ras_fini(adev);
for (i = 0; i < adev->gfx.num_gfx_rings; i++)
amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
......@@ -4870,16 +4892,15 @@ static int gfx_v9_0_ecc_late_init(void *handle)
if (r)
return r;
if (adev->gfx.ras_funcs &&
adev->gfx.ras_funcs->ras_late_init) {
r = adev->gfx.ras_funcs->ras_late_init(adev);
if (adev->gfx.ras && adev->gfx.ras->ras_block.ras_late_init) {
r = adev->gfx.ras->ras_block.ras_late_init(adev, NULL);
if (r)
return r;
}
if (adev->gfx.ras_funcs &&
adev->gfx.ras_funcs->enable_watchdog_timer)
adev->gfx.ras_funcs->enable_watchdog_timer(adev);
if (adev->gfx.ras &&
adev->gfx.ras->enable_watchdog_timer)
adev->gfx.ras->enable_watchdog_timer(adev);
return 0;
}
......@@ -6819,7 +6840,7 @@ static void gfx_v9_0_reset_ras_error_count(struct amdgpu_device *adev)
WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, 255);
}
static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
......@@ -6828,7 +6849,7 @@ static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
uint32_t reg_value;
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
return -EINVAL;
return;
err_data->ue_count = 0;
err_data->ce_count = 0;
......@@ -6857,8 +6878,6 @@ static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
mutex_unlock(&adev->grbm_idx_mutex);
gfx_v9_0_query_utc_edc_status(adev, err_data);
return 0;
}
static void gfx_v9_0_emit_mem_sync(struct amdgpu_ring *ring)
......
......@@ -863,7 +863,7 @@ static int gfx_v9_4_ras_error_count(struct amdgpu_device *adev,
return 0;
}
static int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
static void gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
......@@ -872,7 +872,7 @@ static int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
uint32_t reg_value;
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
return -EINVAL;
return;
err_data->ue_count = 0;
err_data->ce_count = 0;
......@@ -903,7 +903,6 @@ static int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
gfx_v9_4_query_utc_edc_status(adev, err_data);
return 0;
}
static void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev)
......@@ -1029,11 +1028,16 @@ static void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev)
mutex_unlock(&adev->grbm_idx_mutex);
}
const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs = {
.ras_late_init = amdgpu_gfx_ras_late_init,
.ras_fini = amdgpu_gfx_ras_fini,
.ras_error_inject = &gfx_v9_4_ras_error_inject,
.query_ras_error_count = &gfx_v9_4_query_ras_error_count,
.reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
.query_ras_error_status = &gfx_v9_4_query_ras_error_status,
const struct amdgpu_ras_block_hw_ops gfx_v9_4_ras_ops = {
.ras_error_inject = &gfx_v9_4_ras_error_inject,
.query_ras_error_count = &gfx_v9_4_query_ras_error_count,
.reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
.query_ras_error_status = &gfx_v9_4_query_ras_error_status,
};
struct amdgpu_gfx_ras gfx_v9_4_ras = {
.ras_block = {
.hw_ops = &gfx_v9_4_ras_ops,
},
};
......@@ -24,6 +24,6 @@
#ifndef __GFX_V9_4_H__
#define __GFX_V9_4_H__
extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs;
extern struct amdgpu_gfx_ras gfx_v9_4_ras;
#endif /* __GFX_V9_4_H__ */
......@@ -1641,14 +1641,14 @@ static int gfx_v9_4_2_query_utc_edc_count(struct amdgpu_device *adev,
return 0;
}
static int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
static void gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
uint32_t sec_count = 0, ded_count = 0;
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
return -EINVAL;
return;
err_data->ue_count = 0;
err_data->ce_count = 0;
......@@ -1661,7 +1661,6 @@ static int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
err_data->ce_count += sec_count;
err_data->ue_count += ded_count;
return 0;
}
static void gfx_v9_4_2_reset_utc_err_status(struct amdgpu_device *adev)
......@@ -1931,13 +1930,17 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
mutex_unlock(&adev->grbm_idx_mutex);
}
const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs = {
.ras_late_init = amdgpu_gfx_ras_late_init,
.ras_fini = amdgpu_gfx_ras_fini,
.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
struct amdgpu_ras_block_hw_ops gfx_v9_4_2_ras_ops ={
.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
};
struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
.ras_block = {
.hw_ops = &gfx_v9_4_2_ras_ops,
},
.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
};
......@@ -31,6 +31,6 @@ void gfx_v9_4_2_init_golden_registers(struct amdgpu_device *adev,
void gfx_v9_4_2_set_power_brake_sequence(struct amdgpu_device *adev);
int gfx_v9_4_2_do_edc_gpr_workarounds(struct amdgpu_device *adev);
extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs;
extern struct amdgpu_gfx_ras gfx_v9_4_2_ras;
#endif /* __GFX_V9_4_2_H__ */
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册