提交 640ae42e 编写于 作者: J John Clements 提交者: Alex Deucher

drm/amdgpu: Updated RAS infrastructure

Update RAS infrastructure to support RAS query for MCA subblocks
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
上级 6effad8a
...@@ -31,7 +31,7 @@ void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev, ...@@ -31,7 +31,7 @@ void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr, uint64_t mc_status_addr,
unsigned long *error_count) unsigned long *error_count)
{ {
uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4); uint64_t mc_status = RREG64_PCIE(mc_status_addr);
if (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && if (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
...@@ -42,7 +42,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev, ...@@ -42,7 +42,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr, uint64_t mc_status_addr,
unsigned long *error_count) unsigned long *error_count)
{ {
uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4); uint64_t mc_status = RREG64_PCIE(mc_status_addr);
if ((REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && if ((REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
(REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 || (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
...@@ -56,7 +56,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev, ...@@ -56,7 +56,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
void amdgpu_mca_reset_error_count(struct amdgpu_device *adev, void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr) uint64_t mc_status_addr)
{ {
WREG64_PCIE(mc_status_addr * 4, 0x0ULL); WREG64_PCIE(mc_status_addr, 0x0ULL);
} }
void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev, void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
...@@ -87,8 +87,8 @@ int amdgpu_mca_ras_late_init(struct amdgpu_device *adev, ...@@ -87,8 +87,8 @@ int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
if (!mca_dev->ras_if) if (!mca_dev->ras_if)
return -ENOMEM; return -ENOMEM;
mca_dev->ras_if->block = mca_dev->ras_funcs->ras_block; mca_dev->ras_if->block = mca_dev->ras_funcs->ras_block;
mca_dev->ras_if->sub_block_index = mca_dev->ras_funcs->ras_sub_block;
mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
mca_dev->ras_if->sub_block_index = 0;
} }
ih_info.head = fs_info.head = *mca_dev->ras_if; ih_info.head = fs_info.head = *mca_dev->ras_if;
r = amdgpu_ras_late_init(adev, mca_dev->ras_if, r = amdgpu_ras_late_init(adev, mca_dev->ras_if,
......
...@@ -29,6 +29,7 @@ struct amdgpu_mca_ras_funcs { ...@@ -29,6 +29,7 @@ struct amdgpu_mca_ras_funcs {
void (*query_ras_error_address)(struct amdgpu_device *adev, void (*query_ras_error_address)(struct amdgpu_device *adev,
void *ras_error_status); void *ras_error_status);
uint32_t ras_block; uint32_t ras_block;
uint32_t ras_sub_block;
const char* sysfs_name; const char* sysfs_name;
}; };
......
...@@ -61,9 +61,30 @@ const char *ras_block_string[] = { ...@@ -61,9 +61,30 @@ const char *ras_block_string[] = {
"mp0", "mp0",
"mp1", "mp1",
"fuse", "fuse",
"mpio", "mca",
}; };
/*
 * Printable names of the MCA sub-blocks.
 *
 * get_ras_block_str() indexes this table with ras_common_if.sub_block_index
 * when the block is AMDGPU_RAS_BLOCK__MCA, so the entry order has to follow
 * enum amdgpu_ras_mca_block.
 */
const char *ras_mca_block_string[] = {
	"mca_mp0",	/* 0: AMDGPU_RAS_MCA_BLOCK__MP0 */
	"mca_mp1",	/* 1: AMDGPU_RAS_MCA_BLOCK__MP1 */
	"mca_mpio",	/* 2: AMDGPU_RAS_MCA_BLOCK__MPIO */
	"mca_iohc",	/* 3: AMDGPU_RAS_MCA_BLOCK__IOHC */
};
/**
 * get_ras_block_str - return a printable name for a RAS block
 * @ras_block: RAS block descriptor to name, may be NULL
 *
 * For AMDGPU_RAS_BLOCK__MCA the name is taken from ras_mca_block_string[],
 * selected by @ras_block->sub_block_index; every other block is named from
 * ras_block_string[] by @ras_block->block.
 *
 * Return: the block name, "NULL" when @ras_block is NULL, or "OUT OF RANGE"
 * when the block id or the MCA sub-block index does not fit its name table.
 */
const char *get_ras_block_str(struct ras_common_if *ras_block)
{
	if (!ras_block)
		return "NULL";

	if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT)
		return "OUT OF RANGE";

	if (ras_block->block == AMDGPU_RAS_BLOCK__MCA) {
		/*
		 * sub_block_index is not validated by every caller (this
		 * helper is used on error/log paths too), so never index
		 * past the end of the MCA name table.
		 */
		if (ras_block->sub_block_index >=
		    sizeof(ras_mca_block_string) / sizeof(ras_mca_block_string[0]))
			return "OUT OF RANGE";

		return ras_mca_block_string[ras_block->sub_block_index];
	}

	return ras_block_string[ras_block->block];
}
#define ras_err_str(i) (ras_error_string[ffs(i)]) #define ras_err_str(i) (ras_error_string[ffs(i)])
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
...@@ -188,7 +209,7 @@ static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id) ...@@ -188,7 +209,7 @@ static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) { for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
*block_id = i; *block_id = i;
if (strcmp(name, ras_block_str(i)) == 0) if (strcmp(name, ras_block_string[i]) == 0)
return 0; return 0;
} }
return -EINVAL; return -EINVAL;
...@@ -510,7 +531,6 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, ...@@ -510,7 +531,6 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
if (amdgpu_ras_query_error_status(obj->adev, &info)) if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL; return -EINVAL;
if (obj->adev->asic_type == CHIP_ALDEBARAN) { if (obj->adev->asic_type == CHIP_ALDEBARAN) {
if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
DRM_WARN("Failed to reset error counter and error status"); DRM_WARN("Failed to reset error counter and error status");
...@@ -530,7 +550,7 @@ static inline void put_obj(struct ras_manager *obj) ...@@ -530,7 +550,7 @@ static inline void put_obj(struct ras_manager *obj)
if (obj && (--obj->use == 0)) if (obj && (--obj->use == 0))
list_del(&obj->node); list_del(&obj->node);
if (obj && (obj->use < 0)) if (obj && (obj->use < 0))
DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", ras_block_str(obj->head.block)); DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
} }
/* make one obj and return it. */ /* make one obj and return it. */
...@@ -546,7 +566,14 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, ...@@ -546,7 +566,14 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
if (head->block >= AMDGPU_RAS_BLOCK_COUNT) if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
return NULL; return NULL;
obj = &con->objs[head->block]; if (head->block == AMDGPU_RAS_BLOCK__MCA) {
if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
return NULL;
obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
} else
obj = &con->objs[head->block];
/* already exist. return obj? */ /* already exist. return obj? */
if (alive_obj(obj)) if (alive_obj(obj))
return NULL; return NULL;
...@@ -574,19 +601,21 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, ...@@ -574,19 +601,21 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
if (head->block >= AMDGPU_RAS_BLOCK_COUNT) if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
return NULL; return NULL;
obj = &con->objs[head->block]; if (head->block == AMDGPU_RAS_BLOCK__MCA) {
if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
return NULL;
obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
} else
obj = &con->objs[head->block];
if (alive_obj(obj)) { if (alive_obj(obj))
WARN_ON(head->block != obj->head.block);
return obj; return obj;
}
} else { } else {
for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
obj = &con->objs[i]; obj = &con->objs[i];
if (alive_obj(obj)) { if (alive_obj(obj))
WARN_ON(i != obj->head.block);
return obj; return obj;
}
} }
} }
...@@ -627,8 +656,6 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, ...@@ -627,8 +656,6 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
*/ */
if (!amdgpu_ras_is_feature_allowed(adev, head)) if (!amdgpu_ras_is_feature_allowed(adev, head))
return 0; return 0;
if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
return 0;
if (enable) { if (enable) {
if (!obj) { if (!obj) {
...@@ -679,18 +706,13 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, ...@@ -679,18 +706,13 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
/* Do not enable if it is not allowed. */ /* Do not enable if it is not allowed. */
WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
/* Are we alerady in that state we are going to set? */
if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) {
ret = 0;
goto out;
}
if (!amdgpu_ras_intr_triggered()) { if (!amdgpu_ras_intr_triggered()) {
ret = psp_ras_enable_features(&adev->psp, info, enable); ret = psp_ras_enable_features(&adev->psp, info, enable);
if (ret) { if (ret) {
dev_err(adev->dev, "ras %s %s failed %d\n", dev_err(adev->dev, "ras %s %s failed %d\n",
enable ? "enable":"disable", enable ? "enable":"disable",
ras_block_str(head->block), get_ras_block_str(head),
ret); ret);
goto out; goto out;
} }
...@@ -732,7 +754,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, ...@@ -732,7 +754,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
if (!ret) if (!ret)
dev_info(adev->dev, dev_info(adev->dev,
"RAS INFO: %s setup object\n", "RAS INFO: %s setup object\n",
ras_block_str(head->block)); get_ras_block_str(head));
} }
} else { } else {
/* setup the object then issue a ras TA disable cmd.*/ /* setup the object then issue a ras TA disable cmd.*/
...@@ -782,17 +804,39 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, ...@@ -782,17 +804,39 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
bool bypass) bool bypass)
{ {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
int i; int i;
const enum amdgpu_ras_error_type default_ras_type = const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;
AMDGPU_RAS_ERROR__NONE;
for (i = 0; i < ras_block_count; i++) { for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
struct ras_common_if head = { struct ras_common_if head = {
.block = i, .block = i,
.type = default_ras_type, .type = default_ras_type,
.sub_block_index = 0, .sub_block_index = 0,
}; };
if (i == AMDGPU_RAS_BLOCK__MCA)
continue;
if (bypass) {
/*
* bypass psp. vbios enable ras for us.
* so just create the obj
*/
if (__amdgpu_ras_feature_enable(adev, &head, 1))
break;
} else {
if (amdgpu_ras_feature_enable(adev, &head, 1))
break;
}
}
for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
struct ras_common_if head = {
.block = AMDGPU_RAS_BLOCK__MCA,
.type = default_ras_type,
.sub_block_index = i,
};
if (bypass) { if (bypass) {
/* /*
* bypass psp. vbios enable ras for us. * bypass psp. vbios enable ras for us.
...@@ -810,6 +854,32 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, ...@@ -810,6 +854,32 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
} }
/* feature ctl end */ /* feature ctl end */
/**
 * amdgpu_ras_mca_query_error_status - query error counts of one MCA sub-block
 * @adev: amdgpu device
 * @ras_block: RAS block descriptor; its sub_block_index selects the MCA
 *             sub-block (MP0, MP1 or MPIO) to query
 * @err_data: error data handed to the sub-block's query_ras_error_count
 *            callback
 *
 * Dispatches to the query_ras_error_count callback of the selected MCA
 * sub-block, if that sub-block has ras_funcs and implements the callback.
 * Any other sub-block index is silently ignored.
 */
void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
				       struct ras_common_if *ras_block,
				       struct ras_err_data *err_data)
{
	/*
	 * err_data is already a pointer to the caller's ras_err_data, so it
	 * is forwarded as-is.  Passing &err_data would hand the callback the
	 * address of this function's local pointer instead of the structure.
	 */
	switch (ras_block->sub_block_index) {
	case AMDGPU_RAS_MCA_BLOCK__MP0:
		if (adev->mca.mp0.ras_funcs &&
		    adev->mca.mp0.ras_funcs->query_ras_error_count)
			adev->mca.mp0.ras_funcs->query_ras_error_count(adev, err_data);
		break;
	case AMDGPU_RAS_MCA_BLOCK__MP1:
		if (adev->mca.mp1.ras_funcs &&
		    adev->mca.mp1.ras_funcs->query_ras_error_count)
			adev->mca.mp1.ras_funcs->query_ras_error_count(adev, err_data);
		break;
	case AMDGPU_RAS_MCA_BLOCK__MPIO:
		if (adev->mca.mpio.ras_funcs &&
		    adev->mca.mpio.ras_funcs->query_ras_error_count)
			adev->mca.mpio.ras_funcs->query_ras_error_count(adev, err_data);
		break;
	default:
		break;
	}
}
/* query/inject/cure begin */ /* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev, int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info) struct ras_query_if *info)
...@@ -873,6 +943,9 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, ...@@ -873,6 +943,9 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
adev->hdp.ras_funcs->query_ras_error_count) adev->hdp.ras_funcs->query_ras_error_count)
adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data); adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data);
break; break;
case AMDGPU_RAS_BLOCK__MCA:
amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data);
break;
default: default:
break; break;
} }
...@@ -894,13 +967,13 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, ...@@ -894,13 +967,13 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
adev->smuio.funcs->get_socket_id(adev), adev->smuio.funcs->get_socket_id(adev),
adev->smuio.funcs->get_die_id(adev), adev->smuio.funcs->get_die_id(adev),
obj->err_data.ce_count, obj->err_data.ce_count,
ras_block_str(info->head.block)); get_ras_block_str(&info->head));
} else { } else {
dev_info(adev->dev, "%ld correctable hardware errors " dev_info(adev->dev, "%ld correctable hardware errors "
"detected in %s block, no user " "detected in %s block, no user "
"action is needed.\n", "action is needed.\n",
obj->err_data.ce_count, obj->err_data.ce_count,
ras_block_str(info->head.block)); get_ras_block_str(&info->head));
} }
} }
if (err_data.ue_count) { if (err_data.ue_count) {
...@@ -913,12 +986,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, ...@@ -913,12 +986,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
adev->smuio.funcs->get_socket_id(adev), adev->smuio.funcs->get_socket_id(adev),
adev->smuio.funcs->get_die_id(adev), adev->smuio.funcs->get_die_id(adev),
obj->err_data.ue_count, obj->err_data.ue_count,
ras_block_str(info->head.block)); get_ras_block_str(&info->head));
} else { } else {
dev_info(adev->dev, "%ld uncorrectable hardware errors " dev_info(adev->dev, "%ld uncorrectable hardware errors "
"detected in %s block\n", "detected in %s block\n",
obj->err_data.ue_count, obj->err_data.ue_count,
ras_block_str(info->head.block)); get_ras_block_str(&info->head));
} }
} }
...@@ -1028,9 +1101,7 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, ...@@ -1028,9 +1101,7 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
case AMDGPU_RAS_BLOCK__SDMA: case AMDGPU_RAS_BLOCK__SDMA:
case AMDGPU_RAS_BLOCK__MMHUB: case AMDGPU_RAS_BLOCK__MMHUB:
case AMDGPU_RAS_BLOCK__PCIE_BIF: case AMDGPU_RAS_BLOCK__PCIE_BIF:
case AMDGPU_RAS_BLOCK__MP0: case AMDGPU_RAS_BLOCK__MCA:
case AMDGPU_RAS_BLOCK__MP1:
case AMDGPU_RAS_BLOCK__MPIO:
ret = psp_ras_trigger_error(&adev->psp, &block_info); ret = psp_ras_trigger_error(&adev->psp, &block_info);
break; break;
case AMDGPU_RAS_BLOCK__XGMI_WAFL: case AMDGPU_RAS_BLOCK__XGMI_WAFL:
...@@ -1038,13 +1109,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, ...@@ -1038,13 +1109,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
break; break;
default: default:
dev_info(adev->dev, "%s error injection is not supported yet\n", dev_info(adev->dev, "%s error injection is not supported yet\n",
ras_block_str(info->head.block)); get_ras_block_str(&info->head));
ret = -EINVAL; ret = -EINVAL;
} }
if (ret) if (ret)
dev_err(adev->dev, "ras inject %s failed %d\n", dev_err(adev->dev, "ras inject %s failed %d\n",
ras_block_str(info->head.block), ret); get_ras_block_str(&info->head), ret);
return ret; return ret;
} }
...@@ -1387,7 +1458,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) ...@@ -1387,7 +1458,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
if (amdgpu_ras_is_supported(adev, obj->head.block) && if (amdgpu_ras_is_supported(adev, obj->head.block) &&
(obj->attr_inuse == 1)) { (obj->attr_inuse == 1)) {
sprintf(fs_info.debugfs_name, "%s_err_inject", sprintf(fs_info.debugfs_name, "%s_err_inject",
ras_block_str(obj->head.block)); get_ras_block_str(&obj->head));
fs_info.head = obj->head; fs_info.head = obj->head;
amdgpu_ras_debugfs_create(adev, &fs_info, dir); amdgpu_ras_debugfs_create(adev, &fs_info, dir);
} }
...@@ -2185,7 +2256,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev) ...@@ -2185,7 +2256,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
return 0; return 0;
con = kmalloc(sizeof(struct amdgpu_ras) + con = kmalloc(sizeof(struct amdgpu_ras) +
sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT, sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
GFP_KERNEL|__GFP_ZERO); GFP_KERNEL|__GFP_ZERO);
if (!con) if (!con)
return -ENOMEM; return -ENOMEM;
......
...@@ -49,15 +49,22 @@ enum amdgpu_ras_block { ...@@ -49,15 +49,22 @@ enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__MP0, AMDGPU_RAS_BLOCK__MP0,
AMDGPU_RAS_BLOCK__MP1, AMDGPU_RAS_BLOCK__MP1,
AMDGPU_RAS_BLOCK__FUSE, AMDGPU_RAS_BLOCK__FUSE,
AMDGPU_RAS_BLOCK__MPIO, AMDGPU_RAS_BLOCK__MCA,
AMDGPU_RAS_BLOCK__LAST AMDGPU_RAS_BLOCK__LAST
}; };
extern const char *ras_block_string[]; enum amdgpu_ras_mca_block {
AMDGPU_RAS_MCA_BLOCK__MP0 = 0,
AMDGPU_RAS_MCA_BLOCK__MP1,
AMDGPU_RAS_MCA_BLOCK__MPIO,
AMDGPU_RAS_MCA_BLOCK__IOHC,
AMDGPU_RAS_MCA_BLOCK__LAST
};
#define ras_block_str(i) (ras_block_string[i])
#define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST #define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST
#define AMDGPU_RAS_MCA_BLOCK_COUNT AMDGPU_RAS_MCA_BLOCK__LAST
#define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1) #define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)
enum amdgpu_ras_gfx_subblock { enum amdgpu_ras_gfx_subblock {
...@@ -544,8 +551,8 @@ amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) { ...@@ -544,8 +551,8 @@ amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
return TA_RAS_BLOCK__MP1; return TA_RAS_BLOCK__MP1;
case AMDGPU_RAS_BLOCK__FUSE: case AMDGPU_RAS_BLOCK__FUSE:
return TA_RAS_BLOCK__FUSE; return TA_RAS_BLOCK__FUSE;
case AMDGPU_RAS_BLOCK__MPIO: case AMDGPU_RAS_BLOCK__MCA:
return TA_RAS_BLOCK__MPIO; return TA_RAS_BLOCK__MCA;
default: default:
WARN_ONCE(1, "RAS ERROR: unexpected block id %d\n", block); WARN_ONCE(1, "RAS ERROR: unexpected block id %d\n", block);
return TA_RAS_BLOCK__UMC; return TA_RAS_BLOCK__UMC;
...@@ -640,4 +647,6 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev); ...@@ -640,4 +647,6 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev);
int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev); int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev);
const char *get_ras_block_str(struct ras_common_if *ras_block);
#endif #endif
...@@ -52,7 +52,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp0_ras_funcs = { ...@@ -52,7 +52,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp0_ras_funcs = {
.ras_fini = mca_v3_0_mp0_ras_fini, .ras_fini = mca_v3_0_mp0_ras_fini,
.query_ras_error_count = mca_v3_0_mp0_query_ras_error_count, .query_ras_error_count = mca_v3_0_mp0_query_ras_error_count,
.query_ras_error_address = NULL, .query_ras_error_address = NULL,
.ras_block = AMDGPU_RAS_BLOCK__MP0, .ras_block = AMDGPU_RAS_BLOCK__MCA,
.ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP0,
.sysfs_name = "mp0_err_count", .sysfs_name = "mp0_err_count",
}; };
...@@ -79,7 +80,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp1_ras_funcs = { ...@@ -79,7 +80,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp1_ras_funcs = {
.ras_fini = mca_v3_0_mp1_ras_fini, .ras_fini = mca_v3_0_mp1_ras_fini,
.query_ras_error_count = mca_v3_0_mp1_query_ras_error_count, .query_ras_error_count = mca_v3_0_mp1_query_ras_error_count,
.query_ras_error_address = NULL, .query_ras_error_address = NULL,
.ras_block = AMDGPU_RAS_BLOCK__MP1, .ras_block = AMDGPU_RAS_BLOCK__MCA,
.ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP1,
.sysfs_name = "mp1_err_count", .sysfs_name = "mp1_err_count",
}; };
...@@ -106,7 +108,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mpio_ras_funcs = { ...@@ -106,7 +108,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mpio_ras_funcs = {
.ras_fini = mca_v3_0_mpio_ras_fini, .ras_fini = mca_v3_0_mpio_ras_fini,
.query_ras_error_count = mca_v3_0_mpio_query_ras_error_count, .query_ras_error_count = mca_v3_0_mpio_query_ras_error_count,
.query_ras_error_address = NULL, .query_ras_error_address = NULL,
.ras_block = AMDGPU_RAS_BLOCK__MPIO, .ras_block = AMDGPU_RAS_BLOCK__MCA,
.ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MPIO,
.sysfs_name = "mpio_err_count", .sysfs_name = "mpio_err_count",
}; };
......
...@@ -387,13 +387,13 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device ...@@ -387,13 +387,13 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
"errors detected in %s block, " "errors detected in %s block, "
"no user action is needed.\n", "no user action is needed.\n",
obj->err_data.ce_count, obj->err_data.ce_count,
ras_block_str(adev->nbio.ras_if->block)); get_ras_block_str(adev->nbio.ras_if));
if (err_data.ue_count) if (err_data.ue_count)
dev_info(adev->dev, "%ld uncorrectable hardware " dev_info(adev->dev, "%ld uncorrectable hardware "
"errors detected in %s block\n", "errors detected in %s block\n",
obj->err_data.ue_count, obj->err_data.ue_count,
ras_block_str(adev->nbio.ras_if->block)); get_ras_block_str(adev->nbio.ras_if));
} }
dev_info(adev->dev, "RAS controller interrupt triggered " dev_info(adev->dev, "RAS controller interrupt triggered "
......
...@@ -73,10 +73,19 @@ enum ta_ras_block { ...@@ -73,10 +73,19 @@ enum ta_ras_block {
TA_RAS_BLOCK__MP0, TA_RAS_BLOCK__MP0,
TA_RAS_BLOCK__MP1, TA_RAS_BLOCK__MP1,
TA_RAS_BLOCK__FUSE, TA_RAS_BLOCK__FUSE,
TA_RAS_BLOCK__MPIO, TA_RAS_BLOCK__MCA,
TA_NUM_BLOCK_MAX TA_NUM_BLOCK_MAX
}; };
/*
 * Sub-block ids for TA_RAS_BLOCK__MCA.
 *
 * The numbering follows the same order as enum amdgpu_ras_mca_block
 * (MP0, MP1, MPIO, IOHC); TA_MCA_NUM_BLOCK_MAX is the number of entries.
 */
enum ta_ras_mca_block {
	TA_RAS_MCA_BLOCK__MP0	= 0,
	TA_RAS_MCA_BLOCK__MP1	= 1,
	TA_RAS_MCA_BLOCK__MPIO	= 2,
	TA_RAS_MCA_BLOCK__IOHC	= 3,
	TA_MCA_NUM_BLOCK_MAX
};
enum ta_ras_error_type { enum ta_ras_error_type {
TA_RAS_ERROR__NONE = 0, TA_RAS_ERROR__NONE = 0,
TA_RAS_ERROR__PARITY = 1, TA_RAS_ERROR__PARITY = 1,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册