diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index dc9b5bc3431bf5f885ea328183c8458687287f62..e564aa32a4142212f6fc49cdd5fd4d502ef91865 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -91,6 +91,7 @@ enum HNAE3_DEV_CAP_BITS { HNAE3_DEV_SUPPORT_STASH_B, HNAE3_DEV_SUPPORT_UDP_TUNNEL_CSUM_B, HNAE3_DEV_SUPPORT_PAUSE_B, + HNAE3_DEV_SUPPORT_RAS_IMP_B, HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B, HNAE3_DEV_SUPPORT_PORT_VLAN_BYPASS_B, HNAE3_DEV_SUPPORT_VLAN_FLTR_MDF_B, @@ -129,6 +130,9 @@ enum HNAE3_DEV_CAP_BITS { #define hnae3_dev_phy_imp_supported(hdev) \ test_bit(HNAE3_DEV_SUPPORT_PHY_IMP_B, (hdev)->ae_dev->caps) +#define hnae3_dev_ras_imp_supported(hdev) \ + test_bit(HNAE3_DEV_SUPPORT_RAS_IMP_B, (hdev)->ae_dev->caps) + #define hnae3_dev_tqp_txrx_indep_supported(hdev) \ test_bit(HNAE3_DEV_SUPPORT_TQP_TXRX_INDEP_B, (hdev)->ae_dev->caps) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index cf1efd2f4a0fc4d8a23c04bce2c5713e641ad7b4..a0edca848392331b6bff78980466233ba892dbe4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -349,6 +349,9 @@ static struct hns3_dbg_cap_info hns3_dbg_cap[] = { }, { .name = "support imp-controlled PHY", .cap_bit = HNAE3_DEV_SUPPORT_PHY_IMP_B, + }, { + .name = "support imp-controlled RAS", + .cap_bit = HNAE3_DEV_SUPPORT_RAS_IMP_B, }, { .name = "support rxd advanced layout", .cap_bit = HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index 8f6ed8577aea2e2f4cdb0d72b1aab2e4b87a6ab5..887297e37cf3399f35e10876cadc5fbfdae2101d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -178,7 +178,8 @@ static bool hclge_is_special_opcode(u16 opcode) HCLGE_QUERY_CLEAR_MPF_RAS_INT, HCLGE_QUERY_CLEAR_PF_RAS_INT, HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT, - HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT}; + HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT, + HCLGE_QUERY_ALL_ERR_INFO}; int i; for (i = 0; i < ARRAY_SIZE(spec_opcode); i++) { @@ -386,6 +387,8 @@ static void hclge_parse_capability(struct hclge_dev *hdev, set_bit(HNAE3_DEV_SUPPORT_PAUSE_B, ae_dev->caps); if (hnae3_get_bit(caps, HCLGE_CAP_PHY_IMP_B)) set_bit(HNAE3_DEV_SUPPORT_PHY_IMP_B, ae_dev->caps); + if (hnae3_get_bit(caps, HCLGE_CAP_RAS_IMP_B)) + set_bit(HNAE3_DEV_SUPPORT_RAS_IMP_B, ae_dev->caps); if (hnae3_get_bit(caps, HCLGE_CAP_RXD_ADV_LAYOUT_B)) set_bit(HNAE3_DEV_SUPPORT_RXD_ADV_LAYOUT_B, ae_dev->caps); if (hnae3_get_bit(caps, HCLGE_CAP_PORT_VLAN_BYPASS_B)) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index da78a6477e46a25094b51c2750cf171d525597be..221811af9473986ad146c9f1864d30da2412668e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -293,6 +293,8 @@ enum hclge_opcode_type { HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513, HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT = 0x1514, HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT = 0x1515, + HCLGE_QUERY_ALL_ERR_BD_NUM = 0x1516, + HCLGE_QUERY_ALL_ERR_INFO = 0x1517, HCLGE_CONFIG_ROCEE_RAS_INT_EN = 0x1580, HCLGE_QUERY_CLEAR_ROCEE_RAS_INT = 0x1581, HCLGE_ROCEE_PF_RAS_INT_CMD = 0x1584, @@ -390,6 +392,7 @@ enum HCLGE_CAP_BITS { HCLGE_CAP_HW_PAD_B, HCLGE_CAP_STASH_B, HCLGE_CAP_UDP_TUNNEL_CSUM_B, + HCLGE_CAP_RAS_IMP_B = 12, HCLGE_CAP_FEC_B = 13, HCLGE_CAP_PAUSE_B = 14, HCLGE_CAP_RXD_ADV_LAYOUT_B = 15, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index f125aa4258729946eba6c6a13e3ec7b9930509a8..bad9fda19398bab5bb664cae7ffb9bc833ef93c8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -631,6 +631,134 @@ static const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int[] = { { /* sentinel */ } }; +static const struct hclge_hw_module_id hclge_hw_module_id_st[] = { + { + .module_id = MODULE_NONE, + .msg = "MODULE_NONE" + }, { + .module_id = MODULE_BIOS_COMMON, + .msg = "MODULE_BIOS_COMMON" + }, { + .module_id = MODULE_GE, + .msg = "MODULE_GE" + }, { + .module_id = MODULE_IGU_EGU, + .msg = "MODULE_IGU_EGU" + }, { + .module_id = MODULE_LGE, + .msg = "MODULE_LGE" + }, { + .module_id = MODULE_NCSI, + .msg = "MODULE_NCSI" + }, { + .module_id = MODULE_PPP, + .msg = "MODULE_PPP" + }, { + .module_id = MODULE_QCN, + .msg = "MODULE_QCN" + }, { + .module_id = MODULE_RCB_RX, + .msg = "MODULE_RCB_RX" + }, { + .module_id = MODULE_RTC, + .msg = "MODULE_RTC" + }, { + .module_id = MODULE_SSU, + .msg = "MODULE_SSU" + }, { + .module_id = MODULE_TM, + .msg = "MODULE_TM" + }, { + .module_id = MODULE_RCB_TX, + .msg = "MODULE_RCB_TX" + }, { + .module_id = MODULE_TXDMA, + .msg = "MODULE_TXDMA" + }, { + .module_id = MODULE_MASTER, + .msg = "MODULE_MASTER" + }, { + .module_id = MODULE_ROCEE_TOP, + .msg = "MODULE_ROCEE_TOP" + }, { + .module_id = MODULE_ROCEE_TIMER, + .msg = "MODULE_ROCEE_TIMER" + }, { + .module_id = MODULE_ROCEE_MDB, + .msg = "MODULE_ROCEE_MDB" + }, { + .module_id = MODULE_ROCEE_TSP, + .msg = "MODULE_ROCEE_TSP" + }, { + .module_id = MODULE_ROCEE_TRP, + .msg = "MODULE_ROCEE_TRP" + }, { + .module_id = MODULE_ROCEE_SCC, + .msg = "MODULE_ROCEE_SCC" + }, { + .module_id = MODULE_ROCEE_CAEP, + .msg = "MODULE_ROCEE_CAEP" + }, { + .module_id = MODULE_ROCEE_GEN_AC, + .msg = "MODULE_ROCEE_GEN_AC" + }, { + .module_id = MODULE_ROCEE_QMM, + .msg = "MODULE_ROCEE_QMM" + }, { + .module_id = MODULE_ROCEE_LSAN, + .msg = "MODULE_ROCEE_LSAN" + } +}; + +static const struct hclge_hw_type_id hclge_hw_type_id_st[] = { + { + .type_id = NONE_ERROR, + .msg = "none_error" + }, { + .type_id = FIFO_ERROR, + .msg = "fifo_error" + }, { + .type_id = MEMORY_ERROR, + .msg = "memory_error" + }, { + .type_id = POISON_ERROR, + .msg = "poison_error" + }, { + .type_id = MSIX_ECC_ERROR, + .msg = "msix_ecc_error" + }, { + .type_id = TQP_INT_ECC_ERROR, + .msg = "tqp_int_ecc_error" + }, { + .type_id = PF_ABNORMAL_INT_ERROR, + .msg = "pf_abnormal_int_error" + }, { + .type_id = MPF_ABNORMAL_INT_ERROR, + .msg = "mpf_abnormal_int_error" + }, { + .type_id = COMMON_ERROR, + .msg = "common_error" + }, { + .type_id = PORT_ERROR, + .msg = "port_error" + }, { + .type_id = ETS_ERROR, + .msg = "ets_error" + }, { + .type_id = NCSI_ERROR, + .msg = "ncsi_error" + }, { + .type_id = GLB_ERROR, + .msg = "glb_error" + }, { + .type_id = ROCEE_NORMAL_ERR, + .msg = "rocee_normal_error" + }, { + .type_id = ROCEE_OVF_ERR, + .msg = "rocee_ovf_error" + } +}; + static void hclge_log_error(struct device *dev, char *reg, const struct hclge_hw_error *err, u32 err_sts, unsigned long *reset_requests) @@ -1611,11 +1739,27 @@ static const struct hclge_hw_blk hw_blk[] = { { /* sentinel */ } }; +static void hclge_config_all_msix_error(struct hclge_dev *hdev, bool enable) +{ + u32 reg_val; + + reg_val = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG); + + if (enable) + reg_val |= BIT(HCLGE_VECTOR0_ALL_MSIX_ERR_B); + else + reg_val &= ~BIT(HCLGE_VECTOR0_ALL_MSIX_ERR_B); + + hclge_write_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG, reg_val); +} + int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state) { const struct hclge_hw_blk *module = hw_blk; int ret = 0; + hclge_config_all_msix_error(hdev, state); + while (module->name) { if (module->config_err_int) { ret = module->config_err_int(hdev, state); @@ -1876,11 +2020,8 @@ static int hclge_handle_pf_msix_error(struct hclge_dev *hdev, static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev, unsigned long *reset_requests) { - struct hclge_mac_tnl_stats mac_tnl_stats; - struct device *dev = &hdev->pdev->dev; u32 mpf_bd_num, pf_bd_num, bd_num; struct hclge_desc *desc; - u32 status; int ret; /* query the number of bds for the MSIx int status */ @@ -1903,29 +2044,7 @@ static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev, if (ret) goto msi_error; - /* query and clear mac tnl interruptions */ - hclge_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_QUERY_MAC_TNL_INT, - true); - ret = hclge_cmd_send(&hdev->hw, &desc[0], 1); - if (ret) { - dev_err(dev, "query mac tnl int cmd failed (%d)\n", ret); - goto msi_error; - } - - status = le32_to_cpu(desc->data[0]); - if (status) { - /* When mac tnl interrupt occurs, we record current time and - * register status here in a fifo, then clear the status. So - * that if link status changes suddenly at some time, we can - * query them by debugfs. - */ - mac_tnl_stats.time = local_clock(); - mac_tnl_stats.status = status; - kfifo_put(&hdev->mac_tnl_log, mac_tnl_stats); - ret = hclge_clear_mac_tnl_int(hdev); - if (ret) - dev_err(dev, "clear mac tnl int failed (%d)\n", ret); - } + ret = hclge_handle_mac_tnl(hdev); msi_error: kfree(desc); @@ -1947,10 +2066,43 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev, return hclge_handle_all_hw_msix_error(hdev, reset_requests); } -void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev) +int hclge_handle_mac_tnl(struct hclge_dev *hdev) { -#define HCLGE_DESC_NO_DATA_LEN 8 + struct hclge_mac_tnl_stats mac_tnl_stats; + struct device *dev = &hdev->pdev->dev; + struct hclge_desc desc; + u32 status; + int ret; + + /* query and clear mac tnl interruptions */ + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_MAC_TNL_INT, true); + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(dev, "failed to query mac tnl int, ret = %d.\n", ret); + return ret; + } + status = le32_to_cpu(desc.data[0]); + if (status) { + /* When mac tnl interrupt occurs, we record current time and + * register status here in a fifo, then clear the status. So + * that if link status changes suddenly at some time, we can + * query them by debugfs. + */ + mac_tnl_stats.time = local_clock(); + mac_tnl_stats.status = status; + kfifo_put(&hdev->mac_tnl_log, mac_tnl_stats); + ret = hclge_clear_mac_tnl_int(hdev); + if (ret) + dev_err(dev, "failed to clear mac tnl int, ret = %d.\n", + ret); + } + + return ret; +} + +void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev) +{ struct hclge_dev *hdev = ae_dev->priv; struct device *dev = &hdev->pdev->dev; u32 mpf_bd_num, pf_bd_num, bd_num; @@ -1999,3 +2151,205 @@ void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev) msi_error: kfree(desc); } + +bool hclge_find_error_source(struct hclge_dev *hdev) +{ + u32 msix_src_flag, hw_err_src_flag; + + msix_src_flag = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS) & + HCLGE_VECTOR0_REG_MSIX_MASK; + + hw_err_src_flag = hclge_read_dev(&hdev->hw, + HCLGE_RAS_PF_OTHER_INT_STS_REG) & + HCLGE_RAS_REG_ERR_MASK; + + return msix_src_flag || hw_err_src_flag; +} + +void hclge_handle_occurred_error(struct hclge_dev *hdev) +{ + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); + + if (hclge_find_error_source(hdev)) + hclge_handle_error_info_log(ae_dev); +} + +static void +hclge_handle_error_type_reg_log(struct device *dev, + struct hclge_mod_err_info *mod_info, + struct hclge_type_reg_err_info *type_reg_info) +{ +#define HCLGE_ERR_TYPE_MASK 0x7F +#define HCLGE_ERR_TYPE_IS_RAS_OFFSET 7 + + u8 mod_id, total_module, type_id, total_type, i, is_ras; + u8 index_module = MODULE_NONE; + u8 index_type = NONE_ERROR; + + mod_id = mod_info->mod_id; + type_id = type_reg_info->type_id & HCLGE_ERR_TYPE_MASK; + is_ras = type_reg_info->type_id >> HCLGE_ERR_TYPE_IS_RAS_OFFSET; + + total_module = ARRAY_SIZE(hclge_hw_module_id_st); + total_type = ARRAY_SIZE(hclge_hw_type_id_st); + + for (i = 0; i < total_module; i++) { + if (mod_id == hclge_hw_module_id_st[i].module_id) { + index_module = i; + break; + } + } + + for (i = 0; i < total_type; i++) { + if (type_id == hclge_hw_type_id_st[i].type_id) { + index_type = i; + break; + } + } + + if (index_module != MODULE_NONE && index_type != NONE_ERROR) + dev_err(dev, + "found %s %s, is %s error.\n", + hclge_hw_module_id_st[index_module].msg, + hclge_hw_type_id_st[index_type].msg, + is_ras ? "ras" : "msix"); + else + dev_err(dev, + "unknown module[%u] or type[%u].\n", mod_id, type_id); + + dev_err(dev, "reg_value:\n"); + for (i = 0; i < type_reg_info->reg_num; i++) + dev_err(dev, "0x%08x\n", type_reg_info->hclge_reg[i]); +} + +static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev, + const u32 *buf, u32 buf_size) +{ + struct hclge_type_reg_err_info *type_reg_info; + struct hclge_dev *hdev = ae_dev->priv; + struct device *dev = &hdev->pdev->dev; + struct hclge_mod_err_info *mod_info; + struct hclge_sum_err_info *sum_info; + u8 mod_num, err_num, i; + u32 offset = 0; + + sum_info = (struct hclge_sum_err_info *)&buf[offset++]; + if (sum_info->reset_type && + sum_info->reset_type != HNAE3_NONE_RESET) + set_bit(sum_info->reset_type, &ae_dev->hw_err_reset_req); + mod_num = sum_info->mod_num; + + while (mod_num--) { + if (offset >= buf_size) { + dev_err(dev, "The offset(%u) exceeds buf's size(%u).\n", + offset, buf_size); + return; + } + mod_info = (struct hclge_mod_err_info *)&buf[offset++]; + err_num = mod_info->err_num; + + for (i = 0; i < err_num; i++) { + if (offset >= buf_size) { + dev_err(dev, + "The offset(%u) exceeds buf size(%u).\n", + offset, buf_size); + return; + } + + type_reg_info = (struct hclge_type_reg_err_info *) + &buf[offset++]; + hclge_handle_error_type_reg_log(dev, mod_info, + type_reg_info); + + offset += type_reg_info->reg_num; + } + } +} + +static int hclge_query_all_err_bd_num(struct hclge_dev *hdev, u32 *bd_num) +{ + struct device *dev = &hdev->pdev->dev; + struct hclge_desc desc_bd; + int ret; + + hclge_cmd_setup_basic_desc(&desc_bd, HCLGE_QUERY_ALL_ERR_BD_NUM, true); + ret = hclge_cmd_send(&hdev->hw, &desc_bd, 1); + if (ret) { + dev_err(dev, "failed to query error bd_num, ret = %d.\n", ret); + return ret; + } + + *bd_num = le32_to_cpu(desc_bd.data[0]); + if (!(*bd_num)) { + dev_err(dev, "The value of bd_num is 0!\n"); + return -EINVAL; + } + + return 0; +} + +static int hclge_query_all_err_info(struct hclge_dev *hdev, + struct hclge_desc *desc, u32 bd_num) +{ + struct device *dev = &hdev->pdev->dev; + int ret; + + hclge_cmd_setup_basic_desc(desc, HCLGE_QUERY_ALL_ERR_INFO, true); + ret = hclge_cmd_send(&hdev->hw, desc, bd_num); + if (ret) + dev_err(dev, "failed to query error info, ret = %d.\n", ret); + + return ret; +} + +int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev) +{ + u32 bd_num, desc_len, buf_len, buf_size, i; + struct hclge_dev *hdev = ae_dev->priv; + struct hclge_desc *desc; + __le32 *desc_data; + u32 *buf; + int ret; + + ret = hclge_query_all_err_bd_num(hdev, &bd_num); + if (ret) + goto out; + + desc_len = bd_num * sizeof(struct hclge_desc); + desc = kzalloc(desc_len, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out; + } + + ret = hclge_query_all_err_info(hdev, desc, bd_num); + if (ret) + goto err_desc; + + buf_len = bd_num * sizeof(struct hclge_desc) - HCLGE_DESC_NO_DATA_LEN; + buf_size = buf_len / sizeof(u32); + + desc_data = kzalloc(buf_len, GFP_KERNEL); + if (!desc_data) + return -ENOMEM; + + buf = kzalloc(buf_len, GFP_KERNEL); + if (!buf) { + ret = -ENOMEM; + goto err_buf_alloc; + } + + memcpy(desc_data, &desc[0].data[0], buf_len); + for (i = 0; i < buf_size; i++) + buf[i] = le32_to_cpu(desc_data[i]); + + hclge_handle_error_module_log(ae_dev, buf, buf_size); + kfree(buf); + +err_buf_alloc: + kfree(desc_data); +err_desc: + kfree(desc); +out: + return ret; +} diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h index d647f3c841345323f586be9075e4ce8a38276a5e..07987fb8332ef27b1b6f99b284dc297fe2d20ec6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h @@ -15,6 +15,8 @@ #define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00 #define HCLGE_RAS_REG_NFE_MASK 0xFF00 #define HCLGE_RAS_REG_ROCEE_ERR_MASK 0x3000000 +#define HCLGE_RAS_REG_ERR_MASK \ + (HCLGE_RAS_REG_NFE_MASK | HCLGE_RAS_REG_ROCEE_ERR_MASK) #define HCLGE_VECTOR0_REG_MSIX_MASK 0x1FF00 @@ -107,6 +109,10 @@ #define HCLGE_ROCEE_OVF_ERR_INT_MASK 0x10000 #define HCLGE_ROCEE_OVF_ERR_TYPE_MASK 0x3F +#define HCLGE_DESC_DATA_MAX 8 +#define HCLGE_REG_NUM_MAX 256 +#define HCLGE_DESC_NO_DATA_LEN 8 + enum hclge_err_int_type { HCLGE_ERR_INT_MSIX = 0, HCLGE_ERR_INT_RAS_CE = 1, @@ -114,6 +120,56 @@ enum hclge_err_int_type { HCLGE_ERR_INT_RAS_FE = 3, }; +enum hclge_mod_name_list { + MODULE_NONE = 0, + MODULE_BIOS_COMMON = 1, + MODULE_GE = 2, + MODULE_IGU_EGU = 3, + MODULE_LGE = 4, + MODULE_NCSI = 5, + MODULE_PPP = 6, + MODULE_QCN = 7, + MODULE_RCB_RX = 8, + MODULE_RTC = 9, + MODULE_SSU = 10, + MODULE_TM = 11, + MODULE_RCB_TX = 12, + MODULE_TXDMA = 13, + MODULE_MASTER = 14, + /* add new MODULE NAME for NIC here in order */ + MODULE_ROCEE_TOP = 40, + MODULE_ROCEE_TIMER = 41, + MODULE_ROCEE_MDB = 42, + MODULE_ROCEE_TSP = 43, + MODULE_ROCEE_TRP = 44, + MODULE_ROCEE_SCC = 45, + MODULE_ROCEE_CAEP = 46, + MODULE_ROCEE_GEN_AC = 47, + MODULE_ROCEE_QMM = 48, + MODULE_ROCEE_LSAN = 49, + /* add new MODULE NAME for RoCEE here in order */ +}; + +enum hclge_err_type_list { + NONE_ERROR = 0, + FIFO_ERROR = 1, + MEMORY_ERROR = 2, + POISON_ERROR = 3, + MSIX_ECC_ERROR = 4, + TQP_INT_ECC_ERROR = 5, + PF_ABNORMAL_INT_ERROR = 6, + MPF_ABNORMAL_INT_ERROR = 7, + COMMON_ERROR = 8, + PORT_ERROR = 9, + ETS_ERROR = 10, + NCSI_ERROR = 11, + GLB_ERROR = 12, + /* add new ERROR TYPE for NIC here in order */ + ROCEE_NORMAL_ERR = 40, + ROCEE_OVF_ERR = 41, + /* add new ERROR TYPE for ROCEE here in order */ +}; + struct hclge_hw_blk { u32 msk; const char *name; @@ -126,11 +182,44 @@ struct hclge_hw_error { enum hnae3_reset_type reset_level; }; +struct hclge_hw_module_id { + enum hclge_mod_name_list module_id; + const char *msg; +}; + +struct hclge_hw_type_id { + enum hclge_err_type_list type_id; + const char *msg; +}; + +struct hclge_sum_err_info { + u8 reset_type; + u8 mod_num; + u8 rsv[2]; +}; + +struct hclge_mod_err_info { + u8 mod_id; + u8 err_num; + u8 rsv[2]; +}; + +struct hclge_type_reg_err_info { + u8 type_id; + u8 reg_num; + u8 rsv[2]; + u32 hclge_reg[HCLGE_REG_NUM_MAX]; +}; + int hclge_config_mac_tnl_int(struct hclge_dev *hdev, bool en); int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state); int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en); void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev); +bool hclge_find_error_source(struct hclge_dev *hdev); +void hclge_handle_occurred_error(struct hclge_dev *hdev); pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev); int hclge_handle_hw_msix_error(struct hclge_dev *hdev, unsigned long *reset_requests); +int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev); +int hclge_handle_mac_tnl(struct hclge_dev *hdev); #endif diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 45102681bd2a18485c226a2b5c239b10b79ea7fb..d960e08850ae45042acaf1d65cd9493821c24256 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3307,11 +3307,13 @@ static int hclge_set_vf_link_state(struct hnae3_handle *handle, int vf, static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) { - u32 cmdq_src_reg, msix_src_reg; + u32 cmdq_src_reg, msix_src_reg, hw_err_src_reg; /* fetch the events from their corresponding regs */ cmdq_src_reg = hclge_read_dev(&hdev->hw, HCLGE_VECTOR0_CMDQ_SRC_REG); msix_src_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS); + hw_err_src_reg = hclge_read_dev(&hdev->hw, + HCLGE_RAS_PF_OTHER_INT_STS_REG); /* Assumption: If by any chance reset and mailbox events are reported * together then we will only process reset event in this go and will @@ -3339,11 +3341,10 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) return HCLGE_VECTOR0_EVENT_RST; } - /* check for vector0 msix event source */ - if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK) { - *clearval = msix_src_reg; + /* check for vector0 msix event and hardware error event source */ + if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK || + hw_err_src_reg & HCLGE_RAS_REG_ERR_MASK) return HCLGE_VECTOR0_EVENT_ERR; - } /* check for vector0 mailbox(=CMDQ RX) event source */ if (BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B) & cmdq_src_reg) { @@ -3354,9 +3355,8 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) /* print other vector0 event source */ dev_info(&hdev->pdev->dev, - "CMDQ INT status:0x%x, other INT status:0x%x\n", - cmdq_src_reg, msix_src_reg); - *clearval = msix_src_reg; + "INT status: CMDQ(%#x) HW errors(%#x) other(%#x)\n", + cmdq_src_reg, hw_err_src_reg, msix_src_reg); return HCLGE_VECTOR0_EVENT_OTHER; } @@ -3427,15 +3427,10 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data) hclge_clear_event_cause(hdev, event_cause, clearval); - /* Enable interrupt if it is not cause by reset. And when - * clearval equal to 0, it means interrupt status may be - * cleared by hardware before driver reads status register. - * For this case, vector0 interrupt also should be enabled. - */ - if (!clearval || - event_cause == HCLGE_VECTOR0_EVENT_MBX) { + /* Enable interrupt if it is not caused by reset event or error event */ + if (event_cause == HCLGE_VECTOR0_EVENT_MBX || + event_cause == HCLGE_VECTOR0_EVENT_OTHER) hclge_enable_vector(&hdev->misc_vector, true); - } return IRQ_HANDLED; } @@ -4240,6 +4235,38 @@ static void hclge_reset_subtask(struct hclge_dev *hdev) hdev->reset_type = HNAE3_NONE_RESET; } +static void hclge_handle_err_reset_request(struct hclge_dev *hdev) +{ + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); + enum hnae3_reset_type reset_type; + + if (ae_dev->hw_err_reset_req) { + reset_type = hclge_get_reset_level(ae_dev, + &ae_dev->hw_err_reset_req); + hclge_set_def_reset_request(ae_dev, reset_type); + } + + if (hdev->default_reset_request && ae_dev->ops->reset_event) + ae_dev->ops->reset_event(hdev->pdev, NULL); + + /* enable interrupt after error handling complete */ + hclge_enable_vector(&hdev->misc_vector, true); +} + +static void hclge_handle_err_recovery(struct hclge_dev *hdev) +{ + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); + + ae_dev->hw_err_reset_req = 0; + + if (hclge_find_error_source(hdev)) { + hclge_handle_error_info_log(ae_dev); + hclge_handle_mac_tnl(hdev); + } + + hclge_handle_err_reset_request(hdev); +} + static void hclge_misc_err_recovery(struct hclge_dev *hdev) { struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); @@ -4247,19 +4274,16 @@ static void hclge_misc_err_recovery(struct hclge_dev *hdev) u32 msix_sts_reg; msix_sts_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS); - if (msix_sts_reg & HCLGE_VECTOR0_REG_MSIX_MASK) { - if (hclge_handle_hw_msix_error(hdev, - &hdev->default_reset_request)) + if (hclge_handle_hw_msix_error + (hdev, &hdev->default_reset_request)) dev_info(dev, "received msix interrupt 0x%x\n", msix_sts_reg); - - if (hdev->default_reset_request) - if (ae_dev->ops->reset_event) - ae_dev->ops->reset_event(hdev->pdev, NULL); } - hclge_enable_vector(&hdev->misc_vector, true); + hclge_handle_hw_ras_error(ae_dev); + + hclge_handle_err_reset_request(hdev); } static void hclge_errhand_service_task(struct hclge_dev *hdev) @@ -4267,7 +4291,10 @@ static void hclge_errhand_service_task(struct hclge_dev *hdev) if (!test_and_clear_bit(HCLGE_STATE_ERR_SERVICE_SCHED, &hdev->state)) return; - hclge_misc_err_recovery(hdev); + if (hnae3_dev_ras_imp_supported(hdev)) + hclge_handle_err_recovery(hdev); + else + hclge_misc_err_recovery(hdev); } static void hclge_reset_service_task(struct hclge_dev *hdev) @@ -11524,7 +11551,10 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_clear_resetting_state(hdev); /* Log and clear the hw errors those already occurred */ - hclge_handle_all_hns_hw_errors(ae_dev); + if (hnae3_dev_ras_imp_supported(hdev)) + hclge_handle_occurred_error(hdev); + else + hclge_handle_all_hns_hw_errors(ae_dev); /* request delayed reset for the error recovery because an immediate * global reset on a PF affecting pending initialization of other PFs @@ -11877,7 +11907,10 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev) } /* Log and clear the hw errors those already occurred */ - hclge_handle_all_hns_hw_errors(ae_dev); + if (hnae3_dev_ras_imp_supported(hdev)) + hclge_handle_occurred_error(hdev); + else + hclge_handle_all_hns_hw_errors(ae_dev); /* Re-enable the hw error interrupts because * the interrupts get disabled on global reset. diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 9b8abb5d7a8ee2447fa1aefc490a52c0fa5a91fc..582972a6f60e0ed64824db70f333763e74f88233 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -190,6 +190,7 @@ enum HLCGE_PORT_TYPE { #define HCLGE_VECTOR0_IMP_RESET_INT_B 1 #define HCLGE_VECTOR0_IMP_CMDQ_ERR_B 4U #define HCLGE_VECTOR0_IMP_RD_POISON_B 5U +#define HCLGE_VECTOR0_ALL_MSIX_ERR_B 6U #define HCLGE_MAC_DEFAULT_FRAME \ (ETH_HLEN + ETH_FCS_LEN + 2 * VLAN_HLEN + ETH_DATA_LEN)