From bae8eab0e13c2167c751d76a003650012be4e90c Mon Sep 17 00:00:00 2001 From: Weihang Li Date: Thu, 11 Jul 2019 16:12:49 +0800 Subject: [PATCH] net: hns3: add support for handling IMP error driver inclusion category: bugfix bugzilla: NA CVE: NA When the IMP encounters an error, the hardware reports a RAS error to the driver, and the driver records this kind of error. An IMP reset then happens, and the driver checks the reason and takes the corresponding action while handling the IMP reset. So this patch adds an imp_err_state field to struct hclge_dev to record the error type, and a handle_imp_error op to handle it. Feature or Bugfix: Bugfix Signed-off-by: Weihang Li Signed-off-by: Huazhong Tan Reviewed-by: lipeng Reviewed-by: Yang Yingliang Signed-off-by: Yang Yingliang --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 1 + .../hisilicon/hns3/hns3pf/hclge_err.c | 40 ++++++++++++++++--- .../hisilicon/hns3/hns3pf/hclge_err.h | 4 ++ .../hisilicon/hns3/hns3pf/hclge_main.c | 29 ++++++++++++++ .../hisilicon/hns3/hns3pf/hclge_main.h | 9 +++++ 5 files changed, 78 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 45ab998ae55f..8fc488e6db01 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -525,6 +525,7 @@ struct hnae3_ae_ops { void (*mac_disconnect_phy)(struct hnae3_handle *handle); bool (*reset_done)(struct hnae3_handle *handle, bool done); void (*restore_vlan_table)(struct hnae3_handle *handle); + void (*handle_imp_error)(struct hnae3_handle *handle); #ifdef CONFIG_HNS3_TEST int (*send_cmdq)(struct hnae3_handle *handle, void *data, int num); int (*test_cmdq)(struct hnae3_handle *handle, void *data, int *len); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index 9cdefedbfc72..32d27803b281 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -704,6 +704,28 @@ static int hclge_cmd_query_error(struct hclge_dev *hdev, return ret; } +static int hclge_check_imp_poison_err(struct hclge_dev *hdev) +{ + struct device *dev = &hdev->pdev->dev; + int ras_status; + int ret = false; + + ras_status = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG); + if (ras_status & HCLGE_RAS_IMP_RD_POISON_MASK) { + set_bit(HCLGE_IMP_RD_POISON, &hdev->imp_err_state); + /* This error will be handle by IMP reset */ + dev_info(dev, "IMP RD poison detected!\n"); + ret = true; + } else if (ras_status & HCLGE_RAS_IMP_CMDQ_ERR_MASK) { + set_bit(HCLGE_IMP_CMDQ_ERROR, &hdev->imp_err_state); + /* This error will be handle by IMP reset */ + dev_info(dev, "IMP CMDQ error detected!\n"); + ret = true; + } + + return ret; +} + static int hclge_clear_mac_tnl_int(struct hclge_dev *hdev) { struct hclge_desc desc; @@ -1120,9 +1142,6 @@ static int hclge_handle_mpf_ras_error(struct hclge_dev *hdev, &hclge_cmdq_nic_mem_ecc_int[0], status, &ae_dev->hw_err_reset_req); - if ((le32_to_cpu(desc[0].data[2])) & BIT(0)) - dev_warn(dev, "imp_rd_data_poison_err found\n"); - status = le32_to_cpu(desc[0].data[3]); if (status) hclge_log_error(dev, "TQP_INT_ECC_INT_STS", @@ -1295,10 +1314,12 @@ static int hclge_handle_pf_ras_error(struct hclge_dev *hdev, /* log PPU(RCB) errors */ desc_data = (__le32 *)&desc[3]; status = le32_to_cpu(*desc_data) & HCLGE_PPU_PF_INT_RAS_MASK; - if (status) + if (status) { hclge_log_error(dev, "PPU_PF_ABNORMAL_INT_ST0", &hclge_ppu_pf_abnormal_int[0], status, &ae_dev->hw_err_reset_req); + hdev->ppu_poison_ras_err = true; + } /* clear all PF RAS errors */ hclge_cmd_reuse_desc(&desc[0], false); @@ -1603,6 +1624,7 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev) struct hclge_dev *hdev = ae_dev->priv; struct device *dev = &hdev->pdev->dev; u32 status; + int ret; if (!test_bit(HCLGE_STATE_SERVICE_INITED, &hdev->state)) { dev_err(dev, @@ -1610,6 +1632,9 @@ 
pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev) return PCI_ERS_RESULT_NONE; } + if (hclge_check_imp_poison_err(hdev)) + return PCI_ERS_RESULT_RECOVERED; + status = hclge_read_dev(&hdev->hw, HCLGE_RAS_PF_OTHER_INT_STS_REG); if (status & HCLGE_RAS_REG_NFE_MASK || @@ -1621,7 +1646,12 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev) dev_warn(dev, "HNS Non-Fatal RAS error(status=0x%x) identified\n", status); - hclge_handle_all_ras_errors(hdev); + ret = hclge_handle_all_ras_errors(hdev); + if (ret) { + ret = hclge_check_imp_poison_err(hdev); + if (ret) + return PCI_ERS_RESULT_RECOVERED; + } } else { if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) || hdev->pdev->revision < 0x21) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h index 14443c0f628c..94ccb899274f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h @@ -7,6 +7,10 @@ #include "hclge_main.h" #define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00 +#define HCLGE_RAS_IMP_RD_POISON_MASK \ + BIT(HCLGE_VECTOR0_IMP_RD_POISON_B) +#define HCLGE_RAS_IMP_CMDQ_ERR_MASK \ + BIT(HCLGE_VECTOR0_IMP_CMDQ_ERR_B) #define HCLGE_RAS_REG_NFE_MASK 0xFF00 #define HCLGE_RAS_REG_ROCEE_ERR_MASK 0x3000000 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index f4a140ea5752..b5db305161be 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3365,6 +3365,7 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev) { #define HCLGE_RESET_SYNC_TIME 100 + struct hnae3_handle *handle = handle = &hdev->vport[0].nic; u32 reg_val; int ret = 0; @@ -3399,6 +3400,8 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev) hdev->rst_stats.flr_rst_cnt++; break; case HNAE3_IMP_RESET: + if 
(handle && handle->ae_algo->ops->handle_imp_error) + handle->ae_algo->ops->handle_imp_error(handle); reg_val = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG); hclge_write_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG, BIT(HCLGE_VECTOR0_IMP_RESET_INT_B) | reg_val); @@ -3638,6 +3641,9 @@ static void hclge_reset_event(struct pci_dev *pdev, struct hnae3_handle *handle) else if (time_after(jiffies, (hdev->last_reset_time + 4 * 5 * HZ))) hdev->reset_level = HNAE3_FUNC_RESET; + if (hdev->ppu_poison_ras_err) + hdev->ppu_poison_ras_err = false; + dev_info(&hdev->pdev->dev, "received reset event , reset type is %d", hdev->reset_level); @@ -3677,6 +3683,27 @@ bool hclge_reset_done(struct hnae3_handle *handle, bool done) return done; } +void hclge_handle_imp_error(struct hnae3_handle *handle) +{ + struct hclge_vport *vport = hclge_get_vport(handle); + struct hclge_dev *hdev = vport->back; + u32 reg_val; + + if (test_and_clear_bit(HCLGE_IMP_RD_POISON, &hdev->imp_err_state)) { + dev_err(&hdev->pdev->dev, "Detected IMP RD poison!\n"); + reg_val = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG) & + ~BIT(HCLGE_VECTOR0_IMP_RD_POISON_B); + hclge_write_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG, reg_val); + } + + if (test_and_clear_bit(HCLGE_IMP_CMDQ_ERROR, &hdev->imp_err_state)) { + dev_err(&hdev->pdev->dev, "Detected IMP CMDQ error!\n"); + reg_val = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG) & + ~BIT(HCLGE_VECTOR0_IMP_CMDQ_ERR_B); + hclge_write_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG, reg_val); + } +} + static void hclge_reset_subtask(struct hclge_dev *hdev) { struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); @@ -3692,6 +3719,7 @@ static void hclge_reset_subtask(struct hclge_dev *hdev) */ hdev->last_reset_time = jiffies; hdev->reset_type = hclge_get_reset_level(ae_dev, &hdev->reset_pending); + if (hdev->reset_type != HNAE3_NONE_RESET) hclge_reset(hdev); @@ -10050,6 +10078,7 @@ struct hnae3_ae_ops hclge_ops = { .mac_disconnect_phy = hclge_mac_disconnect_phy, .restore_vlan_table 
= hclge_restore_vlan_table, .reset_done = hclge_reset_done, + .handle_imp_error = hclge_handle_imp_error, }; struct hnae3_ae_algo ae_algo = { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 0c260fe079cd..51edc5104164 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -181,6 +181,8 @@ enum HLCGE_PORT_TYPE { #define HCLGE_VECTOR0_RX_CMDQ_INT_B 1 #define HCLGE_VECTOR0_IMP_RESET_INT_B 1 +#define HCLGE_VECTOR0_IMP_RD_POISON_B 4 +#define HCLGE_VECTOR0_IMP_CMDQ_ERR_B 5 #define HCLGE_MAC_DEFAULT_FRAME \ (ETH_HLEN + ETH_FCS_LEN + 2 * VLAN_HLEN + ETH_DATA_LEN) @@ -687,6 +689,11 @@ enum HCLGE_MAC_ADDR_TYPE { HCLGE_MAC_ADDR_MC }; +enum HCLGE_IMP_ERR_TYPE { + HCLGE_IMP_RD_POISON, + HCLGE_IMP_CMDQ_ERROR, +}; + struct hclge_vport_vlan_cfg { struct list_head node; int hd_tbl_status; @@ -776,6 +783,8 @@ struct hclge_dev { u8 tc_num_last_time; enum hclge_fc_mode fc_mode_last_time; bool support_sfp_query; + bool ppu_poison_ras_err; + unsigned long imp_err_state; #define HCLGE_FLAG_TC_BASE_SCH_MODE 1 #define HCLGE_FLAG_VNET_BASE_SCH_MODE 2 -- GitLab