提交 bae8eab0 编写于 作者: W Weihang Li 提交者: Xie XiuQi

net: hns3: add support for handling IMP error

driver inclusion
category: bugfix
bugzilla: NA
CVE: NA

When the IMP encounters an error, the hardware reports a RAS event to the
driver, and the driver records this kind of error. An IMP reset then follows;
while performing the IMP reset, the driver checks the recorded reason and
takes the corresponding action.

So this patch adds the imp_err_state field to struct hclge_dev
to record the error type, and the handle_imp_error op to handle it.

Feature or Bugfix: Bugfix
Signed-off-by: Weihang Li <liweihang@hisilicon.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Reviewed-by: lipeng <lipeng321@huawei.com>
Reviewed-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
上级 90429e50
...@@ -525,6 +525,7 @@ struct hnae3_ae_ops { ...@@ -525,6 +525,7 @@ struct hnae3_ae_ops {
void (*mac_disconnect_phy)(struct hnae3_handle *handle); void (*mac_disconnect_phy)(struct hnae3_handle *handle);
bool (*reset_done)(struct hnae3_handle *handle, bool done); bool (*reset_done)(struct hnae3_handle *handle, bool done);
void (*restore_vlan_table)(struct hnae3_handle *handle); void (*restore_vlan_table)(struct hnae3_handle *handle);
void (*handle_imp_error)(struct hnae3_handle *handle);
#ifdef CONFIG_HNS3_TEST #ifdef CONFIG_HNS3_TEST
int (*send_cmdq)(struct hnae3_handle *handle, void *data, int num); int (*send_cmdq)(struct hnae3_handle *handle, void *data, int num);
int (*test_cmdq)(struct hnae3_handle *handle, void *data, int *len); int (*test_cmdq)(struct hnae3_handle *handle, void *data, int *len);
......
...@@ -704,6 +704,28 @@ static int hclge_cmd_query_error(struct hclge_dev *hdev, ...@@ -704,6 +704,28 @@ static int hclge_cmd_query_error(struct hclge_dev *hdev,
return ret; return ret;
} }
static int hclge_check_imp_poison_err(struct hclge_dev *hdev)
{
struct device *dev = &hdev->pdev->dev;
int ras_status;
int ret = false;
ras_status = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG);
if (ras_status & HCLGE_RAS_IMP_RD_POISON_MASK) {
set_bit(HCLGE_IMP_RD_POISON, &hdev->imp_err_state);
/* This error will be handle by IMP reset */
dev_info(dev, "IMP RD poison detected!\n");
ret = true;
} else if (ras_status & HCLGE_RAS_IMP_CMDQ_ERR_MASK) {
set_bit(HCLGE_IMP_CMDQ_ERROR, &hdev->imp_err_state);
/* This error will be handle by IMP reset */
dev_info(dev, "IMP CMDQ error detected!\n");
ret = true;
}
return ret;
}
static int hclge_clear_mac_tnl_int(struct hclge_dev *hdev) static int hclge_clear_mac_tnl_int(struct hclge_dev *hdev)
{ {
struct hclge_desc desc; struct hclge_desc desc;
...@@ -1120,9 +1142,6 @@ static int hclge_handle_mpf_ras_error(struct hclge_dev *hdev, ...@@ -1120,9 +1142,6 @@ static int hclge_handle_mpf_ras_error(struct hclge_dev *hdev,
&hclge_cmdq_nic_mem_ecc_int[0], status, &hclge_cmdq_nic_mem_ecc_int[0], status,
&ae_dev->hw_err_reset_req); &ae_dev->hw_err_reset_req);
if ((le32_to_cpu(desc[0].data[2])) & BIT(0))
dev_warn(dev, "imp_rd_data_poison_err found\n");
status = le32_to_cpu(desc[0].data[3]); status = le32_to_cpu(desc[0].data[3]);
if (status) if (status)
hclge_log_error(dev, "TQP_INT_ECC_INT_STS", hclge_log_error(dev, "TQP_INT_ECC_INT_STS",
...@@ -1295,10 +1314,12 @@ static int hclge_handle_pf_ras_error(struct hclge_dev *hdev, ...@@ -1295,10 +1314,12 @@ static int hclge_handle_pf_ras_error(struct hclge_dev *hdev,
/* log PPU(RCB) errors */ /* log PPU(RCB) errors */
desc_data = (__le32 *)&desc[3]; desc_data = (__le32 *)&desc[3];
status = le32_to_cpu(*desc_data) & HCLGE_PPU_PF_INT_RAS_MASK; status = le32_to_cpu(*desc_data) & HCLGE_PPU_PF_INT_RAS_MASK;
if (status) if (status) {
hclge_log_error(dev, "PPU_PF_ABNORMAL_INT_ST0", hclge_log_error(dev, "PPU_PF_ABNORMAL_INT_ST0",
&hclge_ppu_pf_abnormal_int[0], status, &hclge_ppu_pf_abnormal_int[0], status,
&ae_dev->hw_err_reset_req); &ae_dev->hw_err_reset_req);
hdev->ppu_poison_ras_err = true;
}
/* clear all PF RAS errors */ /* clear all PF RAS errors */
hclge_cmd_reuse_desc(&desc[0], false); hclge_cmd_reuse_desc(&desc[0], false);
...@@ -1603,6 +1624,7 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev) ...@@ -1603,6 +1624,7 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
struct hclge_dev *hdev = ae_dev->priv; struct hclge_dev *hdev = ae_dev->priv;
struct device *dev = &hdev->pdev->dev; struct device *dev = &hdev->pdev->dev;
u32 status; u32 status;
int ret;
if (!test_bit(HCLGE_STATE_SERVICE_INITED, &hdev->state)) { if (!test_bit(HCLGE_STATE_SERVICE_INITED, &hdev->state)) {
dev_err(dev, dev_err(dev,
...@@ -1610,6 +1632,9 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev) ...@@ -1610,6 +1632,9 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
return PCI_ERS_RESULT_NONE; return PCI_ERS_RESULT_NONE;
} }
if (hclge_check_imp_poison_err(hdev))
return PCI_ERS_RESULT_RECOVERED;
status = hclge_read_dev(&hdev->hw, HCLGE_RAS_PF_OTHER_INT_STS_REG); status = hclge_read_dev(&hdev->hw, HCLGE_RAS_PF_OTHER_INT_STS_REG);
if (status & HCLGE_RAS_REG_NFE_MASK || if (status & HCLGE_RAS_REG_NFE_MASK ||
...@@ -1621,7 +1646,12 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev) ...@@ -1621,7 +1646,12 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
dev_warn(dev, dev_warn(dev,
"HNS Non-Fatal RAS error(status=0x%x) identified\n", "HNS Non-Fatal RAS error(status=0x%x) identified\n",
status); status);
hclge_handle_all_ras_errors(hdev); ret = hclge_handle_all_ras_errors(hdev);
if (ret) {
ret = hclge_check_imp_poison_err(hdev);
if (ret)
return PCI_ERS_RESULT_RECOVERED;
}
} else { } else {
if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) || if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) ||
hdev->pdev->revision < 0x21) { hdev->pdev->revision < 0x21) {
......
...@@ -7,6 +7,10 @@ ...@@ -7,6 +7,10 @@
#include "hclge_main.h" #include "hclge_main.h"
#define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00 #define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00
/* Mask of the IMP RD poison flag, tested against HCLGE_PF_OTHER_INT_REG */
#define HCLGE_RAS_IMP_RD_POISON_MASK \
BIT(HCLGE_VECTOR0_IMP_RD_POISON_B)
/* Mask of the IMP CMDQ error flag, tested against HCLGE_PF_OTHER_INT_REG */
#define HCLGE_RAS_IMP_CMDQ_ERR_MASK \
BIT(HCLGE_VECTOR0_IMP_CMDQ_ERR_B)
#define HCLGE_RAS_REG_NFE_MASK 0xFF00 #define HCLGE_RAS_REG_NFE_MASK 0xFF00
#define HCLGE_RAS_REG_ROCEE_ERR_MASK 0x3000000 #define HCLGE_RAS_REG_ROCEE_ERR_MASK 0x3000000
......
...@@ -3365,6 +3365,7 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev) ...@@ -3365,6 +3365,7 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev)
{ {
#define HCLGE_RESET_SYNC_TIME 100 #define HCLGE_RESET_SYNC_TIME 100
/* NIC handle embedded in vport[0]; used to reach the handle_imp_error op */
struct hnae3_handle *handle = &hdev->vport[0].nic;
u32 reg_val; u32 reg_val;
int ret = 0; int ret = 0;
...@@ -3399,6 +3400,8 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev) ...@@ -3399,6 +3400,8 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev)
hdev->rst_stats.flr_rst_cnt++; hdev->rst_stats.flr_rst_cnt++;
break; break;
case HNAE3_IMP_RESET: case HNAE3_IMP_RESET:
if (handle && handle->ae_algo->ops->handle_imp_error)
handle->ae_algo->ops->handle_imp_error(handle);
reg_val = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG); reg_val = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG);
hclge_write_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG, hclge_write_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG,
BIT(HCLGE_VECTOR0_IMP_RESET_INT_B) | reg_val); BIT(HCLGE_VECTOR0_IMP_RESET_INT_B) | reg_val);
...@@ -3638,6 +3641,9 @@ static void hclge_reset_event(struct pci_dev *pdev, struct hnae3_handle *handle) ...@@ -3638,6 +3641,9 @@ static void hclge_reset_event(struct pci_dev *pdev, struct hnae3_handle *handle)
else if (time_after(jiffies, (hdev->last_reset_time + 4 * 5 * HZ))) else if (time_after(jiffies, (hdev->last_reset_time + 4 * 5 * HZ)))
hdev->reset_level = HNAE3_FUNC_RESET; hdev->reset_level = HNAE3_FUNC_RESET;
if (hdev->ppu_poison_ras_err)
hdev->ppu_poison_ras_err = false;
dev_info(&hdev->pdev->dev, "received reset event , reset type is %d", dev_info(&hdev->pdev->dev, "received reset event , reset type is %d",
hdev->reset_level); hdev->reset_level);
...@@ -3677,6 +3683,27 @@ bool hclge_reset_done(struct hnae3_handle *handle, bool done) ...@@ -3677,6 +3683,27 @@ bool hclge_reset_done(struct hnae3_handle *handle, bool done)
return done; return done;
} }
void hclge_handle_imp_error(struct hnae3_handle *handle)
{
struct hclge_vport *vport = hclge_get_vport(handle);
struct hclge_dev *hdev = vport->back;
u32 reg_val;
if (test_and_clear_bit(HCLGE_IMP_RD_POISON, &hdev->imp_err_state)) {
dev_err(&hdev->pdev->dev, "Detected IMP RD poison!\n");
reg_val = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG) &
~BIT(HCLGE_VECTOR0_IMP_RD_POISON_B);
hclge_write_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG, reg_val);
}
if (test_and_clear_bit(HCLGE_IMP_CMDQ_ERROR, &hdev->imp_err_state)) {
dev_err(&hdev->pdev->dev, "Detected IMP CMDQ error!\n");
reg_val = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG) &
~BIT(HCLGE_VECTOR0_IMP_CMDQ_ERR_B);
hclge_write_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG, reg_val);
}
}
static void hclge_reset_subtask(struct hclge_dev *hdev) static void hclge_reset_subtask(struct hclge_dev *hdev)
{ {
struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
...@@ -3692,6 +3719,7 @@ static void hclge_reset_subtask(struct hclge_dev *hdev) ...@@ -3692,6 +3719,7 @@ static void hclge_reset_subtask(struct hclge_dev *hdev)
*/ */
hdev->last_reset_time = jiffies; hdev->last_reset_time = jiffies;
hdev->reset_type = hclge_get_reset_level(ae_dev, &hdev->reset_pending); hdev->reset_type = hclge_get_reset_level(ae_dev, &hdev->reset_pending);
if (hdev->reset_type != HNAE3_NONE_RESET) if (hdev->reset_type != HNAE3_NONE_RESET)
hclge_reset(hdev); hclge_reset(hdev);
...@@ -10050,6 +10078,7 @@ struct hnae3_ae_ops hclge_ops = { ...@@ -10050,6 +10078,7 @@ struct hnae3_ae_ops hclge_ops = {
.mac_disconnect_phy = hclge_mac_disconnect_phy, .mac_disconnect_phy = hclge_mac_disconnect_phy,
.restore_vlan_table = hclge_restore_vlan_table, .restore_vlan_table = hclge_restore_vlan_table,
.reset_done = hclge_reset_done, .reset_done = hclge_reset_done,
.handle_imp_error = hclge_handle_imp_error,
}; };
struct hnae3_ae_algo ae_algo = { struct hnae3_ae_algo ae_algo = {
......
...@@ -181,6 +181,8 @@ enum HLCGE_PORT_TYPE { ...@@ -181,6 +181,8 @@ enum HLCGE_PORT_TYPE {
#define HCLGE_VECTOR0_RX_CMDQ_INT_B 1 #define HCLGE_VECTOR0_RX_CMDQ_INT_B 1
#define HCLGE_VECTOR0_IMP_RESET_INT_B 1 #define HCLGE_VECTOR0_IMP_RESET_INT_B 1
/* Bit positions of the IMP error flags; the HCLGE_RAS_IMP_*_MASK values are
 * built from these and applied to HCLGE_PF_OTHER_INT_REG.
 */
#define HCLGE_VECTOR0_IMP_RD_POISON_B 4
#define HCLGE_VECTOR0_IMP_CMDQ_ERR_B 5
#define HCLGE_MAC_DEFAULT_FRAME \ #define HCLGE_MAC_DEFAULT_FRAME \
(ETH_HLEN + ETH_FCS_LEN + 2 * VLAN_HLEN + ETH_DATA_LEN) (ETH_HLEN + ETH_FCS_LEN + 2 * VLAN_HLEN + ETH_DATA_LEN)
...@@ -687,6 +689,11 @@ enum HCLGE_MAC_ADDR_TYPE { ...@@ -687,6 +689,11 @@ enum HCLGE_MAC_ADDR_TYPE {
HCLGE_MAC_ADDR_MC HCLGE_MAC_ADDR_MC
}; };
/* IMP error types; each value is used as a bit number in
 * hclge_dev.imp_err_state (set_bit / test_and_clear_bit).
 */
enum HCLGE_IMP_ERR_TYPE {
/* recorded when the IMP RD poison flag is seen in HCLGE_PF_OTHER_INT_REG */
HCLGE_IMP_RD_POISON,
/* recorded when the IMP CMDQ error flag is seen in HCLGE_PF_OTHER_INT_REG */
HCLGE_IMP_CMDQ_ERROR,
};
struct hclge_vport_vlan_cfg { struct hclge_vport_vlan_cfg {
struct list_head node; struct list_head node;
int hd_tbl_status; int hd_tbl_status;
...@@ -776,6 +783,8 @@ struct hclge_dev { ...@@ -776,6 +783,8 @@ struct hclge_dev {
u8 tc_num_last_time; u8 tc_num_last_time;
enum hclge_fc_mode fc_mode_last_time; enum hclge_fc_mode fc_mode_last_time;
bool support_sfp_query; bool support_sfp_query;
bool ppu_poison_ras_err;
unsigned long imp_err_state;
#define HCLGE_FLAG_TC_BASE_SCH_MODE 1 #define HCLGE_FLAG_TC_BASE_SCH_MODE 1
#define HCLGE_FLAG_VNET_BASE_SCH_MODE 2 #define HCLGE_FLAG_VNET_BASE_SCH_MODE 2
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册