提交 6afa66ae 编写于 作者: S Shiju Jose 提交者: Xie XiuQi

net: hns3: process H/W errors occurred before HNS dev initialization

driver inclusion
category: bugfix
bugzilla: NA
CVE: NA

Presently the HNS driver enables the HNS H/W error interrupts after
the dev initialization is completed. However, some exceptions, such as
NCSI errors, can occur when the network port driver is not loaded,
and those errors need to be reported to the BMC.
Therefore the firmware enables all the HNS hw error interrupts
before the driver is loaded. Thus the HNS driver needs to process
and recover from the hw errors that occurred before the HNS driver was
initialized.

This patch adds processing of the HNS hw errors (RAS and MSI-X)
that occurred before the driver initialization.

Feature or Bugfix:Feature
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Weihang Li <liweihang@hisilicon.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: liweihang <liweihang@huawei.com>
Reviewed-by: shenjian <shenjian15@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
上级 f166c573
...@@ -1355,49 +1355,6 @@ int hclge_handle_all_ras_errors(struct hclge_dev *hdev) ...@@ -1355,49 +1355,6 @@ int hclge_handle_all_ras_errors(struct hclge_dev *hdev)
return ret; return ret;
} }
/*
 * hclge_clear_all_ras_errors - query, then clear, the RAS error status
 * @hdev: pointer to struct hclge_dev
 *
 * Performs a query followed by a clear twice: first with the main PF opcode
 * (HCLGE_QUERY_CLEAR_MPF_RAS_INT), then with the PF opcode
 * (HCLGE_QUERY_CLEAR_PF_RAS_INT). The descriptor buffer handed back by
 * hclge_query_bd_num() is reused for both passes and is released with
 * kfree() on every path once it has been obtained.
 *
 * Return: -ENOMEM when no descriptor buffer was obtained; otherwise the
 * value returned by the first step that returned non-zero, or the result of
 * the final clear step.
 */
int hclge_clear_all_ras_errors(struct hclge_dev *hdev)
{
	struct hclge_bd_num bd_num;
	struct hclge_desc *desc;
	int ret;

	/* find out how many descriptors the RAS int status occupies */
	desc = hclge_query_bd_num(hdev, &bd_num,
				  HCLGE_QUERY_RAS_INT_STS_BD_NUM);
	if (!desc)
		return -ENOMEM;

	/* main PF pass: read the RAS errors ... */
	ret = hclge_query_error(hdev, desc, HCLGE_QUERY_CLEAR_MPF_RAS_INT,
				bd_num.mpf_bd_num);
	if (ret)
		goto out_free;

	/* ... and clear them */
	ret = hclge_clear_error(hdev, desc, bd_num.mpf_bd_num);
	if (ret)
		goto out_free;

	/* wipe the buffer before it is reused for the PF pass */
	memset(desc, 0, bd_num.max_bd_num * sizeof(*desc));

	/* PF pass: read the RAS errors ... */
	ret = hclge_query_error(hdev, desc, HCLGE_QUERY_CLEAR_PF_RAS_INT,
				bd_num.pf_bd_num);
	if (ret)
		goto out_free;

	/* ... and clear them; this result is what the caller gets */
	ret = hclge_clear_error(hdev, desc, bd_num.pf_bd_num);

out_free:
	kfree(desc);
	return ret;
}
static int hclge_log_rocee_ovf_error(struct hclge_dev *hdev) static int hclge_log_rocee_ovf_error(struct hclge_dev *hdev)
{ {
struct device *dev = &hdev->pdev->dev; struct device *dev = &hdev->pdev->dev;
...@@ -1501,7 +1458,7 @@ hclge_log_and_clear_rocee_ras_error(struct hclge_dev *hdev) ...@@ -1501,7 +1458,7 @@ hclge_log_and_clear_rocee_ras_error(struct hclge_dev *hdev)
return reset_type; return reset_type;
} }
static int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en) int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en)
{ {
struct device *dev = &hdev->pdev->dev; struct device *dev = &hdev->pdev->dev;
struct hclge_desc desc; struct hclge_desc desc;
...@@ -1576,10 +1533,9 @@ static const struct hclge_hw_blk hw_blk[] = { ...@@ -1576,10 +1533,9 @@ static const struct hclge_hw_blk hw_blk[] = {
{ /* sentinel */ } { /* sentinel */ }
}; };
int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state) int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state)
{ {
const struct hclge_hw_blk *module = hw_blk; const struct hclge_hw_blk *module = hw_blk;
struct device *dev = &hdev->pdev->dev;
int ret = 0; int ret = 0;
while (module->name) { while (module->name) {
...@@ -1591,10 +1547,6 @@ int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state) ...@@ -1591,10 +1547,6 @@ int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state)
module++; module++;
} }
ret = hclge_config_rocee_ras_interrupt(hdev, state);
if (ret)
dev_err(dev, "fail(%d) to configure ROCEE err int\n", ret);
return ret; return ret;
} }
...@@ -1604,6 +1556,12 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev) ...@@ -1604,6 +1556,12 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
struct device *dev = &hdev->pdev->dev; struct device *dev = &hdev->pdev->dev;
u32 status; u32 status;
if (!test_bit(HCLGE_STATE_SERVICE_INITED, &hdev->state)) {
dev_err(dev,
"Can't recover - RAS error reported during dev init\n");
return PCI_ERS_RESULT_NONE;
}
status = hclge_read_dev(&hdev->hw, HCLGE_RAS_PF_OTHER_INT_STS_REG); status = hclge_read_dev(&hdev->hw, HCLGE_RAS_PF_OTHER_INT_STS_REG);
if (status & HCLGE_RAS_REG_NFE_MASK || if (status & HCLGE_RAS_REG_NFE_MASK ||
...@@ -1640,7 +1598,7 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev) ...@@ -1640,7 +1598,7 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
return PCI_ERS_RESULT_RECOVERED; return PCI_ERS_RESULT_RECOVERED;
} }
int hclge_handle_hw_msix_error(struct hclge_dev *hdev, static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev,
unsigned long *reset_requests) unsigned long *reset_requests)
{ {
struct hclge_mac_tnl_stats mac_tnl_stats; struct hclge_mac_tnl_stats mac_tnl_stats;
...@@ -1781,3 +1739,41 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev, ...@@ -1781,3 +1739,41 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
out: out:
return ret; return ret;
} }
/*
 * hclge_handle_hw_msix_error - entry point for MSI-X reported hw errors
 * @hdev: pointer to struct hclge_dev
 * @reset_requests: passed through unchanged to
 *                  hclge_handle_all_hw_msix_error()
 *
 * Forwards to hclge_handle_all_hw_msix_error() only when the
 * HCLGE_STATE_SERVICE_INITED bit is set in hdev->state. Otherwise the
 * event is logged and nothing else is done.
 *
 * Return: the result of hclge_handle_all_hw_msix_error(), or 0 when the
 * HCLGE_STATE_SERVICE_INITED bit is not set.
 */
int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
			       unsigned long *reset_requests)
{
	if (test_bit(HCLGE_STATE_SERVICE_INITED, &hdev->state))
		return hclge_handle_all_hw_msix_error(hdev, reset_requests);

	dev_err(&hdev->pdev->dev,
		"Can't handle - MSIx error reported during dev init\n");

	return 0;
}
/*
 * hclge_handle_all_hns_hw_errors - handle hw errors already pending at init
 * @ae_dev: pointer to struct hnae3_ae_dev; its priv field supplies the
 *          struct hclge_dev used here
 *
 * Resets ae_dev->hw_err_reset_req to 0, then inspects two status registers
 * in turn:
 *  - HCLGE_RAS_PF_OTHER_INT_STS_REG: if any HCLGE_RAS_REG_NFE_MASK bit is
 *    set, warn and call hclge_handle_all_ras_errors().
 *  - HCLGE_VECTOR0_PF_OTHER_INT_STS_REG: if any HCLGE_VECTOR0_REG_MSIX_MASK
 *    bit is set, warn and call hclge_handle_all_hw_msix_error(), passing
 *    &ae_dev->hw_err_reset_req as its reset_requests argument.
 *
 * The return values of both handlers are not examined.
 */
void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev)
{
	struct hclge_dev *hdev = ae_dev->priv;
	struct device *dev = &hdev->pdev->dev;
	u32 ras_status;
	u32 msix_status;

	ae_dev->hw_err_reset_req = 0;

	/* Non-fatal HNS RAS errors */
	ras_status = hclge_read_dev(&hdev->hw,
				    HCLGE_RAS_PF_OTHER_INT_STS_REG);
	if (ras_status & HCLGE_RAS_REG_NFE_MASK) {
		dev_warn(dev, "HNS hw error(RAS) identified during init\n");
		hclge_handle_all_ras_errors(hdev);
	}

	/* HNS hw errors reported through msix; read after the RAS step */
	msix_status = hclge_read_dev(&hdev->hw,
				     HCLGE_VECTOR0_PF_OTHER_INT_STS_REG);
	if (msix_status & HCLGE_VECTOR0_REG_MSIX_MASK) {
		dev_warn(dev, "HNS hw error(MSIx) identified during init\n");
		hclge_handle_all_hw_msix_error(hdev,
					       &ae_dev->hw_err_reset_req);
	}
}
...@@ -150,8 +150,9 @@ extern const struct hclge_hw_error hclge_ssu_port_based_pf_int[]; ...@@ -150,8 +150,9 @@ extern const struct hclge_hw_error hclge_ssu_port_based_pf_int[];
extern const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int[]; extern const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int[];
int hclge_config_mac_tnl_int(struct hclge_dev *hdev, bool en); int hclge_config_mac_tnl_int(struct hclge_dev *hdev, bool en);
int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state); int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state);
int hclge_clear_all_ras_errors(struct hclge_dev *hdev); int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en);
void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev);
pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev); pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev);
int hclge_handle_hw_msix_error(struct hclge_dev *hdev, int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
unsigned long *reset_requests); unsigned long *reset_requests);
......
...@@ -8307,10 +8307,16 @@ static int hclge_init_nic_client_instance(struct hnae3_ae_dev *ae_dev, ...@@ -8307,10 +8307,16 @@ static int hclge_init_nic_client_instance(struct hnae3_ae_dev *ae_dev,
hnae3_set_client_init_flag(client, ae_dev, 1); hnae3_set_client_init_flag(client, ae_dev, 1);
/* Enable nic hw error interrupts */
ret = hclge_config_nic_hw_error(hdev, true);
if (ret)
dev_err(&ae_dev->pdev->dev,
"fail(%d) to enable hw error interrupts\n", ret);
if (netif_msg_drv(&hdev->vport->nic)) if (netif_msg_drv(&hdev->vport->nic))
hclge_info_show(hdev); hclge_info_show(hdev);
return 0; return ret;
} }
static int hclge_init_roce_client_instance(struct hnae3_ae_dev *ae_dev, static int hclge_init_roce_client_instance(struct hnae3_ae_dev *ae_dev,
...@@ -8334,7 +8340,13 @@ static int hclge_init_roce_client_instance(struct hnae3_ae_dev *ae_dev, ...@@ -8334,7 +8340,13 @@ static int hclge_init_roce_client_instance(struct hnae3_ae_dev *ae_dev,
hnae3_set_client_init_flag(client, ae_dev, 1); hnae3_set_client_init_flag(client, ae_dev, 1);
return 0; /* Enable roce ras interrupts */
ret = hclge_config_rocee_ras_interrupt(hdev, true);
if (ret)
dev_err(&ae_dev->pdev->dev,
"fail(%d) to enable roce ras interrupts\n", ret);
return ret;
} }
static int hclge_init_client_instance(struct hnae3_client *client, static int hclge_init_client_instance(struct hnae3_client *client,
...@@ -8686,20 +8698,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) ...@@ -8686,20 +8698,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
goto err_mdiobus_unreg; goto err_mdiobus_unreg;
} }
ret = hclge_hw_error_set_state(hdev, true);
if (ret) {
dev_err(&pdev->dev,
"fail(%d) to enable hw error interrupts\n", ret);
goto err_mdiobus_unreg;
}
ret = hclge_clear_all_ras_errors(hdev);
if (ret) {
dev_err(&pdev->dev,
"fail(%d) to clear all ras states\n", ret);
goto err_mdiobus_unreg;
}
INIT_KFIFO(hdev->mac_tnl_log); INIT_KFIFO(hdev->mac_tnl_log);
hclge_dcb_ops_set(hdev); hclge_dcb_ops_set(hdev);
...@@ -8712,6 +8710,9 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) ...@@ -8712,6 +8710,9 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
hclge_clear_all_event_cause(hdev); hclge_clear_all_event_cause(hdev);
/* Log and clear the hw errors those already occurred */
hclge_handle_all_hns_hw_errors(ae_dev);
/* Enable MISC vector(vector0) */ /* Enable MISC vector(vector0) */
hclge_enable_vector(&hdev->misc_vector, true); hclge_enable_vector(&hdev->misc_vector, true);
...@@ -8826,21 +8827,25 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev) ...@@ -8826,21 +8827,25 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev)
} }
/* Re-enable the hw error interrupts because /* Re-enable the hw error interrupts because
* the interrupts get disabled on core/global reset. * the interrupts get disabled on global reset.
*/ */
ret = hclge_hw_error_set_state(hdev, true); ret = hclge_config_nic_hw_error(hdev, true);
if (ret) { if (ret) {
dev_err(&pdev->dev, dev_err(&pdev->dev,
"fail(%d) to re-enable HNS hw error interrupts\n", ret); "fail(%d) to re-enable NIC hw error interrupts\n",
ret);
return ret; return ret;
} }
ret = hclge_clear_all_ras_errors(hdev); if (hdev->roce_client) {
ret = hclge_config_rocee_ras_interrupt(hdev, true);
if (ret) { if (ret) {
dev_err(&pdev->dev, dev_err(&ae_dev->pdev->dev,
"fail(%d) to clear all ras states\n", ret); "fail(%d) to re-enable roce ras interrupts\n",
ret);
return ret; return ret;
} }
}
hclge_reset_vport_state(hdev); hclge_reset_vport_state(hdev);
...@@ -8866,8 +8871,11 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev) ...@@ -8866,8 +8871,11 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev)
hclge_enable_vector(&hdev->misc_vector, false); hclge_enable_vector(&hdev->misc_vector, false);
synchronize_irq(hdev->misc_vector.vector_irq); synchronize_irq(hdev->misc_vector.vector_irq);
/* Disable all hw interrupts */
hclge_config_mac_tnl_int(hdev, false); hclge_config_mac_tnl_int(hdev, false);
hclge_hw_error_set_state(hdev, false); hclge_config_nic_hw_error(hdev, false);
hclge_config_rocee_ras_interrupt(hdev, false);
hclge_cmd_uninit(hdev); hclge_cmd_uninit(hdev);
hclge_misc_irq_uninit(hdev); hclge_misc_irq_uninit(hdev);
hclge_pci_uninit(hdev); hclge_pci_uninit(hdev);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册