From 2e5eda4681f91bce8f65c54c684c79397708fd80 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 20 May 2020 16:35:08 +0300 Subject: [PATCH] habanalabs: PCIe Advanced Error Reporting support driver will now get notified upon any PCI error occurred and will respond according to the severity of the error. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/habanalabs_drv.c | 76 ++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index c6b31e93fb5e..f9067d3ef437 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -11,6 +11,7 @@ #include "habanalabs.h" #include +#include #include #define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team" @@ -408,6 +409,8 @@ static int hl_pci_probe(struct pci_dev *pdev, pci_set_drvdata(pdev, hdev); + pci_enable_pcie_error_reporting(pdev); + rc = hl_device_init(hdev, hl_class); if (rc) { dev_err(&pdev->dev, "Fatal error during habanalabs device init\n"); @@ -440,22 +443,93 @@ static void hl_pci_remove(struct pci_dev *pdev) return; hl_device_fini(hdev); + pci_disable_pcie_error_reporting(pdev); pci_set_drvdata(pdev, NULL); - destroy_hdev(hdev); } +/** + * hl_pci_err_detected - a PCI bus error detected on this device + * + * @pdev: pointer to pci device + * @state: PCI error type + * + * Called by the PCI subsystem whenever a non-correctable + * PCI bus error is detected + */ +static pci_ers_result_t +hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct hl_device *hdev = pci_get_drvdata(pdev); + enum pci_ers_result result; + + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + + case pci_channel_io_frozen: + dev_warn(hdev->dev, "frozen state error detected\n"); + result = PCI_ERS_RESULT_NEED_RESET; + break; + + case pci_channel_io_perm_failure: + dev_warn(hdev->dev, "failure state error detected\n"); + result = PCI_ERS_RESULT_DISCONNECT; + break; + + default: + result = PCI_ERS_RESULT_NONE; + } + + hdev->asic_funcs->halt_engines(hdev, true); + + return result; +} + +/** + * hl_pci_err_resume - resume after a PCI slot reset + * + * @pdev: pointer to pci device + * + */ +static void hl_pci_err_resume(struct pci_dev *pdev) +{ + struct hl_device *hdev = pci_get_drvdata(pdev); + + dev_warn(hdev->dev, "Resuming device after PCI slot reset\n"); + hl_device_resume(hdev); +} + +/** + * hl_pci_err_slot_reset - a PCI slot reset has just happened + * + * @pdev: pointer to pci device + * + * Determine if the driver can recover from the PCI slot reset + */ +static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev) +{ + return PCI_ERS_RESULT_RECOVERED; +} + static const struct dev_pm_ops hl_pm_ops = { .suspend = hl_pmops_suspend, .resume = hl_pmops_resume, }; +static const struct pci_error_handlers hl_pci_err_handler = { + .error_detected = hl_pci_err_detected, + .slot_reset = hl_pci_err_slot_reset, + .resume = hl_pci_err_resume, +}; + static struct pci_driver hl_pci_driver = { .name = HL_NAME, .id_table = ids, .probe = hl_pci_probe, .remove = hl_pci_remove, .driver.pm = &hl_pm_ops, + .err_handler = &hl_pci_err_handler, }; /* -- GitLab