diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 915e947b422d63888dfd65522b7d14a3fa4162ff..9c656fe4983dda7fbb10dd1b195bf12b2808c8e8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -69,16 +69,21 @@ static void poll_catas(unsigned long dev_ptr) struct mlx4_priv *priv = mlx4_priv(dev); if (readl(priv->catas_err.map)) { - dump_err_buf(dev); - - mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); + /* If the device is off-line, we cannot try to recover it */ + if (pci_channel_offline(dev->pdev)) + mod_timer(&priv->catas_err.timer, + round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); + else { + dump_err_buf(dev); + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); - if (internal_err_reset) { - spin_lock(&catas_lock); - list_add(&priv->catas_err.list, &catas_list); - spin_unlock(&catas_lock); + if (internal_err_reset) { + spin_lock(&catas_lock); + list_add(&priv->catas_err.list, &catas_list); + spin_unlock(&catas_lock); - queue_work(mlx4_wq, &catas_work); + queue_work(mlx4_wq, &catas_work); + } } } else mod_timer(&priv->catas_err.timer, @@ -100,6 +105,10 @@ static void catas_reset(struct work_struct *work) list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) { struct pci_dev *pdev = priv->dev.pdev; + /* If the device is off-line, we cannot reset it */ + if (pci_channel_offline(pdev)) + continue; + ret = mlx4_restart_one(priv->dev.pdev); /* 'priv' now is not valid */ if (ret) diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 7e94987d030c4da44885c5aa3dd56daf1d022907..c8fef435302155d32aa8236e0118c2406241c48c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -296,7 +296,12 @@ int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, static int cmd_pending(struct mlx4_dev *dev) { - u32 status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET); + u32 status; + + if (pci_channel_offline(dev->pdev)) + return -EIO; + + status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET); return (status & swab32(1 << HCR_GO_BIT)) || (mlx4_priv(dev)->cmd.toggle == @@ -314,11 +319,29 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, mutex_lock(&cmd->hcr_mutex); + if (pci_channel_offline(dev->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + ret = -EIO; + goto out; + } + end = jiffies; if (event) end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS); while (cmd_pending(dev)) { + if (pci_channel_offline(dev->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + ret = -EIO; + goto out; + } + if (time_after_eq(jiffies, end)) { mlx4_err(dev, "%s:cmd_pending failed\n", __func__); goto out; @@ -431,14 +454,33 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, down(&priv->cmd.poll_sem); + if (pci_channel_offline(dev->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + err = -EIO; + goto out; + } + err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0); if (err) goto out; end = msecs_to_jiffies(timeout) + jiffies; - while (cmd_pending(dev) && time_before(jiffies, end)) + while (cmd_pending(dev) && time_before(jiffies, end)) { + if (pci_channel_offline(dev->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + err = -EIO; + goto out; + } + cond_resched(); + } if (cmd_pending(dev)) { err = -ETIMEDOUT; @@ -532,6 +574,9 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout, int native) { + if (pci_channel_offline(dev->pdev)) + return -EIO; + if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) { if (mlx4_priv(dev)->cmd.use_events) return mlx4_cmd_wait(dev, in_param, out_param, diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 42645166bae2d3f1de36632f059ac5b597d5e365..e717091734d016fdb4e3d5c612a497d12e887457 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -1775,6 +1775,9 @@ static int mlx4_get_ownership(struct mlx4_dev *dev) void __iomem *owner; u32 ret; + if (pci_channel_offline(dev->pdev)) + return -EIO; + owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, MLX4_OWNER_SIZE); if (!owner) { @@ -1791,6 +1794,9 @@ static void mlx4_free_ownership(struct mlx4_dev *dev) { void __iomem *owner; + if (pci_channel_offline(dev->pdev)) + return; + owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, MLX4_OWNER_SIZE); if (!owner) { @@ -2237,11 +2243,33 @@ static DEFINE_PCI_DEVICE_TABLE(mlx4_pci_table) = { MODULE_DEVICE_TABLE(pci, mlx4_pci_table); +static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + mlx4_remove_one(pdev); + + return state == pci_channel_io_perm_failure ? + PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) +{ + int ret = __mlx4_init_one(pdev, NULL); + + return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; +} + +static struct pci_error_handlers mlx4_err_handler = { + .error_detected = mlx4_pci_err_detected, + .slot_reset = mlx4_pci_slot_reset, +}; + static struct pci_driver mlx4_driver = { .name = DRV_NAME, .id_table = mlx4_pci_table, .probe = mlx4_init_one, - .remove = __devexit_p(mlx4_remove_one) + .remove = __devexit_p(mlx4_remove_one), + .err_handler = &mlx4_err_handler, }; static int __init mlx4_verify_params(void)