diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index d8d24502970f7ba5629674251260eed178107886..b0fa76d0c78ace1708a2bf6f2de15bc63b698f83 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -450,11 +450,16 @@ eeh_slot_availability(struct pci_dn *pdn) if (rc) return rc; if (rets[1] == 0) return -1; /* EEH is not supported */ - if (rets[0] == 0) return 0; /* Oll Korrect */ + if (rets[0] == 0) return 0; /* Oll Korrect */ if (rets[0] == 5) { if (rets[2] == 0) return -1; /* permanently unavailable */ return rets[2]; /* number of millisecs to wait */ } + if (rets[0] == 1) + return 250; + + printk (KERN_ERR "EEH: Slot unavailable: rc=%d, rets=%d %d %d\n", + rc, rets[0], rets[1], rets[2]); return -1; } @@ -501,9 +506,11 @@ rtas_pci_slot_reset(struct pci_dn *pdn, int state) /** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second * dn -- device node to be reset. + * + * Return 0 if success, else a non-zero value. */ -void +int rtas_set_slot_reset(struct pci_dn *pdn) { int i, rc; @@ -533,10 +540,21 @@ rtas_set_slot_reset(struct pci_dn *pdn) * ready to be used; if not, wait for recovery. */ for (i=0; i<10; i++) { rc = eeh_slot_availability (pdn); - if (rc <= 0) break; + if (rc < 0) + printk (KERN_ERR "EEH: failed (%d) to reset slot %s\n", rc, pdn->node->full_name); + if (rc == 0) + return 0; + if (rc < 0) + return -1; msleep (rc+100); } + + rc = eeh_slot_availability (pdn); + if (rc) + printk (KERN_ERR "EEH: timeout resetting slot %s\n", pdn->node->full_name); + + return rc; } /* ------------------------------------------------------- */ diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c index 199a3ce547866e7ec6bc0bcdb722a96807c7a661..242b2923360d08922fb82b3c073e122b3dd3759e 100644 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ b/arch/powerpc/platforms/pseries/eeh_driver.c @@ -200,14 +200,18 @@ static void eeh_report_failure(struct pci_dev *dev, void *userdata) * bus resets can be performed. */ -static void eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) +static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) { + int rc; if (bus) pcibios_remove_pci_devices(bus); /* Reset the pci controller. (Asserts RST#; resets config space). - * Reconfigure bridges and devices */ - rtas_set_slot_reset(pe_dn); + * Reconfigure bridges and devices. Don't try to bring the system + * up if the reset failed for some reason. */ + rc = rtas_set_slot_reset(pe_dn); + if (rc) + return rc; /* Walk over all functions on this device */ rtas_configure_bridge(pe_dn); @@ -223,6 +227,8 @@ static void eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) ssleep (5); pcibios_add_pci_devices(bus); } + + return 0; } /* The longest amount of time to wait for a pci device @@ -235,7 +241,7 @@ void handle_eeh_events (struct eeh_event *event) struct device_node *frozen_dn; struct pci_dn *frozen_pdn; struct pci_bus *frozen_bus; - int perm_failure = 0; + int rc = 0; frozen_dn = find_device_pe(event->dn); frozen_bus = pcibios_find_pci_bus(frozen_dn); @@ -272,7 +278,7 @@ void handle_eeh_events (struct eeh_event *event) frozen_pdn->eeh_freeze_count++; if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) - perm_failure = 1; + goto hard_fail; /* If the reset state is a '5' and the time to reset is 0 (infinity) * or is more then 15 seconds, then mark this as a permanent failure. @@ -280,34 +286,7 @@ void handle_eeh_events (struct eeh_event *event) if ((event->state == pci_channel_io_perm_failure) && ((event->time_unavail <= 0) || (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000))) - { - perm_failure = 1; - } - - /* Log the error with the rtas logger. */ - if (perm_failure) { - /* - * About 90% of all real-life EEH failures in the field - * are due to poorly seated PCI cards. Only 10% or so are - * due to actual, failed cards. - */ - printk(KERN_ERR - "EEH: PCI device %s - %s has failed %d times \n" - "and has been permanently disabled. Please try reseating\n" - "this device or replacing it.\n", - pci_name (frozen_pdn->pcidev), - pcid_name(frozen_pdn->pcidev), - frozen_pdn->eeh_freeze_count); - - eeh_slot_error_detail(frozen_pdn, 2 /* Permanent Error */); - - /* Notify all devices that they're about to go down. */ - pci_walk_bus(frozen_bus, eeh_report_failure, 0); - - /* Shut down the device drivers for good. */ - pcibios_remove_pci_devices(frozen_bus); - return; - } + goto hard_fail; eeh_slot_error_detail(frozen_pdn, 1 /* Temporary Error */); printk(KERN_WARNING @@ -330,24 +309,54 @@ void handle_eeh_events (struct eeh_event *event) * go down willingly, without panicing the system. */ if (result == PCIERR_RESULT_NONE) { - eeh_reset_device(frozen_pdn, frozen_bus); + rc = eeh_reset_device(frozen_pdn, frozen_bus); + if (rc) + goto hard_fail; } /* If any device called out for a reset, then reset the slot */ if (result == PCIERR_RESULT_NEED_RESET) { - eeh_reset_device(frozen_pdn, NULL); + rc = eeh_reset_device(frozen_pdn, NULL); + if (rc) + goto hard_fail; pci_walk_bus(frozen_bus, eeh_report_reset, 0); } /* If all devices reported they can proceed, the re-enable PIO */ if (result == PCIERR_RESULT_CAN_RECOVER) { /* XXX Not supported; we brute-force reset the device */ - eeh_reset_device(frozen_pdn, NULL); + rc = eeh_reset_device(frozen_pdn, NULL); + if (rc) + goto hard_fail; pci_walk_bus(frozen_bus, eeh_report_reset, 0); } /* Tell all device drivers that they can resume operations */ pci_walk_bus(frozen_bus, eeh_report_resume, 0); + + return; + +hard_fail: + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + printk(KERN_ERR + "EEH: PCI device %s - %s has failed %d times \n" + "and has been permanently disabled. Please try reseating\n" + "this device or replacing it.\n", + pci_name (frozen_pdn->pcidev), + pcid_name(frozen_pdn->pcidev), + frozen_pdn->eeh_freeze_count); + + eeh_slot_error_detail(frozen_pdn, 2 /* Permanent Error */); + + /* Notify all devices that they're about to go down. */ + pci_walk_bus(frozen_bus, eeh_report_failure, 0); + + /* Shut down the device drivers for good. */ + pcibios_remove_pci_devices(frozen_bus); } /* ---------- end of file ---------- */ diff --git a/include/asm-powerpc/ppc-pci.h b/include/asm-powerpc/ppc-pci.h index 4820b368bf157894cf93f5d2e979648c4754c0ca..1a2db61694f21468d3502e387b70ecb7e52d0797 100644 --- a/include/asm-powerpc/ppc-pci.h +++ b/include/asm-powerpc/ppc-pci.h @@ -76,8 +76,10 @@ void eeh_slot_error_detail (struct pci_dn *pdn, int severity); * does this by asserting the PCI #RST line for 1/8th of * a second; this routine will sleep while the adapter is * being reset. + * + * Returns a non-zero value if the reset failed. */ -void rtas_set_slot_reset (struct pci_dn *); +int rtas_set_slot_reset (struct pci_dn *); /** * eeh_restore_bars - Restore device configuration info.