提交 ac325acd 编写于 作者: L Linas Vepstas 提交者: Paul Mackerras

[PATCH] powerpc/pseries: clear PCI failure counter if no new failures

The current PCI error recovery system keeps track of the number of PCI card
resets, and refuses to bring a card back up if this number is too large.
The goal of doing this was to avoid an infinite loop of resets if a card is
obviously dead.  However, if the failures are rare, but the machine has a
high uptime, this mechanism might still be triggered; this is too harsh.

This patch avoids this problem by decrementing the fail count after an
hour.  Thus, as long as a pci card BSOD's less than 6 times an hour, it
will continue to be reset indefinitely.  If its failure rate is greater
than that, it will be taken off-line permanently.

This patch is larger than it might otherwise be because it changes
indentation by removing a pointless while-loop.  The while loop is not
needed, as the handler is invoked once for each event (by schedule_work());
the loop is leftover cruft from an earlier implementation.
Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
上级 4bd174fe
...@@ -23,9 +23,8 @@ ...@@ -23,9 +23,8 @@
* *
*/ */
#include <linux/delay.h> #include <linux/delay.h>
#include <linux/irq.h>
#include <linux/interrupt.h> #include <linux/interrupt.h>
#include <linux/notifier.h> #include <linux/irq.h>
#include <linux/pci.h> #include <linux/pci.h>
#include <asm/eeh.h> #include <asm/eeh.h>
#include <asm/eeh_event.h> #include <asm/eeh_event.h>
...@@ -250,7 +249,7 @@ static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) ...@@ -250,7 +249,7 @@ static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)
*/ */
#define MAX_WAIT_FOR_RECOVERY 15 #define MAX_WAIT_FOR_RECOVERY 15
void handle_eeh_events (struct eeh_event *event) struct pci_dn * handle_eeh_events (struct eeh_event *event)
{ {
struct device_node *frozen_dn; struct device_node *frozen_dn;
struct pci_dn *frozen_pdn; struct pci_dn *frozen_pdn;
...@@ -265,7 +264,7 @@ void handle_eeh_events (struct eeh_event *event) ...@@ -265,7 +264,7 @@ void handle_eeh_events (struct eeh_event *event)
if (!frozen_dn) { if (!frozen_dn) {
printk(KERN_ERR "EEH: Error: Cannot find partition endpoint for %s\n", printk(KERN_ERR "EEH: Error: Cannot find partition endpoint for %s\n",
pci_name(event->dev)); pci_name(event->dev));
return; return NULL;
} }
/* There are two different styles for coming up with the PE. /* There are two different styles for coming up with the PE.
...@@ -280,7 +279,7 @@ void handle_eeh_events (struct eeh_event *event) ...@@ -280,7 +279,7 @@ void handle_eeh_events (struct eeh_event *event)
if (!frozen_bus) { if (!frozen_bus) {
printk(KERN_ERR "EEH: Cannot find PCI bus for %s\n", printk(KERN_ERR "EEH: Cannot find PCI bus for %s\n",
frozen_dn->full_name); frozen_dn->full_name);
return; return NULL;
} }
#if 0 #if 0
...@@ -355,7 +354,7 @@ void handle_eeh_events (struct eeh_event *event) ...@@ -355,7 +354,7 @@ void handle_eeh_events (struct eeh_event *event)
/* Tell all device drivers that they can resume operations */ /* Tell all device drivers that they can resume operations */
pci_walk_bus(frozen_bus, eeh_report_resume, NULL); pci_walk_bus(frozen_bus, eeh_report_resume, NULL);
return; return frozen_pdn;
excess_failures: excess_failures:
/* /*
...@@ -384,6 +383,8 @@ void handle_eeh_events (struct eeh_event *event) ...@@ -384,6 +383,8 @@ void handle_eeh_events (struct eeh_event *event)
/* Shut down the device drivers for good. */ /* Shut down the device drivers for good. */
pcibios_remove_pci_devices(frozen_bus); pcibios_remove_pci_devices(frozen_bus);
return NULL;
} }
/* ---------- end of file ---------- */ /* ---------- end of file ---------- */
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
* Copyright (c) 2005 Linas Vepstas <linas@linas.org> * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
*/ */
#include <linux/delay.h>
#include <linux/list.h> #include <linux/list.h>
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/pci.h> #include <linux/pci.h>
...@@ -56,38 +57,43 @@ static int eeh_event_handler(void * dummy) ...@@ -56,38 +57,43 @@ static int eeh_event_handler(void * dummy)
{ {
unsigned long flags; unsigned long flags;
struct eeh_event *event; struct eeh_event *event;
struct pci_dn *pdn;
daemonize ("eehd"); daemonize ("eehd");
set_current_state(TASK_INTERRUPTIBLE);
while (1) { spin_lock_irqsave(&eeh_eventlist_lock, flags);
set_current_state(TASK_INTERRUPTIBLE); event = NULL;
spin_lock_irqsave(&eeh_eventlist_lock, flags); /* Unqueue the event, get ready to process. */
event = NULL; if (!list_empty(&eeh_eventlist)) {
event = list_entry(eeh_eventlist.next, struct eeh_event, list);
list_del(&event->list);
}
spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
/* Unqueue the event, get ready to process. */ if (event == NULL)
if (!list_empty(&eeh_eventlist)) { return 0;
event = list_entry(eeh_eventlist.next, struct eeh_event, list);
list_del(&event->list);
}
spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
if (event == NULL) /* Serialize processing of EEH events */
break; mutex_lock(&eeh_event_mutex);
eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
/* Serialize processing of EEH events */ printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
mutex_lock(&eeh_event_mutex); pci_name(event->dev));
eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", pdn = handle_eeh_events(event);
pci_name(event->dev));
handle_eeh_events(event); eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
pci_dev_put(event->dev);
kfree(event);
mutex_unlock(&eeh_event_mutex);
eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); /* If there are no new errors after an hour, clear the counter. */
pci_dev_put(event->dev); if (pdn && pdn->eeh_freeze_count>0) {
kfree(event); msleep_interruptible (3600*1000);
mutex_unlock(&eeh_event_mutex); if (pdn->eeh_freeze_count>0)
pdn->eeh_freeze_count--;
} }
return 0; return 0;
......
...@@ -18,8 +18,8 @@ ...@@ -18,8 +18,8 @@
* Copyright (c) 2005 Linas Vepstas <linas@linas.org> * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
*/ */
#ifndef ASM_PPC64_EEH_EVENT_H #ifndef ASM_POWERPC_EEH_EVENT_H
#define ASM_PPC64_EEH_EVENT_H #define ASM_POWERPC_EEH_EVENT_H
#ifdef __KERNEL__ #ifdef __KERNEL__
/** EEH event -- structure holding pci controller data that describes /** EEH event -- structure holding pci controller data that describes
...@@ -39,7 +39,7 @@ struct eeh_event { ...@@ -39,7 +39,7 @@ struct eeh_event {
* @dev pci device * @dev pci device
* *
* This routine builds a PCI error event which will be delivered * This routine builds a PCI error event which will be delivered
* to all listeners on the peh_notifier_chain. * to all listeners on the eeh_notifier_chain.
* *
* This routine can be called within an interrupt context; * This routine can be called within an interrupt context;
* the actual event will be delivered in a normal context * the actual event will be delivered in a normal context
...@@ -51,7 +51,7 @@ int eeh_send_failure_event (struct device_node *dn, ...@@ -51,7 +51,7 @@ int eeh_send_failure_event (struct device_node *dn,
int time_unavail); int time_unavail);
/* Main recovery function */ /* Main recovery function */
void handle_eeh_events (struct eeh_event *); struct pci_dn * handle_eeh_events (struct eeh_event *);
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
#endif /* ASM_PPC64_EEH_EVENT_H */ #endif /* ASM_POWERPC_EEH_EVENT_H */
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册