Commit 77bd7415 authored by Linas Vepstas, committed by Paul Mackerras

[PATCH] powerpc: PCI Error Recovery: PPC64 core recovery routines

Various PCI bus errors can be signaled by newer PCI controllers.  The
core error recovery routines are architecture dependent.  This patch adds
a recovery infrastructure for the PPC64 pSeries systems.
Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
(cherry picked from e8ca11b460c4c9c7fa6b529be221529ebd770e38 commit)
parent 97712717
...@@ -4,7 +4,7 @@ obj-$(CONFIG_SMP) += smp.o ...@@ -4,7 +4,7 @@ obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMVIO) += vio.o
obj-$(CONFIG_XICS) += xics.o obj-$(CONFIG_XICS) += xics.o
obj-$(CONFIG_SCANLOG) += scanlog.o obj-$(CONFIG_SCANLOG) += scanlog.o
obj-$(CONFIG_EEH) += eeh.o eeh_event.o obj-$(CONFIG_EEH) += eeh.o eeh_driver.o eeh_event.o
obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o
obj-$(CONFIG_HVCS) += hvcserver.o obj-$(CONFIG_HVCS) += hvcserver.o
...@@ -485,6 +485,11 @@ static void __eeh_mark_slot (struct device_node *dn, int mode_flag) ...@@ -485,6 +485,11 @@ static void __eeh_mark_slot (struct device_node *dn, int mode_flag)
if (PCI_DN(dn)) { if (PCI_DN(dn)) {
PCI_DN(dn)->eeh_mode |= mode_flag; PCI_DN(dn)->eeh_mode |= mode_flag;
/* Mark the pci device driver too */
struct pci_dev *dev = PCI_DN(dn)->pcidev;
if (dev && dev->driver)
dev->error_state = pci_channel_io_frozen;
if (dn->child) if (dn->child)
__eeh_mark_slot (dn->child, mode_flag); __eeh_mark_slot (dn->child, mode_flag);
} }
...@@ -544,6 +549,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) ...@@ -544,6 +549,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
int rets[3]; int rets[3];
unsigned long flags; unsigned long flags;
struct pci_dn *pdn; struct pci_dn *pdn;
enum pci_channel_state state;
int rc = 0; int rc = 0;
__get_cpu_var(total_mmio_ffs)++; __get_cpu_var(total_mmio_ffs)++;
...@@ -648,8 +654,13 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) ...@@ -648,8 +654,13 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
eeh_mark_slot (dn, EEH_MODE_ISOLATED); eeh_mark_slot (dn, EEH_MODE_ISOLATED);
spin_unlock_irqrestore(&confirm_error_lock, flags); spin_unlock_irqrestore(&confirm_error_lock, flags);
eeh_send_failure_event (dn, dev, rets[0], rets[2]); state = pci_channel_io_normal;
if ((rets[0] == 2) || (rets[0] == 4))
state = pci_channel_io_frozen;
if (rets[0] == 5)
state = pci_channel_io_perm_failure;
eeh_send_failure_event (dn, dev, state, rets[2]);
/* Most EEH events are due to device driver bugs. Having /* Most EEH events are due to device driver bugs. Having
* a stack trace will help the device-driver authors figure * a stack trace will help the device-driver authors figure
* out what happened. So print that out. */ * out what happened. So print that out. */
...@@ -953,8 +964,10 @@ static void *early_enable_eeh(struct device_node *dn, void *data) ...@@ -953,8 +964,10 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
* But there are a few cases like display devices that make sense. * But there are a few cases like display devices that make sense.
*/ */
enable = 1; /* i.e. we will do checking */ enable = 1; /* i.e. we will do checking */
#if 0
if ((*class_code >> 16) == PCI_BASE_CLASS_DISPLAY) if ((*class_code >> 16) == PCI_BASE_CLASS_DISPLAY)
enable = 0; enable = 0;
#endif
if (!enable) if (!enable)
pdn->eeh_mode |= EEH_MODE_NOCHECK; pdn->eeh_mode |= EEH_MODE_NOCHECK;
......
This diff is collapsed.
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <linux/list.h> #include <linux/list.h>
#include <linux/pci.h> #include <linux/pci.h>
#include <asm/eeh_event.h> #include <asm/eeh_event.h>
#include <asm/ppc-pci.h>
/** Overview: /** Overview:
* EEH error states may be detected within exception handlers; * EEH error states may be detected within exception handlers;
...@@ -36,31 +37,6 @@ LIST_HEAD(eeh_eventlist); ...@@ -36,31 +37,6 @@ LIST_HEAD(eeh_eventlist);
static void eeh_thread_launcher(void *); static void eeh_thread_launcher(void *);
DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL); DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL);
/**
* eeh_panic - call panic() for an eeh event that cannot be handled.
* The philosophy of this routine is that it is better to panic and
* halt the OS than it is to risk possible data corruption by
* oblivious device drivers that don't know better.
*
* @dev pci device that had an eeh event
* @reset_state current reset state of the device slot
*/
static void eeh_panic(struct pci_dev *dev, int reset_state)
{
/*
* Since the panic_on_oops sysctl is used to halt the system
* in light of potential corruption, we can use it here.
*/
if (panic_on_oops) {
panic("EEH: MMIO failure (%d) on device:%s\n", reset_state,
pci_name(dev));
}
else {
printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n",
reset_state, pci_name(dev));
}
}
/** /**
* eeh_event_handler - dispatch EEH events. The detection of a frozen * eeh_event_handler - dispatch EEH events. The detection of a frozen
* slot can occur inside an interrupt, where it can be hard to do * slot can occur inside an interrupt, where it can be hard to do
...@@ -82,10 +58,16 @@ static int eeh_event_handler(void * dummy) ...@@ -82,10 +58,16 @@ static int eeh_event_handler(void * dummy)
spin_lock_irqsave(&eeh_eventlist_lock, flags); spin_lock_irqsave(&eeh_eventlist_lock, flags);
event = NULL; event = NULL;
/* Unqueue the event, get ready to process. */
if (!list_empty(&eeh_eventlist)) { if (!list_empty(&eeh_eventlist)) {
event = list_entry(eeh_eventlist.next, struct eeh_event, list); event = list_entry(eeh_eventlist.next, struct eeh_event, list);
list_del(&event->list); list_del(&event->list);
} }
if (event)
eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
spin_unlock_irqrestore(&eeh_eventlist_lock, flags); spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
if (event == NULL) if (event == NULL)
break; break;
...@@ -93,8 +75,11 @@ static int eeh_event_handler(void * dummy) ...@@ -93,8 +75,11 @@ static int eeh_event_handler(void * dummy)
printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
pci_name(event->dev)); pci_name(event->dev));
eeh_panic (event->dev, event->state); handle_eeh_events(event);
eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
pci_dev_put(event->dev);
kfree(event); kfree(event);
} }
...@@ -122,7 +107,7 @@ static void eeh_thread_launcher(void *dummy) ...@@ -122,7 +107,7 @@ static void eeh_thread_launcher(void *dummy)
*/ */
int eeh_send_failure_event (struct device_node *dn, int eeh_send_failure_event (struct device_node *dn,
struct pci_dev *dev, struct pci_dev *dev,
int state, enum pci_channel_state state,
int time_unavail) int time_unavail)
{ {
unsigned long flags; unsigned long flags;
......
...@@ -34,9 +34,11 @@ struct device_node; ...@@ -34,9 +34,11 @@ struct device_node;
extern int eeh_subsystem_enabled; extern int eeh_subsystem_enabled;
/* Values for eeh_mode bits in device_node */ /* Values for eeh_mode bits in device_node */
#define EEH_MODE_SUPPORTED (1<<0) #define EEH_MODE_SUPPORTED (1<<0)
#define EEH_MODE_NOCHECK (1<<1) #define EEH_MODE_NOCHECK (1<<1)
#define EEH_MODE_ISOLATED (1<<2) #define EEH_MODE_ISOLATED (1<<2)
#define EEH_MODE_RECOVERING (1<<3)
#define EEH_MODE_IRQ_DISABLED (1<<4)
/* Max number of EEH freezes allowed before we consider the device /* Max number of EEH freezes allowed before we consider the device
* to be permanently disabled. */ * to be permanently disabled. */
......
...@@ -30,7 +30,7 @@ struct eeh_event { ...@@ -30,7 +30,7 @@ struct eeh_event {
struct list_head list; struct list_head list;
struct device_node *dn; /* struct device node */ struct device_node *dn; /* struct device node */
struct pci_dev *dev; /* affected device */ struct pci_dev *dev; /* affected device */
int state; enum pci_channel_state state; /* PCI bus state for the affected device */
int time_unavail; /* milliseconds until device might be available */ int time_unavail; /* milliseconds until device might be available */
}; };
...@@ -47,8 +47,11 @@ struct eeh_event { ...@@ -47,8 +47,11 @@ struct eeh_event {
*/ */
int eeh_send_failure_event (struct device_node *dn, int eeh_send_failure_event (struct device_node *dn,
struct pci_dev *dev, struct pci_dev *dev,
int reset_state, enum pci_channel_state state,
int time_unavail); int time_unavail);
/* Main recovery function */
void handle_eeh_events (struct eeh_event *);
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
#endif /* ASM_PPC64_EEH_EVENT_H */ #endif /* ASM_PPC64_EEH_EVENT_H */
...@@ -52,6 +52,15 @@ extern unsigned long pci_probe_only; ...@@ -52,6 +52,15 @@ extern unsigned long pci_probe_only;
/* ---- EEH internal-use-only related routines ---- */ /* ---- EEH internal-use-only related routines ---- */
#ifdef CONFIG_EEH #ifdef CONFIG_EEH
/**
* eeh_slot_error_detail -- record an EEH error condition to the log
* @severity: 1 if temporary, 2 if permanent failure.
*
* Obtains the EEH error details from the RTAS subsystem,
* and then logs these details with the RTAS error log system.
*/
void eeh_slot_error_detail (struct pci_dn *pdn, int severity);
/** /**
* rtas_set_slot_reset -- unfreeze a frozen slot * rtas_set_slot_reset -- unfreeze a frozen slot
* *
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment