Commit 77bd7415 authored by Linas Vepstas's avatar Linas Vepstas Committed by Paul Mackerras

[PATCH] powerpc: PCI Error Recovery: PPC64 core recovery routines

Various PCI bus errors can be signaled by newer PCI controllers.  The
core error recovery routines are architecture dependent.  This patch adds
a recovery infrastructure for the  PPC64 pSeries systems.
Signed-off-by: default avatarLinas Vepstas <linas@austin.ibm.com>
Signed-off-by: default avatarPaul Mackerras <paulus@samba.org>
(cherry picked from e8ca11b460c4c9c7fa6b529be221529ebd770e38 commit)
parent 97712717
......@@ -4,7 +4,7 @@ obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_IBMVIO) += vio.o
obj-$(CONFIG_XICS) += xics.o
obj-$(CONFIG_SCANLOG) += scanlog.o
obj-$(CONFIG_EEH) += eeh.o eeh_event.o
obj-$(CONFIG_EEH) += eeh.o eeh_driver.o eeh_event.o
obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o
obj-$(CONFIG_HVCS) += hvcserver.o
......@@ -485,6 +485,11 @@ static void __eeh_mark_slot (struct device_node *dn, int mode_flag)
if (PCI_DN(dn)) {
PCI_DN(dn)->eeh_mode |= mode_flag;
/* Mark the pci device driver too */
struct pci_dev *dev = PCI_DN(dn)->pcidev;
if (dev && dev->driver)
dev->error_state = pci_channel_io_frozen;
if (dn->child)
__eeh_mark_slot (dn->child, mode_flag);
}
......@@ -544,6 +549,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
int rets[3];
unsigned long flags;
struct pci_dn *pdn;
enum pci_channel_state state;
int rc = 0;
__get_cpu_var(total_mmio_ffs)++;
......@@ -648,7 +654,12 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
eeh_mark_slot (dn, EEH_MODE_ISOLATED);
spin_unlock_irqrestore(&confirm_error_lock, flags);
eeh_send_failure_event (dn, dev, rets[0], rets[2]);
state = pci_channel_io_normal;
if ((rets[0] == 2) || (rets[0] == 4))
state = pci_channel_io_frozen;
if (rets[0] == 5)
state = pci_channel_io_perm_failure;
eeh_send_failure_event (dn, dev, state, rets[2]);
/* Most EEH events are due to device driver bugs. Having
* a stack trace will help the device-driver authors figure
......@@ -953,8 +964,10 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
* But there are a few cases like display devices that make sense.
*/
enable = 1; /* i.e. we will do checking */
#if 0
if ((*class_code >> 16) == PCI_BASE_CLASS_DISPLAY)
enable = 0;
#endif
if (!enable)
pdn->eeh_mode |= EEH_MODE_NOCHECK;
......
This diff is collapsed.
......@@ -21,6 +21,7 @@
#include <linux/list.h>
#include <linux/pci.h>
#include <asm/eeh_event.h>
#include <asm/ppc-pci.h>
/** Overview:
* EEH error states may be detected within exception handlers;
......@@ -36,31 +37,6 @@ LIST_HEAD(eeh_eventlist);
static void eeh_thread_launcher(void *);
DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL);
/**
* eeh_panic - call panic() for an eeh event that cannot be handled.
* The philosophy of this routine is that it is better to panic and
* halt the OS than it is to risk possible data corruption by
* oblivious device drivers that don't know better.
*
* @dev pci device that had an eeh event
* @reset_state current reset state of the device slot
*/
static void eeh_panic(struct pci_dev *dev, int reset_state)
{
/*
* Since the panic_on_oops sysctl is used to halt the system
* in light of potential corruption, we can use it here.
*/
if (panic_on_oops) {
panic("EEH: MMIO failure (%d) on device:%s\n", reset_state,
pci_name(dev));
}
else {
printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n",
reset_state, pci_name(dev));
}
}
/**
* eeh_event_handler - dispatch EEH events. The detection of a frozen
* slot can occur inside an interrupt, where it can be hard to do
......@@ -82,10 +58,16 @@ static int eeh_event_handler(void * dummy)
spin_lock_irqsave(&eeh_eventlist_lock, flags);
event = NULL;
/* Unqueue the event, get ready to process. */
if (!list_empty(&eeh_eventlist)) {
event = list_entry(eeh_eventlist.next, struct eeh_event, list);
list_del(&event->list);
}
if (event)
eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
if (event == NULL)
break;
......@@ -93,8 +75,11 @@ static int eeh_event_handler(void * dummy)
printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
pci_name(event->dev));
eeh_panic (event->dev, event->state);
handle_eeh_events(event);
eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
pci_dev_put(event->dev);
kfree(event);
}
......@@ -122,7 +107,7 @@ static void eeh_thread_launcher(void *dummy)
*/
int eeh_send_failure_event (struct device_node *dn,
struct pci_dev *dev,
int state,
enum pci_channel_state state,
int time_unavail)
{
unsigned long flags;
......
......@@ -37,6 +37,8 @@ extern int eeh_subsystem_enabled;
#define EEH_MODE_SUPPORTED (1<<0)
#define EEH_MODE_NOCHECK (1<<1)
#define EEH_MODE_ISOLATED (1<<2)
#define EEH_MODE_RECOVERING (1<<3)
#define EEH_MODE_IRQ_DISABLED (1<<4)
/* Max number of EEH freezes allowed before we consider the device
* to be permanently disabled. */
......
......@@ -30,7 +30,7 @@ struct eeh_event {
struct list_head list;
struct device_node *dn; /* struct device node */
struct pci_dev *dev; /* affected device */
int state;
enum pci_channel_state state; /* PCI bus state for the affected device */
int time_unavail; /* milliseconds until device might be available */
};
......@@ -47,8 +47,11 @@ struct eeh_event {
*/
int eeh_send_failure_event (struct device_node *dn,
struct pci_dev *dev,
int reset_state,
enum pci_channel_state state,
int time_unavail);
/* Main recovery function */
void handle_eeh_events (struct eeh_event *);
#endif /* __KERNEL__ */
#endif /* ASM_PPC64_EEH_EVENT_H */
......@@ -52,6 +52,15 @@ extern unsigned long pci_probe_only;
/* ---- EEH internal-use-only related routines ---- */
#ifdef CONFIG_EEH
/**
* eeh_slot_error_detail -- record and EEH error condition to the log
* @severity: 1 if temporary, 2 if permanent failure.
*
* Obtains the the EEH error details from the RTAS subsystem,
* and then logs these details with the RTAS error log system.
*/
void eeh_slot_error_detail (struct pci_dn *pdn, int severity);
/**
* rtas_set_slot_reset -- unfreeze a frozen slot
*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment