[PATCH] powerpc: PCI Error Recovery: PPC64 core recovery routines

Various PCI bus errors can be signaled by newer PCI controllers. The core error recovery routines are architecture dependent. This patch adds a recovery infrastructure for the PPC64 pSeries systems. Signed-off-by: Linas Vepstas <linas@austin.ibm.com> Signed-off-by: Paul Mackerras <paulus@samba.org> (cherry picked from e8ca11b460c4c9c7fa6b529be221529ebd770e38 commit)

[PATCH] powerpc: PCI Error Recovery: PPC64 core recovery routines
Various PCI bus errors can be signaled by newer PCI controllers. The core error recovery routines are architecture dependent. This patch adds a recovery infrastructure for the PPC64 pSeries systems. Signed-off-by: Linas Vepstas <linas@austin.ibm.com> Signed-off-by: Paul Mackerras <paulus@samba.org> (cherry picked from e8ca11b460c4c9c7fa6b529be221529ebd770e38 commit)
77bd7415 · Linas Vepstas · Paul Mackerras · 97712717 · 77bd7415 · 77bd7415
Commit 77bd7415 authored Nov 03, 2005 by Linas Vepstas Committed by Paul Mackerras Jan 10, 2006
7 changed files
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_IBMVIO)	+= vio.o
 obj-$(CONFIG_XICS)	+= xics.o
 obj-$(CONFIG_SCANLOG)	+= scanlog.o
-obj-$(CONFIG_EEH)	+= eeh.o eeh_event.o
+obj-$(CONFIG_EEH)	+= eeh.o eeh_driver.o eeh_event.o

 obj-$(CONFIG_HVC_CONSOLE)	+= hvconsole.o
 obj-$(CONFIG_HVCS)		+= hvcserver.o
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -485,6 +485,11 @@ static void __eeh_mark_slot (struct device_node *dn, int mode_flag)
 		if (PCI_DN(dn)) {
 			PCI_DN(dn)->eeh_mode |= mode_flag;

+			/* Mark the pci device driver too */
+			struct pci_dev *dev = PCI_DN(dn)->pcidev;
+			if (dev && dev->driver)
+				dev->error_state = pci_channel_io_frozen;
+
 			if (dn->child)
 				__eeh_mark_slot (dn->child, mode_flag);
 		}
@@ -544,6 +549,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	int rets[3];
 	unsigned long flags;
 	struct pci_dn *pdn;
+	enum pci_channel_state state;
 	int rc = 0;

 	__get_cpu_var(total_mmio_ffs)++;
@@ -648,7 +654,12 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	eeh_mark_slot (dn, EEH_MODE_ISOLATED);
 	spin_unlock_irqrestore(&confirm_error_lock, flags);

-	eeh_send_failure_event (dn, dev, rets[0], rets[2]);
+	state = pci_channel_io_normal;
+	if ((rets[0] == 2) || (rets[0] == 4))
+		state = pci_channel_io_frozen;
+	if (rets[0] == 5)
+		state = pci_channel_io_perm_failure;
+	eeh_send_failure_event (dn, dev, state, rets[2]);

 	/* Most EEH events are due to device driver bugs.  Having
 	 * a stack trace will help the device-driver authors figure
@@ -953,8 +964,10 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
 	 * But there are a few cases like display devices that make sense.
 	 */
 	enable = 1;	/* i.e. we will do checking */
+#if 0
 	if ((*class_code >> 16) == PCI_BASE_CLASS_DISPLAY)
 		enable = 0;
+#endif

 	if (!enable)
 		pdn->eeh_mode |= EEH_MODE_NOCHECK;

--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -21,6 +21,7 @@
 #include <linux/list.h>
 #include <linux/pci.h>
 #include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>

 /** Overview:
 *  EEH error states may be detected within exception handlers;
@@ -36,31 +37,6 @@ LIST_HEAD(eeh_eventlist);
 static void eeh_thread_launcher(void *);
 DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL);

-/**
- * eeh_panic - call panic() for an eeh event that cannot be handled.
- * The philosophy of this routine is that it is better to panic and
- * halt the OS than it is to risk possible data corruption by
- * oblivious device drivers that don't know better.
- *
- * @dev pci device that had an eeh event
- * @reset_state current reset state of the device slot
- */
-static void eeh_panic(struct pci_dev *dev, int reset_state)
-{
-	/*
-	 * Since the panic_on_oops sysctl is used to halt the system
-	 * in light of potential corruption, we can use it here.
-	 */
-	if (panic_on_oops) {
-		panic("EEH: MMIO failure (%d) on device:%s\n", reset_state,
-		      pci_name(dev));
-	}
-	else {
-		printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n",
-		       reset_state, pci_name(dev));
-	}
-}
-
 /**
 * eeh_event_handler - dispatch EEH events.  The detection of a frozen
 * slot can occur inside an interrupt, where it can be hard to do
@@ -82,10 +58,16 @@ static int eeh_event_handler(void * dummy)

 		spin_lock_irqsave(&eeh_eventlist_lock, flags);
 		event = NULL;
+
+		/* Unqueue the event, get ready to process. */
 		if (!list_empty(&eeh_eventlist)) {
 			event = list_entry(eeh_eventlist.next, struct eeh_event, list);
 			list_del(&event->list);
 		}
+		
+		if (event)
+			eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
+
 		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
 		if (event == NULL)
 			break;
@@ -93,8 +75,11 @@ static int eeh_event_handler(void * dummy)
 		printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
 		       pci_name(event->dev));

-		eeh_panic (event->dev, event->state);
+		handle_eeh_events(event);
+
+		eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);

+		pci_dev_put(event->dev);
 		kfree(event);
 	}

@@ -122,7 +107,7 @@ static void eeh_thread_launcher(void *dummy)
 */
 int eeh_send_failure_event (struct device_node *dn,
                            struct pci_dev *dev,
-                            int state,
+                            enum pci_channel_state state,
                            int time_unavail)
 {
 	unsigned long flags;

--- a/include/asm-powerpc/eeh.h
+++ b/include/asm-powerpc/eeh.h
@@ -37,6 +37,8 @@ extern int eeh_subsystem_enabled;
 #define EEH_MODE_SUPPORTED     (1<<0)
 #define EEH_MODE_NOCHECK       (1<<1)
 #define EEH_MODE_ISOLATED      (1<<2)
+#define EEH_MODE_RECOVERING    (1<<3)
+#define EEH_MODE_IRQ_DISABLED  (1<<4)

 /* Max number of EEH freezes allowed before we consider the device
 * to be permanently disabled. */

--- a/include/asm-powerpc/eeh_event.h
+++ b/include/asm-powerpc/eeh_event.h
@@ -30,7 +30,7 @@ struct eeh_event {
 	struct list_head     list;
 	struct device_node 	*dn;   /* struct device node */
 	struct pci_dev       *dev;  /* affected device */
-	int                  state;
+	enum pci_channel_state state; /* PCI bus state for the affected device */
 	int time_unavail;    /* milliseconds until device might be available */
 };

@@ -47,8 +47,11 @@ struct eeh_event {
 */
 int eeh_send_failure_event (struct device_node *dn,
                            struct pci_dev *dev,
-                            int reset_state,
+                            enum pci_channel_state state,
                            int time_unavail);

+/* Main recovery function */
+void handle_eeh_events (struct eeh_event *);
+
 #endif /* __KERNEL__ */
 #endif /* ASM_PPC64_EEH_EVENT_H */
--- a/include/asm-powerpc/ppc-pci.h
+++ b/include/asm-powerpc/ppc-pci.h
@@ -52,6 +52,15 @@ extern unsigned long pci_probe_only;

 /* ---- EEH internal-use-only related routines ---- */
 #ifdef CONFIG_EEH
+/**
+ * eeh_slot_error_detail -- record and EEH error condition to the log
+ * @severity: 1 if temporary, 2 if permanent failure.
+ *
+ * Obtains the the EEH error details from the RTAS subsystem,
+ * and then logs these details with the RTAS error log system.
+ */
+void eeh_slot_error_detail (struct pci_dn *pdn, int severity);
+
 /**
 * rtas_set_slot_reset -- unfreeze a frozen slot
 *