Commit c0d12172 authored by Dave Jiang's avatar Dave Jiang Committed by Linus Torvalds

drivers/edac: add new nmi rescan

Provides a way for NMI reported errors on x86 to notify the EDAC
subsystem pending ECC errors by writing to a software state variable.

Here's the reworked patch. I added an EDAC stub to the kernel so we can
have variables that are in the kernel even if EDAC is a module. I also
implemented the idea of using the chip driver to select error detection
mode via module parameter and eliminate the kernel compile option.
Please review/test. Thx!

Also, I only made changes to some of the chipset drivers since I am
unfamiliar with the other ones. We can add similar changes as we go.
Signed-off-by: default avatarDave Jiang <djiang@mvista.com>
Signed-off-by: default avatarDouglas Thompson <dougthompson@xmission.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 28f96eea
...@@ -41,6 +41,10 @@ ...@@ -41,6 +41,10 @@
#include <linux/mca.h> #include <linux/mca.h>
#endif #endif
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
#include <asm/processor.h> #include <asm/processor.h>
#include <asm/system.h> #include <asm/system.h>
#include <asm/io.h> #include <asm/io.h>
...@@ -638,6 +642,14 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) ...@@ -638,6 +642,14 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
"CPU %d.\n", reason, smp_processor_id()); "CPU %d.\n", reason, smp_processor_id());
printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
#if defined(CONFIG_EDAC)
if(edac_handler_set()) {
edac_atomic_assert_error();
return;
}
#endif
if (panic_on_unrecovered_nmi) if (panic_on_unrecovered_nmi)
panic("NMI: Not continuing"); panic("NMI: Not continuing");
......
...@@ -34,6 +34,10 @@ ...@@ -34,6 +34,10 @@
#include <linux/bug.h> #include <linux/bug.h>
#include <linux/kdebug.h> #include <linux/kdebug.h>
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
#include <asm/system.h> #include <asm/system.h>
#include <asm/io.h> #include <asm/io.h>
#include <asm/atomic.h> #include <asm/atomic.h>
...@@ -719,6 +723,13 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) ...@@ -719,6 +723,13 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
reason); reason);
printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
#if defined(CONFIG_EDAC)
if(edac_handler_set()) {
edac_atomic_assert_error();
return;
}
#endif
if (panic_on_unrecovered_nmi) if (panic_on_unrecovered_nmi)
panic("NMI: Not continuing"); panic("NMI: Not continuing");
......
...@@ -109,15 +109,4 @@ config EDAC_I5000 ...@@ -109,15 +109,4 @@ config EDAC_I5000
Support for error detection and correction the Intel Support for error detection and correction the Intel
Greekcreek/Blackford chipsets. Greekcreek/Blackford chipsets.
choice
prompt "Error detecting method"
default EDAC_POLL
config EDAC_POLL
bool "Poll for errors"
help
Poll the chipset periodically to detect errors.
endchoice
endif # EDAC endif # EDAC
...@@ -5,9 +5,9 @@ ...@@ -5,9 +5,9 @@
# This file may be distributed under the terms of the # This file may be distributed under the terms of the
# GNU General Public License. # GNU General Public License.
# #
# $Id: Makefile,v 1.4.2.3 2005/07/08 22:05:38 dsp_llnl Exp $
obj-$(CONFIG_EDAC) := edac_stub.o
obj-$(CONFIG_EDAC_MM_EDAC) += edac_core.o obj-$(CONFIG_EDAC_MM_EDAC) += edac_core.o
edac_core-objs := edac_mc.o edac_device.o edac_mc_sysfs.o edac_pci_sysfs.o edac_core-objs := edac_mc.o edac_device.o edac_mc_sysfs.o edac_pci_sysfs.o
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <linux/pci.h> #include <linux/pci.h>
#include <linux/pci_ids.h> #include <linux/pci_ids.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/edac.h>
#include "edac_mc.h" #include "edac_mc.h"
#define E752X_REVISION " Ver: 2.0.1 " __DATE__ #define E752X_REVISION " Ver: 2.0.1 " __DATE__
...@@ -948,6 +949,16 @@ static int e752x_probe1(struct pci_dev *pdev, int dev_idx) ...@@ -948,6 +949,16 @@ static int e752x_probe1(struct pci_dev *pdev, int dev_idx)
debugf0("%s(): mci\n", __func__); debugf0("%s(): mci\n", __func__);
debugf0("Starting Probe1\n"); debugf0("Starting Probe1\n");
/* make sure error reporting method is sane */
switch(edac_op_state) {
case EDAC_OPSTATE_POLL:
case EDAC_OPSTATE_NMI:
break;
default:
edac_op_state = EDAC_OPSTATE_POLL;
break;
}
/* check to see if device 0 function 1 is enabled; if it isn't, we /* check to see if device 0 function 1 is enabled; if it isn't, we
* assume the BIOS has reserved it for a reason and is expecting * assume the BIOS has reserved it for a reason and is expecting
* exclusive access, we take care not to violate that assumption and * exclusive access, we take care not to violate that assumption and
...@@ -1123,4 +1134,5 @@ MODULE_DESCRIPTION("MC support for Intel e752x memory controllers"); ...@@ -1123,4 +1134,5 @@ MODULE_DESCRIPTION("MC support for Intel e752x memory controllers");
module_param(force_function_unhide, int, 0444); module_param(force_function_unhide, int, 0444);
MODULE_PARM_DESC(force_function_unhide, "if BIOS sets Dev0:Fun1 up as hidden:" MODULE_PARM_DESC(force_function_unhide, "if BIOS sets Dev0:Fun1 up as hidden:"
" 1=force unhide and hope BIOS doesn't fight driver for Dev0:Fun1 access"); " 1=force unhide and hope BIOS doesn't fight driver for Dev0:Fun1 access");
module_param(edac_op_state, int, 0444);
MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <linux/pci.h> #include <linux/pci.h>
#include <linux/pci_ids.h> #include <linux/pci_ids.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/edac.h>
#include "edac_mc.h" #include "edac_mc.h"
#define E7XXX_REVISION " Ver: 2.0.1 " __DATE__ #define E7XXX_REVISION " Ver: 2.0.1 " __DATE__
...@@ -419,6 +420,17 @@ static int e7xxx_probe1(struct pci_dev *pdev, int dev_idx) ...@@ -419,6 +420,17 @@ static int e7xxx_probe1(struct pci_dev *pdev, int dev_idx)
struct e7xxx_error_info discard; struct e7xxx_error_info discard;
debugf0("%s(): mci\n", __func__); debugf0("%s(): mci\n", __func__);
/* make sure error reporting method is sane */
switch(edac_op_state) {
case EDAC_OPSTATE_POLL:
case EDAC_OPSTATE_NMI:
break;
default:
edac_op_state = EDAC_OPSTATE_POLL;
break;
}
pci_read_config_dword(pdev, E7XXX_DRC, &drc); pci_read_config_dword(pdev, E7XXX_DRC, &drc);
drc_chan = dual_channel_active(drc, dev_idx); drc_chan = dual_channel_active(drc, dev_idx);
...@@ -565,3 +577,5 @@ MODULE_LICENSE("GPL"); ...@@ -565,3 +577,5 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Linux Networx (http://lnxi.com) Thayne Harbaugh et al\n" MODULE_AUTHOR("Linux Networx (http://lnxi.com) Thayne Harbaugh et al\n"
"Based on.work by Dan Hollis et al"); "Based on.work by Dan Hollis et al");
MODULE_DESCRIPTION("MC support for Intel e7xxx memory controllers"); MODULE_DESCRIPTION("MC support for Intel e7xxx memory controllers");
module_param(edac_op_state, int, 0444);
MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <linux/list.h> #include <linux/list.h>
#include <linux/sysdev.h> #include <linux/sysdev.h>
#include <linux/ctype.h> #include <linux/ctype.h>
#include <linux/edac.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/page.h> #include <asm/page.h>
#include <asm/edac.h> #include <asm/edac.h>
...@@ -241,6 +242,7 @@ static int add_mc_to_global_list (struct mem_ctl_info *mci) ...@@ -241,6 +242,7 @@ static int add_mc_to_global_list (struct mem_ctl_info *mci)
} }
list_add_tail_rcu(&mci->link, insert_before); list_add_tail_rcu(&mci->link, insert_before);
atomic_inc(&edac_handlers);
return 0; return 0;
fail0: fail0:
...@@ -267,6 +269,7 @@ static void complete_mc_list_del(struct rcu_head *head) ...@@ -267,6 +269,7 @@ static void complete_mc_list_del(struct rcu_head *head)
static void del_mc_from_global_list(struct mem_ctl_info *mci) static void del_mc_from_global_list(struct mem_ctl_info *mci)
{ {
atomic_dec(&edac_handlers);
list_del_rcu(&mci->link); list_del_rcu(&mci->link);
init_completion(&mci->complete); init_completion(&mci->complete);
call_rcu(&mci->rcu, complete_mc_list_del); call_rcu(&mci->rcu, complete_mc_list_del);
......
#include <linux/freezer.h> #include <linux/freezer.h>
#include <linux/kthread.h> #include <linux/kthread.h>
#include <linux/edac.h>
#include "edac_mc.h" #include "edac_mc.h"
#include "edac_module.h" #include "edac_module.h"
...@@ -101,6 +102,25 @@ static void do_edac_check(void) ...@@ -101,6 +102,25 @@ static void do_edac_check(void)
edac_pci_do_parity_check(); edac_pci_do_parity_check();
} }
/*
* handler for EDAC to check if NMI type handler has asserted interrupt
*/
static int edac_assert_error_check_and_clear(void)
{
int vreg;
if(edac_op_state == EDAC_OPSTATE_POLL)
return 1;
vreg = atomic_read(&edac_err_assert);
if(vreg) {
atomic_set(&edac_err_assert, 0);
return 1;
}
return 0;
}
/* /*
* Action thread for EDAC to perform the POLL operations * Action thread for EDAC to perform the POLL operations
*/ */
...@@ -109,8 +129,8 @@ static int edac_kernel_thread(void *arg) ...@@ -109,8 +129,8 @@ static int edac_kernel_thread(void *arg)
int msec; int msec;
while (!kthread_should_stop()) { while (!kthread_should_stop()) {
if(edac_assert_error_check_and_clear())
do_edac_check(); do_edac_check();
/* goto sleep for the interval */ /* goto sleep for the interval */
msec = (HZ * edac_get_poll_msec()) / 1000; msec = (HZ * edac_get_poll_msec()) / 1000;
......
/*
* common EDAC components that must be in kernel
*
* Author: Dave Jiang <djiang@mvista.com>
*
* 2007 (c) MontaVista Software, Inc. This file is licensed under
* the terms of the GNU General Public License version 2. This program
* is licensed "as is" without any warranty of any kind, whether express
* or implied.
*
*/
#include <linux/module.h>
#include <linux/edac.h>
#include <asm/atomic.h>
#include <asm/edac.h>
int edac_op_state = EDAC_OPSTATE_INVAL;
EXPORT_SYMBOL(edac_op_state);
atomic_t edac_handlers = ATOMIC_INIT(0);
EXPORT_SYMBOL(edac_handlers);
atomic_t edac_err_assert = ATOMIC_INIT(0);
EXPORT_SYMBOL(edac_err_assert);
inline int edac_handler_set(void)
{
if (edac_op_state == EDAC_OPSTATE_POLL)
return 0;
return atomic_read(&edac_handlers);
}
EXPORT_SYMBOL(edac_handler_set);
/*
* handler for NMI type of interrupts to assert error
*/
inline void edac_atomic_assert_error(void)
{
atomic_set(&edac_err_assert, 1);
}
EXPORT_SYMBOL(edac_atomic_assert_error);
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <linux/pci.h> #include <linux/pci.h>
#include <linux/pci_ids.h> #include <linux/pci_ids.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/edac.h>
#include <asm/mmzone.h> #include <asm/mmzone.h>
#include "edac_mc.h" #include "edac_mc.h"
...@@ -1285,6 +1286,16 @@ static int i5000_probe1(struct pci_dev *pdev, int dev_idx) ...@@ -1285,6 +1286,16 @@ static int i5000_probe1(struct pci_dev *pdev, int dev_idx)
if (PCI_FUNC(pdev->devfn) != 0) if (PCI_FUNC(pdev->devfn) != 0)
return -ENODEV; return -ENODEV;
/* make sure error reporting method is sane */
switch(edac_op_state) {
case EDAC_OPSTATE_POLL:
case EDAC_OPSTATE_NMI:
break;
default:
edac_op_state = EDAC_OPSTATE_POLL;
break;
}
/* Ask the devices for the number of CSROWS and CHANNELS so /* Ask the devices for the number of CSROWS and CHANNELS so
* that we can calculate the memory resources, etc * that we can calculate the memory resources, etc
* *
...@@ -1475,3 +1486,5 @@ MODULE_AUTHOR ...@@ -1475,3 +1486,5 @@ MODULE_AUTHOR
("Linux Networx (http://lnxi.com) Doug Thompson <norsk5@xmission.com>"); ("Linux Networx (http://lnxi.com) Doug Thompson <norsk5@xmission.com>");
MODULE_DESCRIPTION("MC Driver for Intel I5000 memory controllers - " MODULE_DESCRIPTION("MC Driver for Intel I5000 memory controllers - "
I5000_REVISION); I5000_REVISION);
module_param(edac_op_state, int, 0444);
MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
/*
* Generic EDAC defs
*
* Author: Dave Jiang <djiang@mvista.com>
*
* 2006-2007 (c) MontaVista Software, Inc. This file is licensed under
* the terms of the GNU General Public License version 2. This program
* is licensed "as is" without any warranty of any kind, whether express
* or implied.
*
*/
#ifndef _LINUX_EDAC_H_
#define _LINUX_EDAC_H_
#include <asm/atomic.h>
#define EDAC_OPSTATE_INVAL -1
#define EDAC_OPSTATE_POLL 0
#define EDAC_OPSTATE_NMI 1
#define EDAC_OPSTATE_INT 2
extern int edac_op_state;
extern atomic_t edac_handlers;
extern atomic_t edac_err_assert;
extern int edac_handler_set(void);
extern void edac_atomic_assert_error(void);
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment