Commit 612b5a8d authored by Corey Minyard, committed by Linus Torvalds

IPMI: new NMI handling

Convert over to the new NMI handling for getting IPMI watchdog timeouts via an
NMI.  This adds config options to know if there is the ability to receive NMIs
and if it has an NMI post processing call.  Then it modifies the IPMI watchdog
to take advantage of this so that it can know if an NMI comes in.

It also adds testing that the IPMI NMI watchdog works.
Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent fcfa4724
...@@ -584,9 +584,11 @@ The watchdog will panic and start a 120 second reset timeout if it ...@@ -584,9 +584,11 @@ The watchdog will panic and start a 120 second reset timeout if it
gets a pre-action. During a panic or a reboot, the watchdog will gets a pre-action. During a panic or a reboot, the watchdog will
start a 120 timer if it is running to make sure the reboot occurs. start a 120 timer if it is running to make sure the reboot occurs.
Note that if you use the NMI preaction for the watchdog, you MUST Note that if you use the NMI preaction for the watchdog, you MUST NOT
NOT use nmi watchdog mode 1. If you use the NMI watchdog, you use the nmi watchdog. There is no reasonable way to tell if an NMI
must use mode 2. comes from the IPMI controller, so it must assume that if it gets an
otherwise unhandled NMI, it must be from IPMI and it will panic
immediately.
Once you open the watchdog timer, you must write a 'V' character to the Once you open the watchdog timer, you must write a 'V' character to the
device to close it, or the timer will not stop. This is a new semantic device to close it, or the timer will not stop. This is a new semantic
......
...@@ -50,10 +50,19 @@ ...@@ -50,10 +50,19 @@
#include <linux/poll.h> #include <linux/poll.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/ctype.h> #include <linux/ctype.h>
#include <linux/delay.h>
#include <asm/atomic.h> #include <asm/atomic.h>
#ifdef CONFIG_X86_LOCAL_APIC #ifdef CONFIG_X86
#include <asm/apic.h> /* This is ugly, but I've determined that x86 is the only architecture
that can reasonably support the IPMI NMI watchdog timeout at this
time. If another architecture adds this capability somehow, it
will have to be a somewhat different mechanism and I have no idea
how it will work. So in the unlikely event that another
architecture supports this, we can figure out a good generic
mechanism for it at that time. */
#include <asm/kdebug.h>
#define HAVE_DIE_NMI
#endif #endif
#define PFX "IPMI Watchdog: " #define PFX "IPMI Watchdog: "
...@@ -313,6 +322,11 @@ static unsigned char ipmi_version_minor; ...@@ -313,6 +322,11 @@ static unsigned char ipmi_version_minor;
/* If a pretimeout occurs, this is used to allow only one panic to happen. */ /* If a pretimeout occurs, this is used to allow only one panic to happen. */
static atomic_t preop_panic_excl = ATOMIC_INIT(-1); static atomic_t preop_panic_excl = ATOMIC_INIT(-1);
#ifdef HAVE_DIE_NMI
static int testing_nmi;
static int nmi_handler_registered;
#endif
static int ipmi_heartbeat(void); static int ipmi_heartbeat(void);
/* We use a mutex to make sure that only one thing can send a set /* We use a mutex to make sure that only one thing can send a set
...@@ -352,6 +366,9 @@ static int i_ipmi_set_timeout(struct ipmi_smi_msg *smi_msg, ...@@ -352,6 +366,9 @@ static int i_ipmi_set_timeout(struct ipmi_smi_msg *smi_msg,
int hbnow = 0; int hbnow = 0;
/* These can be cleared as we are setting the timeout. */
pretimeout_since_last_heartbeat = 0;
data[0] = 0; data[0] = 0;
WDOG_SET_TIMER_USE(data[0], WDOG_TIMER_USE_SMS_OS); WDOG_SET_TIMER_USE(data[0], WDOG_TIMER_USE_SMS_OS);
...@@ -426,13 +443,12 @@ static int ipmi_set_timeout(int do_heartbeat) ...@@ -426,13 +443,12 @@ static int ipmi_set_timeout(int do_heartbeat)
wait_for_completion(&set_timeout_wait); wait_for_completion(&set_timeout_wait);
mutex_unlock(&set_timeout_lock);
if ((do_heartbeat == IPMI_SET_TIMEOUT_FORCE_HB) if ((do_heartbeat == IPMI_SET_TIMEOUT_FORCE_HB)
|| ((send_heartbeat_now) || ((send_heartbeat_now)
&& (do_heartbeat == IPMI_SET_TIMEOUT_HB_IF_NECESSARY))) && (do_heartbeat == IPMI_SET_TIMEOUT_HB_IF_NECESSARY)))
{
rv = ipmi_heartbeat(); rv = ipmi_heartbeat();
}
mutex_unlock(&set_timeout_lock);
out: out:
return rv; return rv;
...@@ -556,9 +572,8 @@ static int ipmi_heartbeat(void) ...@@ -556,9 +572,8 @@ static int ipmi_heartbeat(void)
int rv; int rv;
struct ipmi_system_interface_addr addr; struct ipmi_system_interface_addr addr;
if (ipmi_ignore_heartbeat) { if (ipmi_ignore_heartbeat)
return 0; return 0;
}
if (ipmi_start_timer_on_heartbeat) { if (ipmi_start_timer_on_heartbeat) {
ipmi_start_timer_on_heartbeat = 0; ipmi_start_timer_on_heartbeat = 0;
...@@ -569,7 +584,6 @@ static int ipmi_heartbeat(void) ...@@ -569,7 +584,6 @@ static int ipmi_heartbeat(void)
We don't want to set the action, though, we want to We don't want to set the action, though, we want to
leave that alone (thus it can't be combined with the leave that alone (thus it can't be combined with the
above operation. */ above operation. */
pretimeout_since_last_heartbeat = 0;
return ipmi_set_timeout(IPMI_SET_TIMEOUT_HB_IF_NECESSARY); return ipmi_set_timeout(IPMI_SET_TIMEOUT_HB_IF_NECESSARY);
} }
...@@ -927,6 +941,45 @@ static void ipmi_register_watchdog(int ipmi_intf) ...@@ -927,6 +941,45 @@ static void ipmi_register_watchdog(int ipmi_intf)
printk(KERN_CRIT PFX "Unable to register misc device\n"); printk(KERN_CRIT PFX "Unable to register misc device\n");
} }
#ifdef HAVE_DIE_NMI
if (nmi_handler_registered) {
int old_pretimeout = pretimeout;
int old_timeout = timeout;
int old_preop_val = preop_val;
/* Set the pretimeout to go off in a second and give
ourselves plenty of time to stop the timer. */
ipmi_watchdog_state = WDOG_TIMEOUT_RESET;
preop_val = WDOG_PREOP_NONE; /* Make sure nothing happens */
pretimeout = 99;
timeout = 100;
testing_nmi = 1;
rv = ipmi_set_timeout(IPMI_SET_TIMEOUT_FORCE_HB);
if (rv) {
printk(KERN_WARNING PFX "Error starting timer to"
" test NMI: 0x%x. The NMI pretimeout will"
" likely not work\n", rv);
rv = 0;
goto out_restore;
}
msleep(1500);
if (testing_nmi != 2) {
printk(KERN_WARNING PFX "IPMI NMI didn't seem to"
" occur. The NMI pretimeout will"
" likely not work\n");
}
out_restore:
testing_nmi = 0;
preop_val = old_preop_val;
pretimeout = old_pretimeout;
timeout = old_timeout;
}
#endif
out: out:
if ((start_now) && (rv == 0)) { if ((start_now) && (rv == 0)) {
/* Run from startup, so start the timer now. */ /* Run from startup, so start the timer now. */
...@@ -934,6 +987,10 @@ static void ipmi_register_watchdog(int ipmi_intf) ...@@ -934,6 +987,10 @@ static void ipmi_register_watchdog(int ipmi_intf)
ipmi_watchdog_state = action_val; ipmi_watchdog_state = action_val;
ipmi_set_timeout(IPMI_SET_TIMEOUT_FORCE_HB); ipmi_set_timeout(IPMI_SET_TIMEOUT_FORCE_HB);
printk(KERN_INFO PFX "Starting now!\n"); printk(KERN_INFO PFX "Starting now!\n");
} else {
/* Stop the timer now. */
ipmi_watchdog_state = WDOG_TIMEOUT_NONE;
ipmi_set_timeout(IPMI_SET_TIMEOUT_NO_HB);
} }
} }
...@@ -968,17 +1025,41 @@ static void ipmi_unregister_watchdog(int ipmi_intf) ...@@ -968,17 +1025,41 @@ static void ipmi_unregister_watchdog(int ipmi_intf)
return; return;
} }
#ifdef HAVE_NMI_HANDLER #ifdef HAVE_DIE_NMI
static int static int
ipmi_nmi(void *dev_id, int cpu, int handled) ipmi_nmi(struct notifier_block *self, unsigned long val, void *data)
{ {
struct die_args *args = data;
if (val != DIE_NMI)
return NOTIFY_OK;
/* Hack, if it's a memory or I/O error, ignore it. */
if (args->err & 0xc0)
return NOTIFY_OK;
/*
* If we get here, it's an NMI that's not a memory or I/O
* error. We can't truly tell if it's from IPMI or not
* without sending a message, and sending a message is almost
* impossible because of locking.
*/
if (testing_nmi) {
testing_nmi = 2;
return NOTIFY_STOP;
}
/* If we are not expecting a timeout, ignore it. */ /* If we are not expecting a timeout, ignore it. */
if (ipmi_watchdog_state == WDOG_TIMEOUT_NONE) if (ipmi_watchdog_state == WDOG_TIMEOUT_NONE)
return NOTIFY_DONE; return NOTIFY_OK;
if (preaction_val != WDOG_PRETIMEOUT_NMI)
return NOTIFY_OK;
/* If no one else handled the NMI, we assume it was the IPMI /* If no one else handled the NMI, we assume it was the IPMI
watchdog. */ watchdog. */
if ((!handled) && (preop_val == WDOG_PREOP_PANIC)) { if (preop_val == WDOG_PREOP_PANIC) {
/* On some machines, the heartbeat will give /* On some machines, the heartbeat will give
an error and not work unless we re-enable an error and not work unless we re-enable
the timer. So do so. */ the timer. So do so. */
...@@ -987,18 +1068,12 @@ ipmi_nmi(void *dev_id, int cpu, int handled) ...@@ -987,18 +1068,12 @@ ipmi_nmi(void *dev_id, int cpu, int handled)
panic(PFX "pre-timeout"); panic(PFX "pre-timeout");
} }
return NOTIFY_DONE; return NOTIFY_STOP;
} }
static struct nmi_handler ipmi_nmi_handler = static struct notifier_block ipmi_nmi_handler = {
{ .notifier_call = ipmi_nmi
.link = LIST_HEAD_INIT(ipmi_nmi_handler.link),
.dev_name = "ipmi_watchdog",
.dev_id = NULL,
.handler = ipmi_nmi,
.priority = 0, /* Call us last. */
}; };
int nmi_handler_registered;
#endif #endif
static int wdog_reboot_handler(struct notifier_block *this, static int wdog_reboot_handler(struct notifier_block *this,
...@@ -1115,7 +1190,7 @@ static int preaction_op(const char *inval, char *outval) ...@@ -1115,7 +1190,7 @@ static int preaction_op(const char *inval, char *outval)
preaction_val = WDOG_PRETIMEOUT_NONE; preaction_val = WDOG_PRETIMEOUT_NONE;
else if (strcmp(inval, "pre_smi") == 0) else if (strcmp(inval, "pre_smi") == 0)
preaction_val = WDOG_PRETIMEOUT_SMI; preaction_val = WDOG_PRETIMEOUT_SMI;
#ifdef HAVE_NMI_HANDLER #ifdef HAVE_DIE_NMI
else if (strcmp(inval, "pre_nmi") == 0) else if (strcmp(inval, "pre_nmi") == 0)
preaction_val = WDOG_PRETIMEOUT_NMI; preaction_val = WDOG_PRETIMEOUT_NMI;
#endif #endif
...@@ -1149,7 +1224,7 @@ static int preop_op(const char *inval, char *outval) ...@@ -1149,7 +1224,7 @@ static int preop_op(const char *inval, char *outval)
static void check_parms(void) static void check_parms(void)
{ {
#ifdef HAVE_NMI_HANDLER #ifdef HAVE_DIE_NMI
int do_nmi = 0; int do_nmi = 0;
int rv; int rv;
...@@ -1162,20 +1237,9 @@ static void check_parms(void) ...@@ -1162,20 +1237,9 @@ static void check_parms(void)
preop_op("preop_none", NULL); preop_op("preop_none", NULL);
do_nmi = 0; do_nmi = 0;
} }
#ifdef CONFIG_X86_LOCAL_APIC
if (nmi_watchdog == NMI_IO_APIC) {
printk(KERN_WARNING PFX "nmi_watchdog is set to IO APIC"
" mode (value is %d), that is incompatible"
" with using NMI in the IPMI watchdog."
" Disabling IPMI nmi pretimeout.\n",
nmi_watchdog);
preaction_val = WDOG_PRETIMEOUT_NONE;
do_nmi = 0;
}
#endif
} }
if (do_nmi && !nmi_handler_registered) { if (do_nmi && !nmi_handler_registered) {
rv = request_nmi(&ipmi_nmi_handler); rv = register_die_notifier(&ipmi_nmi_handler);
if (rv) { if (rv) {
printk(KERN_WARNING PFX printk(KERN_WARNING PFX
"Can't register nmi handler\n"); "Can't register nmi handler\n");
...@@ -1183,7 +1247,7 @@ static void check_parms(void) ...@@ -1183,7 +1247,7 @@ static void check_parms(void)
} else } else
nmi_handler_registered = 1; nmi_handler_registered = 1;
} else if (!do_nmi && nmi_handler_registered) { } else if (!do_nmi && nmi_handler_registered) {
release_nmi(&ipmi_nmi_handler); unregister_die_notifier(&ipmi_nmi_handler);
nmi_handler_registered = 0; nmi_handler_registered = 0;
} }
#endif #endif
...@@ -1219,9 +1283,9 @@ static int __init ipmi_wdog_init(void) ...@@ -1219,9 +1283,9 @@ static int __init ipmi_wdog_init(void)
rv = ipmi_smi_watcher_register(&smi_watcher); rv = ipmi_smi_watcher_register(&smi_watcher);
if (rv) { if (rv) {
#ifdef HAVE_NMI_HANDLER #ifdef HAVE_DIE_NMI
if (preaction_val == WDOG_PRETIMEOUT_NMI) if (nmi_handler_registered)
release_nmi(&ipmi_nmi_handler); unregister_die_notifier(&ipmi_nmi_handler);
#endif #endif
atomic_notifier_chain_unregister(&panic_notifier_list, atomic_notifier_chain_unregister(&panic_notifier_list,
&wdog_panic_notifier); &wdog_panic_notifier);
...@@ -1240,9 +1304,9 @@ static void __exit ipmi_wdog_exit(void) ...@@ -1240,9 +1304,9 @@ static void __exit ipmi_wdog_exit(void)
ipmi_smi_watcher_unregister(&smi_watcher); ipmi_smi_watcher_unregister(&smi_watcher);
ipmi_unregister_watchdog(watchdog_ifnum); ipmi_unregister_watchdog(watchdog_ifnum);
#ifdef HAVE_NMI_HANDLER #ifdef HAVE_DIE_NMI
if (nmi_handler_registered) if (nmi_handler_registered)
release_nmi(&ipmi_nmi_handler); unregister_die_notifier(&ipmi_nmi_handler);
#endif #endif
atomic_notifier_chain_unregister(&panic_notifier_list, atomic_notifier_chain_unregister(&panic_notifier_list,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment