Commit 8446f1d3 authored by Ingo Molnar, committed by Linus Torvalds

[PATCH] detect soft lockups

This patch adds a new kernel debug feature: CONFIG_DETECT_SOFTLOCKUP.

When enabled, per-CPU watchdog threads are started that try to run
once per second.  If they get delayed for more than 10 seconds, a
callback from the timer interrupt detects this condition and prints a
warning message and a stack dump (once per lockup incident).  The feature
is otherwise non-intrusive: it doesn't try to unlock the box in any way, it
only gets the debug info out, automatically, on all CPUs affected by
the lockup.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-off-by: Matthias Urlichs <smurf@smurf.noris.de>
Signed-off-by: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 4732efbe
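Most of the hunks below apply one pattern: any code path that legitimately monopolizes a CPU or stops the clock (the NMI-watchdog touch functions, timer resume, NAND busy-waiting, swsusp resume) calls the new touch_softlockup_watchdog() hook so the detector does not report a false positive. A minimal sketch of that pattern, assuming invented names — my_wait_ready(), hardware_done() and MY_TIMEOUT are illustrative; only touch_softlockup_watchdog() comes from this patch:

/* Sketch of the busy-wait pattern used by this patch; all names
 * except touch_softlockup_watchdog() are hypothetical. */
static void my_wait_ready(void)
{
	unsigned long timeo = jiffies + MY_TIMEOUT;

	do {
		if (hardware_done())
			return;
		/* This busy-wait is intentional, not a lockup, so keep
		 * resetting this CPU's softlockup timestamp: */
		touch_softlockup_watchdog();
	} while (time_before(jiffies, timeo));
}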
@@ -478,6 +478,11 @@ void touch_nmi_watchdog (void)
 	 */
 	for (i = 0; i < NR_CPUS; i++)
 		alert_counter[i] = 0;
+
+	/*
+	 * Tickle the softlockup detector too:
+	 */
+	touch_softlockup_watchdog();
 }
 
 extern void die_nmi(struct pt_regs *, const char *msg);
......
@@ -422,6 +422,7 @@ static int timer_resume(struct sys_device *dev)
 		last_timer->resume();
 	cur_timer = last_timer;
 	last_timer = NULL;
+	touch_softlockup_watchdog();
 	return 0;
 }
......
@@ -463,6 +463,8 @@ void touch_nmi_watchdog (void)
 	 */
 	for (i = 0; i < NR_CPUS; i++)
 		per_cpu(nmi_touch, i) = 1;
+
+	touch_softlockup_watchdog();
 }
 
 void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
......
@@ -1041,6 +1041,7 @@ static int timer_resume(struct sys_device *dev)
 	write_sequnlock_irqrestore(&xtime_lock,flags);
 	jiffies += sleep_length;
 	wall_jiffies += sleep_length;
+	touch_softlockup_watchdog();
 	return 0;
 }
......
@@ -526,6 +526,7 @@ static void nand_wait_ready(struct mtd_info *mtd)
 	do {
 		if (this->dev_ready(mtd))
 			return;
+		touch_softlockup_watchdog();
 	} while (time_before(jiffies, timeo));
 }
......
@@ -176,6 +176,23 @@ extern void trap_init(void);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
 
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+extern void softlockup_tick(struct pt_regs *regs);
+extern void spawn_softlockup_task(void);
+extern void touch_softlockup_watchdog(void);
+#else
+static inline void softlockup_tick(struct pt_regs *regs)
+{
+}
+static inline void spawn_softlockup_task(void)
+{
+}
+static inline void touch_softlockup_watchdog(void)
+{
+}
+#endif
+
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched __attribute__((__section__(".sched.text")))
 /* Is this address in the __sched functions? */
......
@@ -614,6 +614,7 @@ static void do_pre_smp_initcalls(void)
 	migration_init();
 #endif
 	spawn_ksoftirqd();
+	spawn_softlockup_task();
 }
 
 static void run_init_process(char *init_filename)
......
@@ -27,6 +27,7 @@ obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
......
@@ -1059,6 +1059,7 @@ int swsusp_resume(void)
 	BUG_ON(!error);
 	restore_processor_state();
 	restore_highmem();
+	touch_softlockup_watchdog();
 	device_power_up();
 	local_irq_enable();
 	return error;
......
/*
 * Detect Soft Lockups
 *
 * started by Ingo Molnar, (C) 2005, Red Hat
 *
 * this code detects soft lockups: incidents where, on a CPU,
 * the kernel does not reschedule for 10 seconds or more.
 */
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/notifier.h>
#include <linux/module.h>

static DEFINE_SPINLOCK(print_lock);

static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
static DEFINE_PER_CPU(struct task_struct *, watchdog_task);

static int did_panic = 0;

static int softlock_panic(struct notifier_block *this, unsigned long event,
				void *ptr)
{
	did_panic = 1;

	return NOTIFY_DONE;
}

static struct notifier_block panic_block = {
	.notifier_call = softlock_panic,
};

void touch_softlockup_watchdog(void)
{
	per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
}
EXPORT_SYMBOL(touch_softlockup_watchdog);

/*
 * This callback runs from the timer interrupt, and checks
 * whether the watchdog thread has hung or not:
 */
void softlockup_tick(struct pt_regs *regs)
{
	int this_cpu = smp_processor_id();
	unsigned long timestamp = per_cpu(timestamp, this_cpu);

	if (per_cpu(print_timestamp, this_cpu) == timestamp)
		return;

	/* Do not cause a second panic when there already was one */
	if (did_panic)
		return;

	if (time_after(jiffies, timestamp + 10*HZ)) {
		per_cpu(print_timestamp, this_cpu) = timestamp;

		spin_lock(&print_lock);
		printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
			this_cpu);
		show_regs(regs);
		spin_unlock(&print_lock);
	}
}

/*
 * The watchdog thread - runs every second and touches the timestamp.
 */
static int watchdog(void *__bind_cpu)
{
	struct sched_param param = { .sched_priority = 99 };
	int this_cpu = (long)__bind_cpu;

	printk("softlockup thread %d started up.\n", this_cpu);

	sched_setscheduler(current, SCHED_FIFO, &param);
	current->flags |= PF_NOFREEZE;

	set_current_state(TASK_INTERRUPTIBLE);

	/*
	 * Run briefly once per second - if this gets delayed for
	 * more than 10 seconds then the debug-printout triggers
	 * in softlockup_tick():
	 */
	while (!kthread_should_stop()) {
		msleep_interruptible(1000);
		touch_softlockup_watchdog();
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}

/*
 * Create/destroy watchdog threads as CPUs come and go:
 */
static int __devinit
cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int hotcpu = (unsigned long)hcpu;
	struct task_struct *p;

	switch (action) {
	case CPU_UP_PREPARE:
		BUG_ON(per_cpu(watchdog_task, hotcpu));
		p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
		if (IS_ERR(p)) {
			printk("watchdog for %i failed\n", hotcpu);
			return NOTIFY_BAD;
		}
		per_cpu(watchdog_task, hotcpu) = p;
		kthread_bind(p, hotcpu);
		break;
	case CPU_ONLINE:
		wake_up_process(per_cpu(watchdog_task, hotcpu));
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_UP_CANCELED:
		/* Unbind so it can run.  Fall thru. */
		kthread_bind(per_cpu(watchdog_task, hotcpu),
			     smp_processor_id());
	case CPU_DEAD:
		p = per_cpu(watchdog_task, hotcpu);
		per_cpu(watchdog_task, hotcpu) = NULL;
		kthread_stop(p);
		break;
#endif /* CONFIG_HOTPLUG_CPU */
	}
	return NOTIFY_OK;
}

static struct notifier_block __devinitdata cpu_nfb = {
	.notifier_call = cpu_callback
};

__init void spawn_softlockup_task(void)
{
	void *cpu = (void *)(long)smp_processor_id();

	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
	register_cpu_notifier(&cpu_nfb);

	notifier_chain_register(&panic_notifier_list, &panic_block);
}
@@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
 {
 	jiffies_64++;
 	update_times();
+	softlockup_tick(regs);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
......
@@ -46,6 +46,25 @@ config LOG_BUF_SHIFT
 		     13 =>  8 KB
 		     12 =>  4 KB
 
+config DETECT_SOFTLOCKUP
+	bool "Detect Soft Lockups"
+	depends on DEBUG_KERNEL
+	default y
+	help
+	  Say Y here to enable the kernel to detect "soft lockups",
+	  which are bugs that cause the kernel to loop in kernel
+	  mode for more than 10 seconds, without giving other tasks a
+	  chance to run.
+
+	  When a soft lockup is detected, the kernel will print the
+	  current stack trace (which you should report), but the
+	  system will stay locked up. This feature has negligible
+	  overhead.
+
+	  (Note that "hard lockups" are a separate type of bug that
+	  can be detected via the NMI watchdog, on platforms that
+	  support it.)
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS
......