Commit caab36b5 authored by Ingo Molnar

Merge branch 'x86/mce2' into x86/core

parents a1413c89 73af76df
@@ -783,6 +783,11 @@ config X86_MCE_AMD
Additional support for AMD specific MCE features such as
the DRAM Error Threshold.
config X86_MCE_THRESHOLD
depends on X86_MCE_AMD || X86_MCE_INTEL
bool
default y
config X86_MCE_NONFATAL
tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
depends on X86_32 && X86_MCE
...
@@ -53,6 +53,7 @@
#define APIC_ESR_SENDILL 0x00020
#define APIC_ESR_RECVILL 0x00040
#define APIC_ESR_ILLREGA 0x00080
#define APIC_LVTCMCI 0x2f0
#define APIC_ICR 0x300
#define APIC_DEST_SELF 0x40000
#define APIC_DEST_ALLINC 0x80000
...
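APIC_LVTCMCI names the local APIC LVT entry used to deliver corrected machine check interrupts. Purely as an illustration (the helper name cmci_lvt_setup is assumed, not part of the commit), programming and masking it follows the same pattern the patch itself uses in intel_init_cmci() and clear_local_APIC():

/* Illustration only; the commit does this inline in intel_init_cmci()
 * and clear_local_APIC(). */
static void cmci_lvt_setup(int enable)
{
	u32 v;

	if (enable) {
		/* Deliver CMCI through the threshold vector as a fixed interrupt. */
		apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR | APIC_DM_FIXED);
	} else {
		/* Mask the LVT entry, as clear_local_APIC() does on shutdown. */
		v = apic_read(APIC_LVTCMCI);
		apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
	}
}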
@@ -11,6 +11,8 @@
*/
#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
#include <asm/atomic.h>
void mce_setup(struct mce *m);
void mce_log(struct mce *m);
DECLARE_PER_CPU(struct sys_device, device_mce);
extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
/*
* To support more than 128 would need to escape the predefined
* Linux defined extended banks first.
*/
#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
#ifdef CONFIG_X86_MCE_INTEL
void mce_intel_feature_init(struct cpuinfo_x86 *c);
void cmci_clear(void);
void cmci_reenable(void);
void cmci_rediscover(int dying);
void cmci_recheck(void);
#else
static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
static inline void cmci_clear(void) {}
static inline void cmci_reenable(void) {}
static inline void cmci_rediscover(int dying) {}
static inline void cmci_recheck(void) {}
#endif
#ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
#endif
void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
extern int mce_available(struct cpuinfo_x86 *c);
extern atomic_t mce_entry;
extern void do_machine_check(struct pt_regs *, long);
typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
enum mcp_flags {
MCP_TIMESTAMP = (1 << 0), /* log time stamp */
MCP_UC = (1 << 1), /* log uncorrected errors */
};
extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
extern int mce_notify_user(void);
#endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
#else
#define mcheck_init(c) do { } while (0)
#endif
extern void stop_mce(void);
extern void restart_mce(void);
extern void (*mce_threshold_vector)(void);
#endif /* __KERNEL__ */
#endif /* _ASM_X86_MCE_H */
@@ -77,6 +77,11 @@
#define MSR_IA32_MC0_ADDR 0x00000402
#define MSR_IA32_MC0_MISC 0x00000403
/* These are consecutive and not in the normal four-register MCE bank block */
#define MSR_IA32_MC0_CTL2 0x00000280
#define CMCI_EN (1ULL << 30)
#define CMCI_THRESHOLD_MASK 0xffffULL
#define MSR_P6_PERFCTR0 0x000000c1
#define MSR_P6_PERFCTR1 0x000000c2
#define MSR_P6_EVNTSEL0 0x00000186
...
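The three new definitions above are consumed by the CMCI code added later in this series. As a rough sketch (the helper name enable_cmci_on_bank is hypothetical; the real logic lives in cmci_discover() in mce_intel_64.c below), enabling CMCI on one bank amounts to setting CMCI_EN plus a threshold in that bank's CTL2 MSR and reading the register back to see whether the enable bit stuck:

/* Hypothetical helper, for illustration only. */
static int enable_cmci_on_bank(int bank, u64 threshold)
{
	u64 val;

	rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);
	val &= ~CMCI_THRESHOLD_MASK;
	val |= CMCI_EN | threshold;
	wrmsrl(MSR_IA32_MC0_CTL2 + bank, val);

	/* The enable bit only sticks on banks that actually support CMCI. */
	rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);
	return (val & CMCI_EN) != 0;
}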
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
that might execute the to be patched code.
Other CPUs are not running. */
stop_nmi();
#ifdef CONFIG_X86_MCE
stop_mce();
#endif
/*
* Don't stop machine check exceptions while patching.
* MCEs only happen when something got corrupted and in this
* case we must do something about the corruption.
* Ignoring it is worse than an unlikely patching race.
* Also machine checks tend to be broadcast and if one CPU
* goes into machine check the others follow quickly, so we don't
* expect a machine check to cause undue problems during code
* patching.
*/
apply_alternatives(__alt_instructions, __alt_instructions_end);
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
(unsigned long)__smp_locks_end);
restart_nmi();
#ifdef CONFIG_X86_MCE
restart_mce();
#endif
}
/**
...
@@ -46,6 +46,7 @@
#include <asm/idle.h>
#include <asm/mtrr.h>
#include <asm/smp.h>
#include <asm/mce.h>
unsigned int num_processors;
@@ -842,6 +843,14 @@ void clear_local_APIC(void)
apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
}
#endif
#ifdef CONFIG_X86_MCE_INTEL
if (maxlvt >= 6) {
v = apic_read(APIC_LVTCMCI);
if (!(v & APIC_LVT_MASKED))
apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
}
#endif
/*
* Clean APIC state for other OSs:
*/
@@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
apic_write(APIC_LVT1, value);
preempt_enable();
#ifdef CONFIG_X86_MCE_INTEL
/* Recheck CMCI information after local APIC is up on CPU #0 */
if (smp_processor_id() == 0)
cmci_recheck();
#endif
}
void __cpuinit end_local_APIC_setup(void)
...
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
}
}
static unsigned long old_cr4 __initdata;
void __init stop_mce(void)
{
old_cr4 = read_cr4();
clear_in_cr4(X86_CR4_MCE);
}
void __init restart_mce(void)
{
if (old_cr4 & X86_CR4_MCE)
set_in_cr4(X86_CR4_MCE);
}
static int __init mcheck_disable(char *str)
{
mce_disabled = 1;
...
@@ -3,6 +3,8 @@
* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
* Rest from unknown author(s).
* 2004 Andi Kleen. Rewrote most of it.
* Copyright 2008 Intel Corporation
* Author: Andi Kleen
*/
#include <linux/init.h>
@@ -24,6 +26,9 @@
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/ratelimit.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -32,7 +37,6 @@
#include <asm/idle.h>
#define MISC_MCELOG_MINOR 227
#define NR_SYSFS_BANKS 6
atomic_t mce_entry;
@@ -47,7 +51,7 @@ static int mce_dont_init;
*/
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = smp_processor_id();
rdtscll(m->tsc);
}
/*
* Lockless MCE logging infrastructure.
* This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
print_symbol("{%s}", m->ip);
printk("\n");
}
printk(KERN_EMERG "TSC %Lx ", m->tsc);
printk(KERN_EMERG "TSC %llx ", m->tsc);
if (m->addr)
printk("ADDR %Lx ", m->addr);
printk("ADDR %llx ", m->addr);
if (m->misc)
printk("MISC %Lx ", m->misc);
printk("MISC %llx ", m->misc);
printk("\n");
printk(KERN_EMERG "This is not a software problem!\n");
printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
panic(msg);
}
static int mce_available(struct cpuinfo_x86 *c)
int mce_available(struct cpuinfo_x86 *c)
{
if (mce_dont_init)
return 0;
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
}
/*
* The actual machine check handler
* Poll for corrected events or events that happened before reset.
* Those are just logged through /dev/mcelog.
*
* This is executed in standard interrupt context.
*/
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
struct mce m;
int i;
mce_setup(&m);
rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
for (i = 0; i < banks; i++) {
if (!bank[i] || !test_bit(i, *b))
continue;
m.misc = 0;
m.addr = 0;
m.bank = i;
m.tsc = 0;
barrier();
rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
if (!(m.status & MCI_STATUS_VAL))
continue;
/*
* Uncorrected events are handled by the exception handler
* when it is enabled. But when the exception is disabled log
* everything.
*
* TBD do the same check for MCI_STATUS_EN here?
*/
if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
continue;
if (m.status & MCI_STATUS_MISCV)
rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
if (m.status & MCI_STATUS_ADDRV)
rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
if (!(flags & MCP_TIMESTAMP))
m.tsc = 0;
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
mce_log(&m);
add_taint(TAINT_MACHINE_CHECK);
/*
* Clear state for this bank.
*/
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
}
/*
* Don't clear MCG_STATUS here because it's only defined for
* exceptions.
*/
}
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
*
* This is executed in NMI context not subject to normal locking rules. This
* implies that most kernel services cannot be safely used. Don't even
* think about putting a printk in there!
*/
void do_machine_check(struct pt_regs * regs, long error_code)
{
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
* error.
*/
int kill_it = 0;
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
atomic_inc(&mce_entry);
if ((regs
if (notify_die(DIE_NMI, "machine check", regs, error_code,
&& notify_die(DIE_NMI, "machine check", regs, error_code,
18, SIGKILL) == NOTIFY_STOP)
|| !banks)
goto out2;
if (!banks)
goto out2;
memset(&m, 0, sizeof(struct mce));
mce_setup(&m);
m.cpu = smp_processor_id();
rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
/* if the restart IP is not valid, we're done for */
if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
barrier();
for (i = 0; i < banks; i++) {
if (i < NR_SYSFS_BANKS && !bank[i])
__clear_bit(i, toclear);
if (!bank[i])
continue;
m.misc = 0;
m.addr = 0;
m.bank = i;
m.tsc = 0;
rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
if ((m.status & MCI_STATUS_VAL) == 0)
continue;
/*
* Non uncorrected errors are handled by machine_check_poll
* Leave them alone.
*/
if ((m.status & MCI_STATUS_UC) == 0)
continue;
/*
* Set taint even when machine check was not enabled.
*/
add_taint(TAINT_MACHINE_CHECK);
__set_bit(i, toclear);
if (m.status & MCI_STATUS_EN) {
/* if PCC was set, there's no way out */
no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
no_way_out = 1;
kill_it = 1;
}
} else {
/*
* Machine check event was not enabled. Clear, but
* ignore.
*/
continue;
}
if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
mce_get_rip(&m, regs);
if (error_code >= 0)
mce_log(&m);
rdtscll(m.tsc);
if (error_code != -2)
mce_log(&m);
/* Did this bank cause the exception? */
/* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
panicm = m;
panicm_found = 1;
}
add_taint(TAINT_MACHINE_CHECK);
}
/* Never do anything final in the polling timer */
if (!regs)
goto out;
/* If we didn't find an uncorrectable error, pick
the last one (shouldn't happen, just being safe). */
if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
/* notify userspace ASAP */
set_thread_flag(TIF_MCE_NOTIFY);
out:
/* the last thing we do is clear state */
for (i = 0; i < banks; i++)
for (i = 0; i < banks; i++) {
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
if (test_bit(i, toclear))
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
}
wrmsrl(MSR_IA32_MCG_STATUS, 0);
out2:
atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
* and historically has been the register value of the
* MSR_IA32_THERMAL_STATUS (Intel) msr.
*/
void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
void mce_log_therm_throt_event(__u64 status)
{
struct mce m;
memset(&m, 0, sizeof(m));
mce_setup(&m);
m.cpu = cpu;
m.bank = MCE_THERMAL_BANK;
m.status = status;
rdtscll(m.tsc);
mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(struct work_struct *work);
static void mcheck_timer(unsigned long);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
static DEFINE_PER_CPU(struct timer_list, mce_timer);
static void mcheck_check_cpu(void *info)
static void mcheck_timer(unsigned long data)
{
if (mce_available(&current_cpu_data))
struct timer_list *t = &per_cpu(mce_timer, data);
do_machine_check(NULL, 0);
}
static void mcheck_timer(struct work_struct *work)
WARN_ON(smp_processor_id() != data);
{
on_each_cpu(mcheck_check_cpu, NULL, 1);
if (mce_available(&current_cpu_data))
machine_check_poll(MCP_TIMESTAMP,
&__get_cpu_var(mce_poll_banks));
/*
* Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
(int)round_jiffies_relative(check_interval*HZ));
}
schedule_delayed_work(&mcheck_work, next_interval);
t->expires = jiffies + next_interval;
add_timer(t);
}
static void mce_do_trigger(struct work_struct *work)
{
call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
} }
static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
/*
* This is only called from process context. This is where we do
* anything we need to alert userspace about new MCEs. This is called
* directly from the poller and also from entry.S and idle, thanks to
* TIF_MCE_NOTIFY.
* Notify the user(s) about new machine check events.
* Can be called from interrupt context, but not from machine check/NMI
* context.
*/
int mce_notify_user(void)
{
/* Not more than two messages every minute */
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
clear_thread_flag(TIF_MCE_NOTIFY);
if (test_and_clear_bit(0, &notify_user)) {
static unsigned long last_print;
unsigned long now = jiffies;
wake_up_interruptible(&mce_wait);
if (trigger[0])
call_usermodehelper(trigger, trigger_argv, NULL,
UMH_NO_WAIT);
if (time_after_eq(now, last_print + (check_interval*HZ))) {
last_print = now;
/*
* There is no risk of missing notifications because
* work_pending is always cleared before the function is
* executed.
*/
if (trigger[0] && !work_pending(&mce_trigger_work))
schedule_work(&mce_trigger_work);
if (__ratelimit(&ratelimit))
printk(KERN_INFO "Machine check events logged\n"); printk(KERN_INFO "Machine check events logged\n");
}
return 1; return 1;
} }
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
static __init int periodic_mcheck_init(void)
{
next_interval = check_interval * HZ;
idle_notifier_register(&mce_idle_notifier);
if (next_interval)
return 0;
schedule_delayed_work(&mcheck_work,
round_jiffies_relative(next_interval));
idle_notifier_register(&mce_idle_notifier);
return 0;
}
__initcall(periodic_mcheck_init);
/*
* Initialize Machine Checks for a CPU.
*/
static void mce_init(void *dummy)
static int mce_cap_init(void)
{
u64 cap;
int i;
unsigned b;
rdmsrl(MSR_IA32_MCG_CAP, cap);
banks = cap & 0xff;
if (banks > MCE_EXTENDED_BANK) {
banks = MCE_EXTENDED_BANK;
printk(KERN_INFO "MCE: warning: using only %d banks\n",
MCE_EXTENDED_BANK);
b = cap & 0xff;
if (b > MAX_NR_BANKS) {
printk(KERN_WARNING
"MCE: Using only %u machine check banks out of %u\n",
MAX_NR_BANKS, b);
b = MAX_NR_BANKS;
}
/* Don't support asymmetric configurations today */
WARN_ON(banks != 0 && b != banks);
banks = b;
if (!bank) {
bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
if (!bank)
return -ENOMEM;
memset(bank, 0xff, banks * sizeof(u64));
}
/* Use accurate RIP reporting if available. */
if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
rip_msr = MSR_IA32_MCG_EIP;
/* Log the machine checks left over from the previous reset.
This also clears all registers */
return 0;
}
do_machine_check(NULL, mce_bootlog ? -1 : -2);
static void mce_init(void *dummy)
{
u64 cap;
int i;
mce_banks_t all_banks;
/*
* Log the machine checks left over from the previous reset.
*/
bitmap_fill(all_banks, MAX_NR_BANKS);
machine_check_poll(MCP_UC, &all_banks);
set_in_cr4(X86_CR4_MCE);
rdmsrl(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
for (i = 0; i < banks; i++) {
if (i < NR_SYSFS_BANKS)
wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
else
wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
}
}
/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD) {
if(c->x86 == 15)
if (c->x86 == 15 && banks > 4)
/* disable GART TBL walk error reporting, which trips off
incorrectly with the IOMMU & 3ware & Cerberus. */
clear_bit(10, &bank[4]);
clear_bit(10, (unsigned long *)&bank[4]);
if(c->x86 <= 17 && mce_bootlog < 0)
/* Lots of broken BIOS around that don't clear them
by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
}
}
static void mce_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
/* data race harmless because everyone sets to the same value */
if (!next_interval)
next_interval = check_interval * HZ;
if (!next_interval)
return;
setup_timer(t, mcheck_timer, smp_processor_id());
t->expires = round_jiffies_relative(jiffies + next_interval);
add_timer(t);
}
/*
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off.
*/
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
mce_cpu_quirks(c);
if (!mce_available(c))
return;
if (mce_dont_init ||
if (mce_cap_init() < 0) {
!mce_available(c))
mce_dont_init = 1;
return;
}
mce_cpu_quirks(c);
mce_init(NULL);
mce_cpu_features(c);
mce_init_timer();
}
/*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
{
unsigned long *cpu_tsc;
static DEFINE_MUTEX(mce_read_mutex);
unsigned next;
unsigned prev, next;
char __user *buf = ubuf;
int i, err;
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
}
err = 0;
for (i = 0; i < next; i++) {
prev = 0;
unsigned long start = jiffies;
do {
for (i = prev; i < next; i++) {
while (!mcelog.entry[i].finished) {
unsigned long start = jiffies;
if (time_after_eq(jiffies, start + 2)) {
memset(mcelog.entry + i,0, sizeof(struct mce));
while (!mcelog.entry[i].finished) {
goto timeout;
if (time_after_eq(jiffies, start + 2)) {
memset(mcelog.entry + i, 0,
sizeof(struct mce));
goto timeout;
}
cpu_relax();
}
cpu_relax();
smp_rmb();
err |= copy_to_user(buf, mcelog.entry + i,
sizeof(struct mce));
buf += sizeof(struct mce);
timeout:
;
} }
smp_rmb();
err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
buf += sizeof(struct mce);
timeout:
;
}
memset(mcelog.entry, 0, next * sizeof(struct mce));
mcelog.next = 0;
memset(mcelog.entry + prev, 0,
(next - prev) * sizeof(struct mce));
prev = next;
next = cmpxchg(&mcelog.next, prev, 0);
} while (next != prev);
synchronize_sched();
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
&mce_chrdev_ops,
};
static unsigned long old_cr4 __initdata;
void __init stop_mce(void)
{
old_cr4 = read_cr4();
clear_in_cr4(X86_CR4_MCE);
}
void __init restart_mce(void)
{
if (old_cr4 & X86_CR4_MCE)
set_in_cr4(X86_CR4_MCE);
}
/*
* Old style boot options parsing. Only for compatibility.
*/
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
return 1;
}
/* mce=off disables machine check. Note you can re-enable it later
using sysfs.
/* mce=off disables machine check.
mce=TOLERANCELEVEL (number, see above)
mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
* Sysfs support
*/
/*
* Disable machine checks on suspend and shutdown. We can't really handle
* them later.
*/
static int mce_disable(void)
{
int i;
for (i = 0; i < banks; i++)
wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
return 0;
}
static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
return mce_disable();
}
static int mce_shutdown(struct sys_device *dev)
{
return mce_disable();
}
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
Only one CPU is active at this time, the others get readded later using
CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
return 0;
}
static void mce_cpu_restart(void *data)
{
del_timer_sync(&__get_cpu_var(mce_timer));
if (mce_available(&current_cpu_data))
mce_init(NULL);
mce_init_timer();
}
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
if (next_interval)
cancel_delayed_work(&mcheck_work);
/* Timer race is harmless here */
on_each_cpu(mce_init, NULL, 1);
next_interval = check_interval * HZ;
if (next_interval)
on_each_cpu(mce_cpu_restart, NULL, 1);
schedule_delayed_work(&mcheck_work,
round_jiffies_relative(next_interval));
}
static struct sysdev_class mce_sysclass = {
.suspend = mce_suspend,
.shutdown = mce_shutdown,
.resume = mce_resume,
.name = "machinecheck",
};
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
/*
* TBD should generate these dynamically based on number of available banks.
* Have only 6 contol banks in /sysfs until then.
*/
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())
static struct sysdev_attribute *bank_attrs;
static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
char *buf)
{
u64 b = bank[attr - bank_attrs];
return sprintf(buf, "%llx\n", b);
}
static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
const char *buf, size_t siz)
{
char *end;
u64 new = simple_strtoull(buf, &end, 0);
if (end == buf)
return -EINVAL;
bank[attr - bank_attrs] = new;
mce_restart();
return end-buf;
}
static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval,check_interval,mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
NULL
};
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
if (err)
goto error;
}
for (i = 0; i < banks; i++) {
err = sysdev_create_file(&per_cpu(device_mce, cpu),
&bank_attrs[i]);
if (err)
goto error2;
}
cpu_set(cpu, mce_device_initialized); cpu_set(cpu, mce_device_initialized);
return 0; return 0;
error2:
while (--i >= 0) {
sysdev_remove_file(&per_cpu(device_mce, cpu),
&bank_attrs[i]);
}
error:
while (i--) {
while (--i >= 0) {
sysdev_remove_file(&per_cpu(device_mce,cpu),
mce_attributes[i]);
}
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
for (i = 0; mce_attributes[i]; i++)
sysdev_remove_file(&per_cpu(device_mce,cpu),
mce_attributes[i]);
for (i = 0; i < banks; i++)
sysdev_remove_file(&per_cpu(device_mce, cpu),
&bank_attrs[i]);
sysdev_unregister(&per_cpu(device_mce,cpu));
cpu_clear(cpu, mce_device_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
int i;
unsigned long action = *(unsigned long *)h;
if (!mce_available(&current_cpu_data))
return;
if (!(action & CPU_TASKS_FROZEN))
cmci_clear();
for (i = 0; i < banks; i++)
wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}
static void mce_reenable_cpu(void *h)
{
int i;
unsigned long action = *(unsigned long *)h;
if (!mce_available(&current_cpu_data))
return;
if (!(action & CPU_TASKS_FROZEN))
cmci_reenable();
for (i = 0; i < banks; i++)
wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct timer_list *t = &per_cpu(mce_timer, cpu);
switch (action) {
case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
threshold_cpu_callback(action, cpu);
mce_remove_device(cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
del_timer_sync(t);
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
t->expires = round_jiffies_relative(jiffies + next_interval);
add_timer_on(t, cpu);
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
break;
case CPU_POST_DEAD:
/* intentionally ignoring frozen here */
cmci_rediscover(cpu);
break;
}
return NOTIFY_OK;
}
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
.notifier_call = mce_cpu_callback,
};
static __init int mce_init_banks(void)
{
int i;
bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
GFP_KERNEL);
if (!bank_attrs)
return -ENOMEM;
for (i = 0; i < banks; i++) {
struct sysdev_attribute *a = &bank_attrs[i];
a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
if (!a->attr.name)
goto nomem;
a->attr.mode = 0644;
a->show = show_bank;
a->store = set_bank;
}
return 0;
nomem:
while (--i >= 0)
kfree(bank_attrs[i].attr.name);
kfree(bank_attrs);
bank_attrs = NULL;
return -ENOMEM;
}
static __init int mce_init_device(void)
{
int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
if (!mce_available(&boot_cpu_data))
return -EIO;
err = mce_init_banks();
if (err)
return err;
err = sysdev_class_register(&mce_sysclass);
if (err)
return err;
...
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
static void amd_threshold_interrupt(void);
/*
* CPU Initialization
*/
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
tr.reset = 0;
tr.old_limit = 0;
threshold_restart_bank(&tr);
mce_threshold_vector = amd_threshold_interrupt;
}
}
}
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
* the interrupt goes off when error_count reaches threshold_limit.
* the handler will simply log mcelog w/ software defined bank number.
*/
asmlinkage void mce_threshold_interrupt(void)
static void amd_threshold_interrupt(void)
{
unsigned int bank, block;
struct mce m;
u32 low = 0, high = 0, address = 0;
ack_APIC_irq();
mce_setup(&m);
exit_idle();
irq_enter();
memset(&m, 0, sizeof(m));
rdtscll(m.tsc);
m.cpu = smp_processor_id();
/* assume first bank caused it */
for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
/* Log the machine check that caused the threshold
event. */
do_machine_check(NULL, 0);
machine_check_poll(MCP_TIMESTAMP,
&__get_cpu_var(mce_poll_banks));
if (high & MASK_OVERFLOW_HI) {
rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
+ bank * NR_BLOCKS
+ block;
mce_log(&m);
goto out;
return;
}
}
}
out:
inc_irq_stat(irq_threshold_count);
irq_exit();
}
/*
...
/*
* Intel specific MCE features.
* Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
* Copyright (C) 2008, 2009 Intel Corporation
* Author: Andi Kleen
*/
#include <linux/init.h>
@@ -13,6 +15,7 @@
#include <asm/hw_irq.h>
#include <asm/idle.h>
#include <asm/therm_throt.h>
#include <asm/apic.h>
asmlinkage void smp_thermal_interrupt(void)
{
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
if (therm_throt_process(msr_val & 1))
mce_log_therm_throt_event(smp_processor_id(), msr_val);
mce_log_therm_throt_event(msr_val);
inc_irq_stat(irq_thermal_count);
irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
/*
* Support for Intel Correct Machine Check Interrupts. This allows
* the CPU to raise an interrupt when a corrected machine check happened.
* Normally we pick those up using a regular polling timer.
* Also supports reliable discovery of shared banks.
*/
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
static DEFINE_SPINLOCK(cmci_discover_lock);
#define CMCI_THRESHOLD 1
static int cmci_supported(int *banks)
{
u64 cap;
/*
* Vendor check is not strictly needed, but the initial
* initialization is vendor keyed and this
* makes sure none of the backdoors are entered otherwise.
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
if (!cpu_has_apic || lapic_get_maxlvt() < 6)
return 0;
rdmsrl(MSR_IA32_MCG_CAP, cap);
*banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
return !!(cap & MCG_CMCI_P);
}
/*
* The interrupt handler. This is called on every event.
* Just call the poller directly to log any events.
* This could in theory increase the threshold under high load,
* but doesn't for now.
*/
static void intel_threshold_interrupt(void)
{
machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
mce_notify_user();
}
static void print_update(char *type, int *hdr, int num)
{
if (*hdr == 0)
printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
*hdr = 1;
printk(KERN_CONT " %s:%d", type, num);
}
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
* banks.
*/
static void cmci_discover(int banks, int boot)
{
unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
int hdr = 0;
int i;
spin_lock(&cmci_discover_lock);
for (i = 0; i < banks; i++) {
u64 val;
if (test_bit(i, owned))
continue;
rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
/* Already owned by someone else? */
if (val & CMCI_EN) {
if (test_and_clear_bit(i, owned) || boot)
print_update("SHD", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
continue;
}
val |= CMCI_EN | CMCI_THRESHOLD;
wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
/* Did the enable bit stick? -- the bank supports CMCI */
if (val & CMCI_EN) {
if (!test_and_set_bit(i, owned) || boot)
print_update("CMCI", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
} else {
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}
}
spin_unlock(&cmci_discover_lock);
if (hdr)
printk(KERN_CONT "\n");
}
/*
* Just in case we missed an event during initialization check
* all the CMCI owned banks.
*/
void cmci_recheck(void)
{
unsigned long flags;
int banks;
if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
return;
local_irq_save(flags);
machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
local_irq_restore(flags);
}
/*
* Disable CMCI on this CPU for all banks it owns when it goes down.
* This allows other CPUs to claim the banks on rediscovery.
*/
void cmci_clear(void)
{
int i;
int banks;
u64 val;
if (!cmci_supported(&banks))
return;
spin_lock(&cmci_discover_lock);
for (i = 0; i < banks; i++) {
if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
continue;
/* Disable CMCI */
rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
__clear_bit(i, __get_cpu_var(mce_banks_owned));
}
spin_unlock(&cmci_discover_lock);
}
/*
* After a CPU went down cycle through all the others and rediscover
* Must run in process context.
*/
void cmci_rediscover(int dying)
{
int banks;
int cpu;
cpumask_var_t old;
if (!cmci_supported(&banks))
return;
if (!alloc_cpumask_var(&old, GFP_KERNEL))
return;
cpumask_copy(old, &current->cpus_allowed);
for_each_online_cpu (cpu) {
if (cpu == dying)
continue;
if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
continue;
/* Recheck banks in case CPUs don't all have the same */
if (cmci_supported(&banks))
cmci_discover(banks, 0);
}
set_cpus_allowed_ptr(current, old);
free_cpumask_var(old);
}
/*
* Reenable CMCI on this CPU in case a CPU down failed.
*/
void cmci_reenable(void)
{
int banks;
if (cmci_supported(&banks))
cmci_discover(banks, 0);
}
static __cpuinit void intel_init_cmci(void)
{
int banks;
if (!cmci_supported(&banks))
return;
mce_threshold_vector = intel_threshold_interrupt;
cmci_discover(banks, 1);
/*
* For CPU #0 this runs with still disabled APIC, but that's
* ok because only the vector is set up. We still do another
* check for the banks later for CPU #0 just to make sure
* to not miss any events.
*/
apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
cmci_recheck();
}
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
}
/*
* Common corrected MCE threshold handler code:
*/
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <asm/irq_vectors.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/mce.h>
static void default_threshold_interrupt(void)
{
printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
THRESHOLD_APIC_VECTOR);
}
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
asmlinkage void mce_threshold_interrupt(void)
{
exit_idle();
irq_enter();
inc_irq_stat(irq_threshold_count);
mce_threshold_vector();
irq_exit();
/* Ack only at the end to avoid potential reentry */
ack_APIC_irq();
}
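The mce_threshold_vector indirection above is how both the AMD and Intel threshold paths hook into the shared interrupt entry; the real assignments are in mce_amd_feature_init() and intel_init_cmci() earlier in this commit. A minimal sketch of a hypothetical user (handler and init names are assumptions, not from the commit):

static void my_threshold_handler(void)
{
	/* Poll the banks this CPU polls and notify userspace, as the
	 * Intel CMCI handler in this commit does. */
	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks));
	mce_notify_user();
}

static void my_feature_init(void)
{
	/* Replace the default "unexpected interrupt" handler. */
	mce_threshold_vector = my_threshold_handler;
}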