提交 75152114 编写于 作者: A Andi Kleen 提交者: Linus Torvalds

[PATCH] x86_64: Collected NMI watchdog fixes.

Collected NMI watchdog fixes.

- Fix call of check_nmi_watchdog

- Revert the earlier change that moved check_nmi_watchdog to later in boot.
  It did not fully fix the race it was supposed to fix.

- Remove unused P6 definitions

- Add support for performance counter based watchdog on P4 systems.

  This allows it to run only once per second, which saves some CPU time.
  Previously it would run at 1000Hz, which was too much.

  Code ported from i386

  Make this the default on Intel systems.

- Use check_nmi_watchdog with local APIC based nmi

- Fix race in touch_nmi_watchdog

- Fix bug that caused incorrect performance counters to be programmed in a
  few cases on K8.

- Remove useless check for local APIC

- Use local_t and per_cpu variables for per CPU data.

- Keep other CPUs busy during check_nmi_watchdog to make sure they really
  tick when in lapic mode.

- Only check CPUs that are actually online.

- Various other fixes.

- Fix fallback path when MSRs are unimplemented
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
上级 f3c5f5e7
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include <asm/mpspec.h> #include <asm/mpspec.h>
#include <asm/pgalloc.h> #include <asm/pgalloc.h>
#include <asm/mach_apic.h> #include <asm/mach_apic.h>
#include <asm/nmi.h>
int apic_verbosity; int apic_verbosity;
...@@ -1056,7 +1057,7 @@ int __init APIC_init_uniprocessor (void) ...@@ -1056,7 +1057,7 @@ int __init APIC_init_uniprocessor (void)
nr_ioapics = 0; nr_ioapics = 0;
#endif #endif
setup_boot_APIC_clock(); setup_boot_APIC_clock();
check_nmi_watchdog();
return 0; return 0;
} }
......
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include <asm/msr.h> #include <asm/msr.h>
#include <asm/proto.h> #include <asm/proto.h>
#include <asm/kdebug.h> #include <asm/kdebug.h>
#include <asm/local.h>
/* /*
* lapic_nmi_owner tracks the ownership of the lapic NMI hardware: * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
...@@ -59,7 +60,8 @@ int panic_on_timeout; ...@@ -59,7 +60,8 @@ int panic_on_timeout;
unsigned int nmi_watchdog = NMI_DEFAULT; unsigned int nmi_watchdog = NMI_DEFAULT;
static unsigned int nmi_hz = HZ; static unsigned int nmi_hz = HZ;
unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
static unsigned int nmi_p4_cccr_val;
/* Note that these events don't tick when the CPU idles. This means /* Note that these events don't tick when the CPU idles. This means
the frequency varies with CPU load. */ the frequency varies with CPU load. */
...@@ -71,67 +73,87 @@ unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ ...@@ -71,67 +73,87 @@ unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
#define P6_EVNTSEL0_ENABLE (1 << 22) #define MSR_P4_MISC_ENABLE 0x1A0
#define P6_EVNTSEL_INT (1 << 20) #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
#define P6_EVNTSEL_OS (1 << 17) #define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
#define P6_EVNTSEL_USR (1 << 16) #define MSR_P4_PERFCTR0 0x300
#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 #define MSR_P4_CCCR0 0x360
#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED #define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
#define P4_ESCR_OS (1<<3)
#define P4_ESCR_USR (1<<2)
#define P4_CCCR_OVF_PMI0 (1<<26)
#define P4_CCCR_OVF_PMI1 (1<<27)
#define P4_CCCR_THRESHOLD(N) ((N)<<20)
#define P4_CCCR_COMPLEMENT (1<<19)
#define P4_CCCR_COMPARE (1<<18)
#define P4_CCCR_REQUIRED (3<<16)
#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
#define P4_CCCR_ENABLE (1<<12)
/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
CRU_ESCR0 (with any non-null event selector) through a complemented
max threshold. [IA32-Vol3, Section 14.9.9] */
#define MSR_P4_IQ_COUNTER0 0x30C
#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
#define P4_NMI_IQ_CCCR0 \
(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
/*
 * Tell whether the boot CPU is one this file has a perfctr watchdog
 * setup for.  Returns nonzero only when the vendor is AMD or Intel and
 * boot_cpu_data.x86 equals 15; returns 0 for every other CPU.
 */
static __init inline int nmi_known_cpu(void)
{
	/* Any vendor other than AMD/Intel is unknown to the watchdog. */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
	    boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return 0;

	/* Both supported vendors use the same test on boot_cpu_data.x86. */
	return boot_cpu_data.x86 == 15;
}
/* Run after command line and cpu_init init, but before all other checks */ /* Run after command line and cpu_init init, but before all other checks */
void __init nmi_watchdog_default(void) void __init nmi_watchdog_default(void)
{ {
if (nmi_watchdog != NMI_DEFAULT) if (nmi_watchdog != NMI_DEFAULT)
return; return;
if (nmi_known_cpu())
/* For some reason the IO APIC watchdog doesn't work on the AMD
8111 chipset. For now switch to local APIC mode using
perfctr0 there. On Intel CPUs we don't have code to handle
the perfctr and the IO-APIC seems to work, so use that. */
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
nmi_watchdog = NMI_LOCAL_APIC; nmi_watchdog = NMI_LOCAL_APIC;
printk(KERN_INFO else
"Using local APIC NMI watchdog using perfctr0\n");
} else {
printk(KERN_INFO "Using IO APIC NMI watchdog\n");
nmi_watchdog = NMI_IO_APIC; nmi_watchdog = NMI_IO_APIC;
}
} }
/* Why is there no CPUID flag for this? */ #ifdef CONFIG_SMP
static __init int cpu_has_lapic(void) /* The performance counters used by NMI_LOCAL_APIC don't trigger when
* the CPU is idle. To make sure the NMI watchdog really ticks on all
* CPUs during the test make them busy.
*/
static __init void nmi_cpu_busy(void *data)
{ {
switch (boot_cpu_data.x86_vendor) { volatile int *endflag = data;
case X86_VENDOR_INTEL: local_irq_enable();
case X86_VENDOR_AMD: /* Intentionally don't use cpu_relax here. This is
return boot_cpu_data.x86 >= 6; to make sure that the performance counter really ticks,
/* .... add more cpus here or find a different way to figure this out. */ even if there is a simulator or similar that catches the
default: pause instruction. On a real HT machine this is fine because
return 0; all other CPUs are busy with "useless" delay loops and don't
} care if they get somewhat less cycles. */
while (*endflag == 0)
barrier();
} }
#endif
static int __init check_nmi_watchdog (void) int __init check_nmi_watchdog (void)
{ {
volatile int endflag = 0;
int *counts; int *counts;
int cpu; int cpu;
if (nmi_watchdog == NMI_NONE) counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
return 0; if (!counts)
if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) {
nmi_watchdog = NMI_NONE;
return -1; return -1;
}
counts = kmalloc(NR_CPUS * sizeof(int),GFP_KERNEL); printk(KERN_INFO "testing NMI watchdog ... ");
if (!counts) {
nmi_watchdog = NMI_NONE;
return 0;
}
printk(KERN_INFO "Testing NMI watchdog ... "); if (nmi_watchdog == NMI_LOCAL_APIC)
smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
for (cpu = 0; cpu < NR_CPUS; cpu++) for (cpu = 0; cpu < NR_CPUS; cpu++)
counts[cpu] = cpu_pda[cpu].__nmi_count; counts[cpu] = cpu_pda[cpu].__nmi_count;
...@@ -139,16 +161,22 @@ static int __init check_nmi_watchdog (void) ...@@ -139,16 +161,22 @@ static int __init check_nmi_watchdog (void)
mdelay((10*1000)/nmi_hz); // wait 10 ticks mdelay((10*1000)/nmi_hz); // wait 10 ticks
for (cpu = 0; cpu < NR_CPUS; cpu++) { for (cpu = 0; cpu < NR_CPUS; cpu++) {
if (!cpu_online(cpu))
continue;
if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
printk("CPU#%d: NMI appears to be stuck (%d)!\n", endflag = 1;
printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
cpu, cpu,
counts[cpu],
cpu_pda[cpu].__nmi_count); cpu_pda[cpu].__nmi_count);
nmi_active = 0; nmi_active = 0;
lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
nmi_perfctr_msr = 0;
kfree(counts); kfree(counts);
return -1; return -1;
} }
} }
endflag = 1;
printk("OK.\n"); printk("OK.\n");
/* now that we know it works we can reduce NMI frequency to /* now that we know it works we can reduce NMI frequency to
...@@ -159,8 +187,6 @@ static int __init check_nmi_watchdog (void) ...@@ -159,8 +187,6 @@ static int __init check_nmi_watchdog (void)
kfree(counts); kfree(counts);
return 0; return 0;
} }
/* Have this called later during boot so counters are updating */
late_initcall(check_nmi_watchdog);
int __init setup_nmi_watchdog(char *str) int __init setup_nmi_watchdog(char *str)
{ {
...@@ -193,7 +219,10 @@ static void disable_lapic_nmi_watchdog(void) ...@@ -193,7 +219,10 @@ static void disable_lapic_nmi_watchdog(void)
wrmsr(MSR_K7_EVNTSEL0, 0, 0); wrmsr(MSR_K7_EVNTSEL0, 0, 0);
break; break;
case X86_VENDOR_INTEL: case X86_VENDOR_INTEL:
wrmsr(MSR_IA32_EVNTSEL0, 0, 0); if (boot_cpu_data.x86 == 15) {
wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
}
break; break;
} }
nmi_active = -1; nmi_active = -1;
...@@ -261,7 +290,7 @@ void enable_timer_nmi_watchdog(void) ...@@ -261,7 +290,7 @@ void enable_timer_nmi_watchdog(void)
static int nmi_pm_active; /* nmi_active before suspend */ static int nmi_pm_active; /* nmi_active before suspend */
static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
{ {
nmi_pm_active = nmi_active; nmi_pm_active = nmi_active;
disable_lapic_nmi_watchdog(); disable_lapic_nmi_watchdog();
...@@ -308,22 +337,27 @@ late_initcall(init_lapic_nmi_sysfs); ...@@ -308,22 +337,27 @@ late_initcall(init_lapic_nmi_sysfs);
* Original code written by Keith Owens. * Original code written by Keith Owens.
*/ */
/*
 * Zero a run of consecutive MSRs.
 *
 * base: address of the first MSR to clear.
 * n:    number of MSRs to clear, i.e. base .. base+n-1.
 *
 * Both halves of each MSR are written as 0.  Nothing is done when n is 0.
 */
static void clear_msr_range(unsigned int base, unsigned int n)
{
	unsigned int end = base + n;
	unsigned int msr;

	for (msr = base; msr != end; msr++)
		wrmsr(msr, 0, 0);
}
static void setup_k7_watchdog(void) static void setup_k7_watchdog(void)
{ {
int i; int i;
unsigned int evntsel; unsigned int evntsel;
/* No check, so can start with slow frequency */
nmi_hz = 1;
/* XXX should check these in EFER */
nmi_perfctr_msr = MSR_K7_PERFCTR0; nmi_perfctr_msr = MSR_K7_PERFCTR0;
for(i = 0; i < 4; ++i) { for(i = 0; i < 4; ++i) {
/* Simulator may not support it */ /* Simulator may not support it */
if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) {
nmi_perfctr_msr = 0;
return; return;
}
wrmsrl(MSR_K7_PERFCTR0+i, 0UL); wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
} }
...@@ -333,12 +367,54 @@ static void setup_k7_watchdog(void) ...@@ -333,12 +367,54 @@ static void setup_k7_watchdog(void)
| K7_NMI_EVENT; | K7_NMI_EVENT;
wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz); wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
apic_write(APIC_LVTPC, APIC_DM_NMI); apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= K7_EVNTSEL_ENABLE; evntsel |= K7_EVNTSEL_ENABLE;
wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
} }
/*
 * Program the P4 performance counter IQ_COUNTER0 as the NMI watchdog source.
 *
 * Returns 0 without touching any counter state when MSR_P4_MISC_ENABLE
 * does not have the PERF_AVAIL bit set; returns 1 after the counter has
 * been programmed and enabled.  On the success path it records the counter
 * MSR in nmi_perfctr_msr and the CCCR value in nmi_p4_cccr_val, both of
 * which nmi_watchdog_tick() reads when re-arming the counter.
 */
static int setup_p4_watchdog(void)
{
unsigned int misc_enable, dummy;
/* Only the low half (misc_enable) is examined; the high half is discarded. */
rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
return 0;
nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
#ifdef CONFIG_SMP
/* With two siblings, also request the PMI1 overflow interrupt
   (NOTE(review): presumably so the second logical CPU gets the NMI
   as well -- confirm against the P4 documentation). */
if (smp_num_siblings == 2)
nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
#endif
/* Clear MSRs 0x3F1-0x3F2 only when the PEBS_UNAVAIL bit is clear. */
if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
clear_msr_range(0x3F1, 2);
/* MSR 0x3F0 seems to have a default value of 0xFC00, but current
docs doesn't fully define it, so leave it alone for now. */
if (boot_cpu_data.x86_model >= 0x3) {
/* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
clear_msr_range(0x3A0, 26);
clear_msr_range(0x3BC, 3);
} else {
clear_msr_range(0x3A0, 31);
}
/* Zero the remaining MSR ranges, including all 18 CCCRs and all 18
   counters, before programming the one counter used here. */
clear_msr_range(0x3C0, 6);
clear_msr_range(0x3C8, 6);
clear_msr_range(0x3E0, 2);
clear_msr_range(MSR_P4_CCCR0, 18);
clear_msr_range(MSR_P4_PERFCTR0, 18);
/* Select the event, then write the CCCR with the enable bit masked off
   so the counter stays stopped while it is being loaded. */
wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
/* Load the counter with the negated count (cpu_khz/nmi_hz*1000); the
   high half is written as all ones. */
wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
/* Set the local APIC LVTPC entry to NMI delivery mode. */
apic_write(APIC_LVTPC, APIC_DM_NMI);
/* Final CCCR write uses nmi_p4_cccr_val, which includes P4_CCCR_ENABLE. */
wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
return 1;
}
void setup_apic_nmi_watchdog(void) void setup_apic_nmi_watchdog(void)
{ {
switch (boot_cpu_data.x86_vendor) { switch (boot_cpu_data.x86_vendor) {
...@@ -349,6 +425,13 @@ void setup_apic_nmi_watchdog(void) ...@@ -349,6 +425,13 @@ void setup_apic_nmi_watchdog(void)
return; return;
setup_k7_watchdog(); setup_k7_watchdog();
break; break;
case X86_VENDOR_INTEL:
if (boot_cpu_data.x86 != 15)
return;
if (!setup_p4_watchdog())
return;
break;
default: default:
return; return;
} }
...@@ -363,56 +446,67 @@ void setup_apic_nmi_watchdog(void) ...@@ -363,56 +446,67 @@ void setup_apic_nmi_watchdog(void)
* *
* as these watchdog NMI IRQs are generated on every CPU, we only * as these watchdog NMI IRQs are generated on every CPU, we only
* have to check the current processor. * have to check the current processor.
*
* since NMIs don't listen to _any_ locks, we have to be extremely
* careful not to rely on unsafe variables. The printk might lock
* up though, so we have to break up any console locks first ...
* [when there will be more tty-related locks, break them up
* here too!]
*/ */
static unsigned int static DEFINE_PER_CPU(unsigned, last_irq_sum);
last_irq_sums [NR_CPUS], static DEFINE_PER_CPU(local_t, alert_counter);
alert_counter [NR_CPUS]; static DEFINE_PER_CPU(int, nmi_touch);
void touch_nmi_watchdog (void) void touch_nmi_watchdog (void)
{ {
int i; int i;
/* /*
* Just reset the alert counters, (other CPUs might be * Tell other CPUs to reset their alert counters. We cannot
* spinning on locks we hold): * do it ourselves because the alert count increase is not
* atomic.
*/ */
for (i = 0; i < NR_CPUS; i++) for (i = 0; i < NR_CPUS; i++)
alert_counter[i] = 0; per_cpu(nmi_touch, i) = 1;
} }
void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
{ {
int sum, cpu; int sum;
int touched = 0;
cpu = safe_smp_processor_id();
sum = read_pda(apic_timer_irqs); sum = read_pda(apic_timer_irqs);
if (last_irq_sums[cpu] == sum) { if (__get_cpu_var(nmi_touch)) {
__get_cpu_var(nmi_touch) = 0;
touched = 1;
}
if (!touched && __get_cpu_var(last_irq_sum) == sum) {
/* /*
* Ayiee, looks like this CPU is stuck ... * Ayiee, looks like this CPU is stuck ...
* wait a few IRQs (5 seconds) before doing the oops ... * wait a few IRQs (5 seconds) before doing the oops ...
*/ */
alert_counter[cpu]++; local_inc(&__get_cpu_var(alert_counter));
if (alert_counter[cpu] == 5*nmi_hz) { if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) {
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
== NOTIFY_STOP) { == NOTIFY_STOP) {
alert_counter[cpu] = 0; local_set(&__get_cpu_var(alert_counter), 0);
return; return;
} }
die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs);
} }
} else { } else {
last_irq_sums[cpu] = sum; __get_cpu_var(last_irq_sum) = sum;
alert_counter[cpu] = 0; local_set(&__get_cpu_var(alert_counter), 0);
}
if (nmi_perfctr_msr) {
if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
/*
* P4 quirks:
* - An overflown perfctr will assert its interrupt
* until the OVF flag in its CCCR is cleared.
* - LVTPC is masked on interrupt and must be
* unmasked by the LVTPC handler.
*/
wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
apic_write(APIC_LVTPC, APIC_DM_NMI);
} }
if (nmi_perfctr_msr)
wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
}
} }
static int dummy_nmi_callback(struct pt_regs * regs, int cpu) static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
......
...@@ -56,6 +56,7 @@ ...@@ -56,6 +56,7 @@
#include <asm/kdebug.h> #include <asm/kdebug.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/proto.h> #include <asm/proto.h>
#include <asm/nmi.h>
/* Change for real CPU hotplug. Note other files need to be fixed /* Change for real CPU hotplug. Note other files need to be fixed
first too. */ first too. */
...@@ -1030,4 +1031,6 @@ void __cpuinit smp_cpus_done(unsigned int max_cpus) ...@@ -1030,4 +1031,6 @@ void __cpuinit smp_cpus_done(unsigned int max_cpus)
detect_siblings(); detect_siblings();
time_init_gtod(); time_init_gtod();
check_nmi_watchdog();
} }
...@@ -54,4 +54,6 @@ extern void die_nmi(char *str, struct pt_regs *regs); ...@@ -54,4 +54,6 @@ extern void die_nmi(char *str, struct pt_regs *regs);
extern int panic_on_timeout; extern int panic_on_timeout;
extern int unknown_nmi_panic; extern int unknown_nmi_panic;
extern int check_nmi_watchdog(void);
#endif /* ASM_NMI_H */ #endif /* ASM_NMI_H */
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册