/*
 *  linux/arch/i386/nmi.c
 *
 *  NMI watchdog support on APIC systems
 *
 *  Started by Ingo Molnar <mingo@redhat.com>
 *
 *  Fixes:
 *  Mikael Pettersson	: AMD K7 support for local APIC NMI watchdog.
 *  Mikael Pettersson	: Power Management for local APIC NMI watchdog.
 *  Mikael Pettersson	: Pentium 4 support for local APIC NMI watchdog.
 *  Pavel Machek and
 *  Mikael Pettersson	: PM converted to driver model. Disable/enable API.
 */

#include <linux/config.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/sysdev.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>

#include <asm/smp.h>
#include <asm/nmi.h>
#include <asm/kdebug.h>

#include "mach_traps.h"

/* perfctr_nmi_owner tracks the ownership of the perfctr registers;
 * evntsel_nmi_owner tracks the ownership of the event selection registers.
 * - different performance counters / event selects may be reserved by
 *   different subsystems; this reservation system just tries to coordinate
 *   things a little
 */
static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);

/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
 * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
 */
#define NMI_MAX_COUNTER_BITS 66

/* nmi_active:
 * >0: the lapic NMI watchdog is active, but can be disabled
 * <0: the lapic NMI watchdog has not been set up, and cannot
 *     be enabled
 *  0: the lapic NMI watchdog is disabled, but can be enabled
 */
atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */

unsigned int nmi_watchdog = NMI_DEFAULT;
static unsigned int nmi_hz = HZ;

struct nmi_watchdog_ctlblk {
	int enabled;       /* nonzero once the watchdog runs on this CPU */
	u64 check_bit;     /* counter bit tested in the NMI handler; still set means no overflow */
	unsigned int cccr_msr;
	unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
	unsigned int evntsel_msr;  /* the MSR to select the events to handle */
};
static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);

/* local prototypes */
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);

extern void show_registers(struct pt_regs *regs);
extern int unknown_nmi_panic;

/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
	/* returns the bit offset of the performance counter register */
	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_AMD:
		return (msr - MSR_K7_PERFCTR0);
	case X86_VENDOR_INTEL:
		switch (boot_cpu_data.x86) {
		case 6:
			return (msr - MSR_P6_PERFCTR0);
		case 15:
			return (msr - MSR_P4_BPU_PERFCTR0);
		}
	}
	return 0;
}

/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
{
	/* returns the bit offset of the event selection register */
	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_AMD:
		return (msr - MSR_K7_EVNTSEL0);
	case X86_VENDOR_INTEL:
		switch (boot_cpu_data.x86) {
		case 6:
			return (msr - MSR_P6_EVNTSEL0);
		case 15:
			return (msr - MSR_P4_BSU_ESCR0);
		}
	}
	return 0;
}

/* checks for a bit availability (hack for oprofile) */
int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
{
	BUG_ON(counter > NMI_MAX_COUNTER_BITS);

	return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
}

/* checks an msr for availability */
int avail_to_resrv_perfctr_nmi(unsigned int msr)
{
	unsigned int counter;

	counter = nmi_perfctr_msr_to_bit(msr);
	BUG_ON(counter > NMI_MAX_COUNTER_BITS);

	return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
}

int reserve_perfctr_nmi(unsigned int msr)
{
	unsigned int counter;

	counter = nmi_perfctr_msr_to_bit(msr);
	BUG_ON(counter > NMI_MAX_COUNTER_BITS);

	if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
		return 1;
	return 0;
}

void release_perfctr_nmi(unsigned int msr)
{
	unsigned int counter;

	counter = nmi_perfctr_msr_to_bit(msr);
	BUG_ON(counter > NMI_MAX_COUNTER_BITS);

	clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
}

int reserve_evntsel_nmi(unsigned int msr)
{
	unsigned int counter;

	counter = nmi_evntsel_msr_to_bit(msr);
	BUG_ON(counter > NMI_MAX_COUNTER_BITS);

	if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
		return 1;
	return 0;
}

void release_evntsel_nmi(unsigned int msr)
{
	unsigned int counter;

	counter = nmi_evntsel_msr_to_bit(msr);
	BUG_ON(counter > NMI_MAX_COUNTER_BITS);

	clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
}
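
/*
 * A sketch of the intended calling pattern for the reservation helpers
 * above (not lifted verbatim from any in-tree user): a subsystem such as
 * oprofile or the lapic watchdog reserves both the counter and its event
 * select MSR before programming them, and releases them when done.
 * The reserve_*() helpers return 1 on success, 0 if already taken.
 *
 *	if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0))
 *		return 0;
 *	if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) {
 *		release_perfctr_nmi(MSR_K7_PERFCTR0);
 *		return 0;
 *	}
 *	... program MSR_K7_EVNTSEL0 / MSR_K7_PERFCTR0 ...
 *	release_evntsel_nmi(MSR_K7_EVNTSEL0);
 *	release_perfctr_nmi(MSR_K7_PERFCTR0);
 */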

static __cpuinit inline int nmi_known_cpu(void)
{
	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_AMD:
		return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
	case X86_VENDOR_INTEL:
		return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
	}
	return 0;
}

#ifdef CONFIG_SMP
/* The performance counters used by NMI_LOCAL_APIC don't trigger when
 * the CPU is idle. To make sure the NMI watchdog really ticks on all
 * CPUs during the test make them busy.
 */
static __init void nmi_cpu_busy(void *data)
{
	volatile int *endflag = data;
	local_irq_enable_in_hardirq();
	/* Intentionally don't use cpu_relax here. This is
	   to make sure that the performance counter really ticks,
	   even if there is a simulator or similar that catches the
	   pause instruction. On a real HT machine this is fine because
	   all other CPUs are busy with "useless" delay loops and don't
	   care if they get somewhat fewer cycles. */
	while (*endflag == 0)
		barrier();
}
#endif

static int __init check_nmi_watchdog(void)
{
	volatile int endflag = 0;
	unsigned int *prev_nmi_count;
	int cpu;

	if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
		return 0;

	if (!atomic_read(&nmi_active))
		return 0;

	prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
	if (!prev_nmi_count)
		return -1;

	printk(KERN_INFO "Testing NMI watchdog ... ");

	if (nmi_watchdog == NMI_LOCAL_APIC)
		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);

	for_each_possible_cpu(cpu)
		prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
	local_irq_enable();
	mdelay((10*1000)/nmi_hz); // wait 10 ticks

	for_each_possible_cpu(cpu) {
#ifdef CONFIG_SMP
		/* Check cpu_callin_map here because that is set
		   after the timer is started. */
		if (!cpu_isset(cpu, cpu_callin_map))
			continue;
#endif
		if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
			continue;
		if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
			printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
				cpu,
				prev_nmi_count[cpu],
				nmi_count(cpu));
			per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
			atomic_dec(&nmi_active);
		}
	}
	if (!atomic_read(&nmi_active)) {
		kfree(prev_nmi_count);
		atomic_set(&nmi_active, -1);
		return -1;
	}
	endflag = 1;
	printk("OK.\n");

	/* now that we know it works we can reduce NMI frequency to
	   something more reasonable; makes a difference in some configs */
	if (nmi_watchdog == NMI_LOCAL_APIC)
		nmi_hz = 1;

	kfree(prev_nmi_count);
	return 0;
}
/* This needs to happen later in boot so counters are working */
late_initcall(check_nmi_watchdog);

static int __init setup_nmi_watchdog(char *str)
{
	int nmi;

	get_option(&str, &nmi);

	if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
		return 0;
	/*
	 * If any other x86 CPU has a local APIC, then
	 * please test the NMI stuff there and send me the
	 * missing bits. Right now Intel P6/P4 and AMD K7 only.
	 */
	if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0))
		return 0;  /* no lapic support */
	nmi_watchdog = nmi;
	return 1;
}

__setup("nmi_watchdog=", setup_nmi_watchdog);

static void disable_lapic_nmi_watchdog(void)
{
	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);

	if (atomic_read(&nmi_active) <= 0)
		return;

	on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);

	BUG_ON(atomic_read(&nmi_active) != 0);
}

static void enable_lapic_nmi_watchdog(void)
{
	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);

	/* are we already enabled */
	if (atomic_read(&nmi_active) != 0)
		return;

	/* are we lapic aware */
	if (nmi_known_cpu() <= 0)
		return;

	on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
	touch_nmi_watchdog();
}

void disable_timer_nmi_watchdog(void)
{
	BUG_ON(nmi_watchdog != NMI_IO_APIC);

	if (atomic_read(&nmi_active) <= 0)
		return;

	disable_irq(0);
	on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);

	BUG_ON(atomic_read(&nmi_active) != 0);
}

void enable_timer_nmi_watchdog(void)
{
	BUG_ON(nmi_watchdog != NMI_IO_APIC);

	if (atomic_read(&nmi_active) == 0) {
		touch_nmi_watchdog();
		on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
		enable_irq(0);
	}
}

#ifdef CONFIG_PM

static int nmi_pm_active; /* nmi_active before suspend */

static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
{
	/* only CPU0 goes here, other CPUs should be offline */
	nmi_pm_active = atomic_read(&nmi_active);
	stop_apic_nmi_watchdog(NULL);
	BUG_ON(atomic_read(&nmi_active) != 0);
	return 0;
}

static int lapic_nmi_resume(struct sys_device *dev)
{
	/* only CPU0 goes here, other CPUs should be offline */
	if (nmi_pm_active > 0) {
		setup_apic_nmi_watchdog(NULL);
		touch_nmi_watchdog();
	}
	return 0;
}


static struct sysdev_class nmi_sysclass = {
	set_kset_name("lapic_nmi"),
	.resume		= lapic_nmi_resume,
	.suspend	= lapic_nmi_suspend,
};

static struct sys_device device_lapic_nmi = {
	.id	= 0,
	.cls	= &nmi_sysclass,
};

static int __init init_lapic_nmi_sysfs(void)
{
	int error;

	/* should really be a BUG_ON but b/c this is an
	 * init call, it just doesn't work.  -dcz
	 */
	if (nmi_watchdog != NMI_LOCAL_APIC)
		return 0;

	if ( atomic_read(&nmi_active) < 0 )
		return 0;

	error = sysdev_class_register(&nmi_sysclass);
	if (!error)
		error = sysdev_register(&device_lapic_nmi);
	return error;
}
/* must come after the local APIC's device_initcall() */
late_initcall(init_lapic_nmi_sysfs);

#endif	/* CONFIG_PM */

/*
 * Activate the NMI watchdog via the local APIC.
 * Original code written by Keith Owens.
 */

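/*
 * The counter is programmed with -(cpu_khz * 1000 / nmi_hz), i.e. the
 * (approximate) number of CPU cycles in 1/nmi_hz seconds, so that an
 * unhalted-cycle event overflows it (and raises the perfctr NMI) roughly
 * nmi_hz times per second.
 */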
static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
{
	u64 count = (u64)cpu_khz * 1000;

	do_div(count, nmi_hz);
	if(descr)
		Dprintk("setting %s to -0x%08Lx\n", descr, count);
	wrmsrl(perfctr_msr, 0 - count);
}

/* Note that these events don't tick when the CPU idles. This means
   the frequency varies with CPU load. */

#define K7_EVNTSEL_ENABLE	(1 << 22)
#define K7_EVNTSEL_INT		(1 << 20)
#define K7_EVNTSEL_OS		(1 << 17)
#define K7_EVNTSEL_USR		(1 << 16)
#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
#define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING

static int setup_k7_watchdog(void)
{
	unsigned int perfctr_msr, evntsel_msr;
	unsigned int evntsel;
	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

	perfctr_msr = MSR_K7_PERFCTR0;
	evntsel_msr = MSR_K7_EVNTSEL0;
	if (!reserve_perfctr_nmi(perfctr_msr))
		goto fail;

	if (!reserve_evntsel_nmi(evntsel_msr))
		goto fail1;

	wrmsrl(perfctr_msr, 0UL);

	evntsel = K7_EVNTSEL_INT
		| K7_EVNTSEL_OS
		| K7_EVNTSEL_USR
		| K7_NMI_EVENT;

	/* setup the timer */
	wrmsr(evntsel_msr, evntsel, 0);
	write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
	apic_write(APIC_LVTPC, APIC_DM_NMI);
	evntsel |= K7_EVNTSEL_ENABLE;
	wrmsr(evntsel_msr, evntsel, 0);

	wd->perfctr_msr = perfctr_msr;
	wd->evntsel_msr = evntsel_msr;
	wd->cccr_msr = 0;  //unused
	wd->check_bit = 1ULL<<63;
	return 1;
fail1:
	release_perfctr_nmi(perfctr_msr);
fail:
	return 0;
}

static void stop_k7_watchdog(void)
{
	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

	wrmsr(wd->evntsel_msr, 0, 0);

	release_evntsel_nmi(wd->evntsel_msr);
	release_perfctr_nmi(wd->perfctr_msr);
}

#define P6_EVNTSEL0_ENABLE	(1 << 22)
#define P6_EVNTSEL_INT		(1 << 20)
#define P6_EVNTSEL_OS		(1 << 17)
#define P6_EVNTSEL_USR		(1 << 16)
#define P6_EVENT_CPU_CLOCKS_NOT_HALTED	0x79
#define P6_NMI_EVENT		P6_EVENT_CPU_CLOCKS_NOT_HALTED

static int setup_p6_watchdog(void)
{
	unsigned int perfctr_msr, evntsel_msr;
	unsigned int evntsel;
	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

	perfctr_msr = MSR_P6_PERFCTR0;
	evntsel_msr = MSR_P6_EVNTSEL0;
	if (!reserve_perfctr_nmi(perfctr_msr))
		goto fail;

	if (!reserve_evntsel_nmi(evntsel_msr))
		goto fail1;

	wrmsrl(perfctr_msr, 0UL);

	evntsel = P6_EVNTSEL_INT
		| P6_EVNTSEL_OS
		| P6_EVNTSEL_USR
		| P6_NMI_EVENT;

	/* setup the timer */
	wrmsr(evntsel_msr, evntsel, 0);
	write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
	apic_write(APIC_LVTPC, APIC_DM_NMI);
	evntsel |= P6_EVNTSEL0_ENABLE;
	wrmsr(evntsel_msr, evntsel, 0);

	wd->perfctr_msr = perfctr_msr;
	wd->evntsel_msr = evntsel_msr;
	wd->cccr_msr = 0;  //unused
	wd->check_bit = 1ULL<<39;
	return 1;
fail1:
	release_perfctr_nmi(perfctr_msr);
fail:
	return 0;
}

static void stop_p6_watchdog(void)
{
	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

	wrmsr(wd->evntsel_msr, 0, 0);

	release_evntsel_nmi(wd->evntsel_msr);
	release_perfctr_nmi(wd->perfctr_msr);
}

/* Note that these events don't tick when the CPU idles. This means
   the frequency varies with CPU load. */

#define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)
#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
#define P4_ESCR_OS		(1<<3)
#define P4_ESCR_USR		(1<<2)
#define P4_CCCR_OVF_PMI0	(1<<26)
#define P4_CCCR_OVF_PMI1	(1<<27)
#define P4_CCCR_THRESHOLD(N)	((N)<<20)
#define P4_CCCR_COMPLEMENT	(1<<19)
#define P4_CCCR_COMPARE		(1<<18)
#define P4_CCCR_REQUIRED	(3<<16)
#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
#define P4_CCCR_ENABLE		(1<<12)
#define P4_CCCR_OVF 		(1<<31)
/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
   CRU_ESCR0 (with any non-null event selector) through a complemented
   max threshold. [IA32-Vol3, Section 14.9.9] */

static int setup_p4_watchdog(void)
{
	unsigned int perfctr_msr, evntsel_msr, cccr_msr;
	unsigned int evntsel, cccr_val;
	unsigned int misc_enable, dummy;
	unsigned int ht_num;
	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

	rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
	if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
		return 0;

#ifdef CONFIG_SMP
	/* detect which hyperthread we are on */
	if (smp_num_siblings == 2) {
		unsigned int ebx, apicid;

        	ebx = cpuid_ebx(1);
	        apicid = (ebx >> 24) & 0xff;
        	ht_num = apicid & 1;
	} else
#endif
		ht_num = 0;

	/* performance counters are shared resources
	 * assign each hyperthread its own set
	 * (re-use the ESCR0 register, seems safe
	 * and keeps the cccr_val the same)
	 */
	if (!ht_num) {
		/* logical cpu 0 */
		perfctr_msr = MSR_P4_IQ_PERFCTR0;
		evntsel_msr = MSR_P4_CRU_ESCR0;
		cccr_msr = MSR_P4_IQ_CCCR0;
		cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
	} else {
		/* logical cpu 1 */
		perfctr_msr = MSR_P4_IQ_PERFCTR1;
		evntsel_msr = MSR_P4_CRU_ESCR0;
		cccr_msr = MSR_P4_IQ_CCCR1;
		cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
	}

	if (!reserve_perfctr_nmi(perfctr_msr))
		goto fail;

	if (!reserve_evntsel_nmi(evntsel_msr))
		goto fail1;

	evntsel = P4_ESCR_EVENT_SELECT(0x3F)
	 	| P4_ESCR_OS
		| P4_ESCR_USR;

	cccr_val |= P4_CCCR_THRESHOLD(15)
		 | P4_CCCR_COMPLEMENT
		 | P4_CCCR_COMPARE
		 | P4_CCCR_REQUIRED;

	wrmsr(evntsel_msr, evntsel, 0);
	wrmsr(cccr_msr, cccr_val, 0);
	write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
	apic_write(APIC_LVTPC, APIC_DM_NMI);
	cccr_val |= P4_CCCR_ENABLE;
	wrmsr(cccr_msr, cccr_val, 0);
	wd->perfctr_msr = perfctr_msr;
	wd->evntsel_msr = evntsel_msr;
	wd->cccr_msr = cccr_msr;
	wd->check_bit = 1ULL<<39;
	return 1;
fail1:
	release_perfctr_nmi(perfctr_msr);
fail:
	return 0;
}

static void stop_p4_watchdog(void)
{
	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

	wrmsr(wd->cccr_msr, 0, 0);
	wrmsr(wd->evntsel_msr, 0, 0);

	release_evntsel_nmi(wd->evntsel_msr);
	release_perfctr_nmi(wd->perfctr_msr);
}

void setup_apic_nmi_watchdog (void *unused)
{
	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

	/* only support LOCAL and IO APICs for now */
	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
	    (nmi_watchdog != NMI_IO_APIC))
	    	return;

	if (wd->enabled == 1)
		return;

	/* cheap hack to support suspend/resume */
	/* if cpu0 is not active neither should the other cpus */
	if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
		return;

	if (nmi_watchdog == NMI_LOCAL_APIC) {
		switch (boot_cpu_data.x86_vendor) {
		case X86_VENDOR_AMD:
			if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
				return;
			if (!setup_k7_watchdog())
				return;
			break;
		case X86_VENDOR_INTEL:
			switch (boot_cpu_data.x86) {
			case 6:
				if (boot_cpu_data.x86_model > 0xd)
					return;

				if (!setup_p6_watchdog())
					return;
				break;
			case 15:
				if (boot_cpu_data.x86_model > 0x4)
					return;

				if (!setup_p4_watchdog())
					return;
				break;
			default:
				return;
			}
			break;
		default:
			return;
		}
	}
	wd->enabled = 1;
	atomic_inc(&nmi_active);
}

void stop_apic_nmi_watchdog(void *unused)
{
	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

	/* only support LOCAL and IO APICs for now */
	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
	    (nmi_watchdog != NMI_IO_APIC))
	    	return;

	if (wd->enabled == 0)
		return;

	if (nmi_watchdog == NMI_LOCAL_APIC) {
		switch (boot_cpu_data.x86_vendor) {
		case X86_VENDOR_AMD:
			stop_k7_watchdog();
			break;
		case X86_VENDOR_INTEL:
			switch (boot_cpu_data.x86) {
			case 6:
				if (boot_cpu_data.x86_model > 0xd)
					break;
				stop_p6_watchdog();
				break;
			case 15:
				if (boot_cpu_data.x86_model > 0x4)
					break;
				stop_p4_watchdog();
				break;
			}
			break;
		default:
			return;
		}
	}
	wd->enabled = 0;
	atomic_dec(&nmi_active);
}

/*
 * the best way to detect whether a CPU has a 'hard lockup' problem
 * is to check its local APIC timer IRQ counts. If they are not
 * changing then that CPU has some problem.
 *
 * as these watchdog NMI IRQs are generated on every CPU, we only
 * have to check the current processor.
 *
 * since NMIs don't listen to _any_ locks, we have to be extremely
 * careful not to rely on unsafe variables. The printk might lock
 * up though, so we have to break up any console locks first ...
 * [when there will be more tty-related locks, break them up
 *  here too!]
 */

static unsigned int
	last_irq_sums [NR_CPUS],
	alert_counter [NR_CPUS];

void touch_nmi_watchdog (void)
{
	int i;

	/*
	 * Just reset the alert counters, (other CPUs might be
	 * spinning on locks we hold):
	 */
	for_each_possible_cpu(i)
		alert_counter[i] = 0;

	/*
	 * Tickle the softlockup detector too:
	 */
	touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);

extern void die_nmi(struct pt_regs *, const char *msg);

int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
{

	/*
	 * Since current_thread_info()-> is always on the stack, and we
	 * always switch the stack NMI-atomically, it's safe to use
	 * smp_processor_id().
	 */
	unsigned int sum;
	int touched = 0;
	int cpu = smp_processor_id();
	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
	u64 dummy;
	int rc=0;

	/* check for other users first */
	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
			== NOTIFY_STOP) {
		rc = 1;
		touched = 1;
	}

	sum = per_cpu(irq_stat, cpu).apic_timer_irqs;

	/* if the apic timer isn't firing, this cpu isn't doing much */
	if (!touched && last_irq_sums[cpu] == sum) {
		/*
		 * Ayiee, looks like this CPU is stuck ...
		 * wait a few IRQs (5 seconds) before doing the oops ...
		 */
		alert_counter[cpu]++;
		if (alert_counter[cpu] == 5*nmi_hz)
			/*
			 * die_nmi will return ONLY if NOTIFY_STOP happens..
			 */
			die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
	} else {
		last_irq_sums[cpu] = sum;
		alert_counter[cpu] = 0;
	}
	/* see if the nmi watchdog went off */
	if (wd->enabled) {
		if (nmi_watchdog == NMI_LOCAL_APIC) {
			rdmsrl(wd->perfctr_msr, dummy);
			if (dummy & wd->check_bit){
				/* this wasn't a watchdog timer interrupt */
				goto done;
			}

			/* only Intel P4 uses the cccr msr */
	 		if (wd->cccr_msr != 0) {
	 			/*
	 			 * P4 quirks:
	 			 * - An overflown perfctr will assert its interrupt
	 			 *   until the OVF flag in its CCCR is cleared.
	 			 * - LVTPC is masked on interrupt and must be
	 			 *   unmasked by the LVTPC handler.
	 			 */
				rdmsrl(wd->cccr_msr, dummy);
				dummy &= ~P4_CCCR_OVF;
	 			wrmsrl(wd->cccr_msr, dummy);
	 			apic_write(APIC_LVTPC, APIC_DM_NMI);
	 		}
			else if (wd->perfctr_msr == MSR_P6_PERFCTR0) {
				/* Only P6 based Pentium M needs to re-unmask
				 * the apic vector, but it doesn't hurt
				 * other P6 variants */
				apic_write(APIC_LVTPC, APIC_DM_NMI);
			}
			/* start the cycle over again */
			write_watchdog_counter(wd->perfctr_msr, NULL);
			rc = 1;
		} else if (nmi_watchdog == NMI_IO_APIC) {
			/* don't know how to accurately check for this.
			 * just assume it was a watchdog timer interrupt
			 * This matches the old behaviour.
			 */
			rc = 1;
		} else
			printk(KERN_WARNING "Unknown enabled NMI hardware?!\n");
	}
done:
	return rc;
}

int do_nmi_callback(struct pt_regs * regs, int cpu)
{
#ifdef CONFIG_SYSCTL
	if (unknown_nmi_panic)
		return unknown_nmi_panic_callback(regs, cpu);
#endif
	return 0;
}

#ifdef CONFIG_SYSCTL

static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
{
	unsigned char reason = get_nmi_reason();
	char buf[64];

	sprintf(buf, "NMI received for unknown reason %02x\n", reason);
	die_nmi(regs, buf);
	return 0;
}

/*
 * proc handler for /proc/sys/kernel/nmi
 */
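/*
 * Sketch of the intended use, assuming the sysctl path named above:
 * writing 0 or 1 from user space (e.g. "echo 0 > /proc/sys/kernel/nmi")
 * ends up in this handler and disables or enables the lapic NMI watchdog
 * on all CPUs at run time.
 */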
int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	int old_state;

	nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
	old_state = nmi_watchdog_enabled;
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (!!old_state == !!nmi_watchdog_enabled)
		return 0;

	if (atomic_read(&nmi_active) < 0) {
		printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
		return -EIO;
	}

	if (nmi_watchdog == NMI_DEFAULT) {
		if (nmi_known_cpu() > 0)
			nmi_watchdog = NMI_LOCAL_APIC;
		else
			nmi_watchdog = NMI_IO_APIC;
	}

	if (nmi_watchdog == NMI_LOCAL_APIC) {
		if (nmi_watchdog_enabled)
			enable_lapic_nmi_watchdog();
		else
			disable_lapic_nmi_watchdog();
	} else {
		printk( KERN_WARNING
			"NMI watchdog doesn't know what hardware to touch\n");
		return -EIO;
	}
	return 0;
}

#endif

EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
EXPORT_SYMBOL(reserve_perfctr_nmi);
EXPORT_SYMBOL(release_perfctr_nmi);
EXPORT_SYMBOL(reserve_evntsel_nmi);
EXPORT_SYMBOL(release_evntsel_nmi);
EXPORT_SYMBOL(disable_timer_nmi_watchdog);
EXPORT_SYMBOL(enable_timer_nmi_watchdog);