mce.c 32.3 KB
Newer Older
L
Linus Torvalds 已提交
1 2
/*
 * Machine check handler.
I
Ingo Molnar 已提交
3
 *
L
Linus Torvalds 已提交
4
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 6
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
7 8
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
L
Linus Torvalds 已提交
9
 */
I
Ingo Molnar 已提交
10 11 12
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
13
#include <linux/interrupt.h>
I
Ingo Molnar 已提交
14 15 16 17
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
18
#include <linux/uaccess.h>
I
Ingo Molnar 已提交
19 20 21
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
L
Linus Torvalds 已提交
22 23
#include <linux/string.h>
#include <linux/sysdev.h>
24
#include <linux/ctype.h>
I
Ingo Molnar 已提交
25
#include <linux/sched.h>
26
#include <linux/sysfs.h>
I
Ingo Molnar 已提交
27 28 29 30 31
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/cpu.h>
32
#include <linux/smp.h>
I
Ingo Molnar 已提交
33 34
#include <linux/fs.h>

35
#include <asm/processor.h>
36 37
#include <asm/hw_irq.h>
#include <asm/apic.h>
38
#include <asm/idle.h>
39
#include <asm/ipi.h>
I
Ingo Molnar 已提交
40 41
#include <asm/mce.h>
#include <asm/msr.h>
L
Linus Torvalds 已提交
42

43
#include "mce-internal.h"
44 45
#include "mce.h"

46 47 48 49 50 51 52 53 54 55
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	/* Reached only if int18 fires before mcheck_init() installed a handler */
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;
56 57 58

int				mce_disabled;

59
#ifdef CONFIG_X86_NEW_MCE
60

I
Ingo Molnar 已提交
61
#define MISC_MCELOG_MINOR	227
62

63 64
atomic_t mce_entry;

65 66
DEFINE_PER_CPU(unsigned, mce_exception_count);

67 68 69 70 71 72 73
/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
I
Ingo Molnar 已提交
74 75 76 77 78 79
static int			tolerant = 1;
static int			banks;
static u64			*bank;
static unsigned long		notify_user;
static int			rip_msr;
static int			mce_bootlog = -1;
80

I
Ingo Molnar 已提交
81 82
static char			trigger[128];
static char			*trigger_argv[2] = { trigger, NULL };
L
Linus Torvalds 已提交
83

84 85
static unsigned long		dont_init_banks;

86 87
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

88 89 90 91 92
/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

93 94 95 96 97
/*
 * Some banks must never have their CTL register written (see
 * dont_init_banks quirk handling); return 1 for such a bank index.
 */
static inline int skip_bank_init(int i)
{
	if (i >= BITS_PER_LONG)
		return 0;
	return test_bit(i, &dont_init_banks) ? 1 : 0;
}

98 99 100 101
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	/* Record which CPU logged the event */
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

115 116 117
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

L
Linus Torvalds 已提交
118 119 120 121 122 123
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

124
static struct mce_log mcelog = {
125 126 127
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
128
};
L
Linus Torvalds 已提交
129 130 131 132

/*
 * Append one record to the global mcelog ring buffer.
 * Lockless: entries are claimed with cmpxchg on mcelog.next and
 * published via the finished flag, so it can run in MCE/NMI context.
 */
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		/* Claim the slot; retry if another logger raced us */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	/* Publish: readers only trust entries with finished set */
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &notify_user);
}

/* Dump one MCE record to the console in the mcelog --ascii format */
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		/* RIP is only trustworthy when EIPV is set */
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
			m->cpuvendor, m->cpuid, m->time, m->socketid,
			m->apicid);
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

199
/*
 * Panic after dumping all still-unlogged MCE records.
 * @msg:   panic message
 * @final: the record that triggered the panic (printed last), may be NULL
 * @exp:   optional severity explanation string from mce_severity()
 */
static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	bust_spinlocks(1);
	console_verbose();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if ((m->status & MCI_STATUS_VAL) &&
			!(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		/* Skip the final record here; it is printed below */
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	panic(msg);
}
L
Linus Torvalds 已提交
226

227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244
/* Support code for software error injection */

/*
 * Map an MSR number onto the matching field offset in struct mce for
 * the currently injected bank, or -1 if the MSR is not emulated.
 */
static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);
	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MC0_STATUS + bank*4)
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MC0_ADDR + bank*4)
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MC0_MISC + bank*4)
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

245 246 247 248
/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;
	/* When an injected record is pending, read from it instead of the HW */
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);
		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

/* Write an MCE MSR, redirecting to the injected record when active */
static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);
		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

A
Andi Kleen 已提交
270
int mce_available(struct cpuinfo_x86 *c)
L
Linus Torvalds 已提交
271
{
272
	if (mce_disabled)
273
		return 0;
274
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
L
Linus Torvalds 已提交
275 276
}

277 278 279
/*
 * Fill in the instruction pointer for an MCE record: from pt_regs when
 * RIPV says the saved IP is valid, optionally overridden by the
 * dedicated RIP MSR when the CPU provides one.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		m->ip = mce_rdmsrl(rip_msr);
		m->cs = 0;
	}
}

294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341
#ifdef CONFIG_X86_LOCAL_APIC 
/*
 * Called after interrupts have been reenabled again
 * when a MCE happened during an interrupts off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	/* Self-IPI sent by mce_report_event(); deliver the deferred notify */
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_user();
	irq_exit();
}
#endif

/*
 * Notify userspace about a logged event, either directly (when the
 * interrupted context allows it) or deferred via an APIC self-IPI.
 */
static void mce_report_event(struct pt_regs *regs)
{
	/* Interrupts were on (or vm86): safe to notify right here */
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_user();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger an self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

342 343
/* Per-CPU count of poll invocations, exported via sysfs elsewhere */
DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		/* Skip disabled banks and banks not in the caller's mask */
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);
414

415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430
/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	/* NOTE: overwrites m->status with each bank's raw status in turn */
	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

431 432 433 434 435 436 437
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	int panicm_found = 0;
	int i;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	/* Scan all banks once up front to decide if recovery is possible */
	no_way_out = mce_no_way_out(&m, &msg);

	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll
		 * Leave them alone, unless this panics.
		 */
		if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC)
				kill_it = 1;
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/*
		 * Did this bank cause the exception?
		 *
		 * Assume that the bank with uncorrectable errors did it,
		 * and that there is only a single one:
		 */
		if ((m.status & MCI_STATUS_UC) &&
					(m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/*
	 * If we didn't find an uncorrectable error, pick
	 * the last one (shouldn't happen, just being safe).
	 */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check", &panicm, msg);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	mce_report_event(regs);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
L
Linus Torvalds 已提交
598

599 600 601
#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
S
Simon Arlott 已提交
602
 * @cpu: The CPU on which the event occurred.
603 604 605 606 607 608 609 610 611 612
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
613
void mce_log_therm_throt_event(__u64 status)
614 615 616
{
	struct mce m;

617
	mce_setup(&m);
618 619 620 621 622 623
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

L
Linus Torvalds 已提交
624
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

/* Timer callback; 'data' is the CPU number the timer belongs to */
static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				&__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_user())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	/* Re-arm ourselves with the adapted interval */
	t->expires = jiffies + *n;
	add_timer(t);
}

660 661 662 663 664 665 666
/* Run the user-configured trigger program (set via sysfs) in process context */
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

667
/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 *
 * Returns 1 when new events were pending, 0 otherwise.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &notify_user)) {
		/* Wake any /dev/mcelog poll()/read() waiters */
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_user);
698

699
/*
L
Linus Torvalds 已提交
700 701
 * Initialize Machine Checks for a CPU.
 */
702
static int mce_cap_init(void)
L
Linus Torvalds 已提交
703
{
704
	unsigned b;
I
Ingo Molnar 已提交
705
	u64 cap;
L
Linus Torvalds 已提交
706 707

	rdmsrl(MSR_IA32_MCG_CAP, cap);
708 709

	b = cap & MCG_BANKCNT_MASK;
710 711
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

712 713 714 715 716 717 718 719 720 721 722 723 724 725 726
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
L
Linus Torvalds 已提交
727
	}
728

729
	/* Use accurate RIP reporting if available. */
730
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
731
		rip_msr = MSR_IA32_MCG_EIP;
L
Linus Torvalds 已提交
732

733 734 735
	return 0;
}

736
/* Enable machine checks on the current CPU and flush leftover events */
static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		/* Some banks must not have their CTL written (quirks) */
		if (skip_bank_init(i))
			continue;
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&bank[4]);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		 if (c->x86 == 6)
			bank[0] = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */

		if (c->x86 == 6 && c->x86_model < 0x1A)
			__set_bit(0, &dont_init_banks);
	}
}
L
Linus Torvalds 已提交
804

805 806 807 808 809 810 811 812 813 814 815 816 817 818 819
/* Set up the old-style (pre-MCA) family-5 machine check handlers */
static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		/* Pentium classic: only when enabled via mce= */
		if (mce_p5_enabled())
			intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

820
static void mce_cpu_features(struct cpuinfo_x86 *c)
L
Linus Torvalds 已提交
821 822 823 824 825
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
826 827 828
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
L
Linus Torvalds 已提交
829 830 831 832 833
	default:
		break;
	}
}

834 835 836
/* Arm this CPU's periodic polling timer (no-op when check_interval is 0) */
static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	/* Timer data is the owning CPU id, checked in mcheck_timer() */
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

847
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	/* Handle pre-MCA family-5 CPUs separately */
	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		/* Allocation failed: run without machine checks */
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	/* From now on int18 is routed to the real handler */
	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

T
Tim Hockin 已提交
878
static DEFINE_SPINLOCK(mce_state_lock);
static int		open_count;		/* #times opened */
static int		open_exclu;		/* already open exclusive? */

/* Open /dev/mcelog; O_EXCL gives exclusive access to the log */
static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	/* Refuse if someone holds it exclusively, or we want exclusivity
	   while it is already open */
	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

/* Release /dev/mcelog; drops any exclusive claim */
static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

913 914
/* IPI callback: record this CPU's TSC into the caller-supplied array */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}
L
Linus Torvalds 已提交
919

I
Ingo Molnar 已提交
920 921
static DEFINE_MUTEX(mce_read_mutex);

/*
 * Read and clear the MCE log through /dev/mcelog.
 * Only whole-buffer reads at offset 0 are supported.  Entries still
 * being written concurrently are either waited for briefly, discarded
 * on timeout, or picked up in a second pass after synchronize_sched().
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		/* Copy out every published entry claimed so far */
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			/* Writer claimed the slot but hasn't finished:
			   give it ~2 jiffies, then drop the entry */
			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		/* Reset the ring; loop again if loggers raced in meanwhile */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		/* Only take entries older than the per-CPU TSC snapshot */
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}

997 998 999 1000 1001 1002 1003 1004
/* poll() on /dev/mcelog: readable whenever the ring holds entries */
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	unsigned int mask = 0;

	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		mask = POLLIN | POLLRDNORM;
	return mask;
}

1005
/*
 * ioctl() on /dev/mcelog: query record/log sizes and atomically
 * read-and-clear the overflow flags.  Requires CAP_SYS_ADMIN.
 */
static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		/* cmpxchg loop: clear flags without losing concurrent sets */
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

1031
/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

/* Misc device backing /dev/mcelog */
static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

H
Hidetoshi Seto 已提交
1047 1048 1049 1050 1051 1052
/*
 * mce=off disables machine check
 * mce=TOLERANCELEVEL (number, see above)
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	/* Bare "mce" (no value) enables P5-style machine checks */
	if (*str == 0)
		enable_p5_mce();
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);
L
Linus Torvalds 已提交
1073

1074
/*
L
Linus Torvalds 已提交
1075
 * Sysfs support
1076
 */
L
Linus Torvalds 已提交
1077

1078 1079 1080 1081 1082 1083 1084 1085
/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

1086 1087 1088 1089
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102
	return 0;
}

/* PM suspend hook: switch all MCE banks off before the machine sleeps. */
static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

/* Shutdown hook: quiesce machine checks for the final reboot/power-off. */
static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

I
Ingo Molnar 已提交
1103 1104 1105 1106 1107
/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	/* Re-arm the banks, then apply vendor-specific features. */
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

1116 1117 1118 1119
/* Runs on each CPU (via on_each_cpu) to reprogram banks and the poll timer. */
static void mce_cpu_restart(void *data)
{
	/* Stop the polling timer before reprogramming the banks. */
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init();
	/* Restart polling with the (possibly updated) check interval. */
	mce_init_timer();
}

L
Linus Torvalds 已提交
1124
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	/* Last argument 1: wait until every CPU has finished. */
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* sysdev class: /sys/devices/system/machinecheck plus the PM callbacks. */
static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

I
Ingo Molnar 已提交
1137
/* Per-CPU sysdev: /sys/devices/system/machinecheck/machinecheckN. */
DEFINE_PER_CPU(struct sys_device, mce_dev);

/*
 * NOTE(review): invoked from the CPU hotplug notifier below; presumably
 * installed by the AMD bank-threshold code — confirm against its setup.
 */
__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

/* Dynamically allocated "bankN" sysfs attributes, one per MCE bank. */
static struct sysdev_attribute *bank_attrs;

/* sysfs show for bankN: print that bank's control mask in hex. */
static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	/* attr - bank_attrs recovers the bank index from the attribute. */
	return sprintf(buf, "%llx\n", bank[attr - bank_attrs]);
}

/* sysfs store for bankN: parse a new control mask and reprogram all CPUs. */
static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 val;

	if (strict_strtoull(buf, 0, &val) < 0)
		return -EINVAL;

	/* attr - bank_attrs recovers the bank index from the attribute. */
	bank[attr - bank_attrs] = val;
	mce_restart();

	return size;
}
1165

I
Ingo Molnar 已提交
1166 1167
static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1168 1169 1170 1171 1172 1173
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

1174
/*
 * Store a new trigger helper path. Strips the trailing newline that
 * "echo path > trigger" produces.
 */
static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');

	/*
	 * strchr() returns NULL when no newline is present; the old code
	 * tested *p and thus dereferenced NULL for input written without
	 * a trailing '\n' (e.g. "echo -n").
	 */
	if (p)
		*p = 0;

	return len;
}

1191 1192 1193 1194 1195 1196 1197 1198 1199
/* Generic integer store that also reprograms MCE on all CPUs afterwards. */
static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

1200
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);

/* check_interval needs a full MCE restart on change, hence the ext attr. */
static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};
I
Ingo Molnar 已提交
1208

I
Ingo Molnar 已提交
1209
/* Attributes common to all CPUs; NULL-terminated for the create loop. */
static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
	NULL
};

/* CPUs whose sysdev registration fully succeeded (guards removal). */
static cpumask_var_t mce_dev_initialized;
1215

I
Ingo Molnar 已提交
1216
/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1217
static __cpuinit int mce_create_device(unsigned int cpu)
L
Linus Torvalds 已提交
1218 1219
{
	int err;
1220
	int i;
1221

A
Andreas Herrmann 已提交
1222
	if (!mce_available(&boot_cpu_data))
1223 1224
		return -EIO;

I
Ingo Molnar 已提交
1225 1226 1227
	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id	= cpu;
	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
1228

I
Ingo Molnar 已提交
1229
	err = sysdev_register(&per_cpu(mce_dev, cpu));
1230 1231 1232
	if (err)
		return err;

I
Ingo Molnar 已提交
1233 1234
	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1235 1236 1237
		if (err)
			goto error;
	}
1238
	for (i = 0; i < banks; i++) {
I
Ingo Molnar 已提交
1239
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1240 1241 1242 1243
					&bank_attrs[i]);
		if (err)
			goto error2;
	}
I
Ingo Molnar 已提交
1244
	cpumask_set_cpu(cpu, mce_dev_initialized);
1245

1246
	return 0;
1247
error2:
I
Ingo Molnar 已提交
1248 1249
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1250
error:
I
Ingo Molnar 已提交
1251 1252 1253 1254
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
1255

1256 1257 1258
	return err;
}

1259
static __cpuinit void mce_remove_device(unsigned int cpu)
1260
{
1261 1262
	int i;

I
Ingo Molnar 已提交
1263
	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1264 1265
		return;

I
Ingo Molnar 已提交
1266 1267 1268
	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

1269
	for (i = 0; i < banks; i++)
I
Ingo Molnar 已提交
1270 1271 1272 1273
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
1274 1275
}

1276
/* Make sure there are no machine checks on offlined CPUs. */
1277
static void mce_disable_cpu(void *h)
1278
{
A
Andi Kleen 已提交
1279
	unsigned long action = *(unsigned long *)h;
I
Ingo Molnar 已提交
1280
	int i;
1281 1282 1283

	if (!mce_available(&current_cpu_data))
		return;
A
Andi Kleen 已提交
1284 1285
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
1286 1287 1288 1289
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
1290 1291
}

1292
static void mce_reenable_cpu(void *h)
1293
{
A
Andi Kleen 已提交
1294
	unsigned long action = *(unsigned long *)h;
I
Ingo Molnar 已提交
1295
	int i;
1296 1297 1298

	if (!mce_available(&current_cpu_data))
		return;
I
Ingo Molnar 已提交
1299

A
Andi Kleen 已提交
1300 1301
	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
1302 1303 1304 1305
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
	}
1306 1307
}

1308
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		/* Threshold teardown runs before the device goes away. */
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		/* Stop polling, then switch the banks off on that CPU. */
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		/* Offline aborted: re-arm the poll timer and the banks. */
		t->expires = round_jiffies(jiffies +
						__get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

1348
/* Hotplug notifier; registered from mce_init_device(). */
static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362
/*
 * Allocate and fill the dynamic "bankN" sysfs attributes, one per MCE
 * bank. Returns 0 on success or -ENOMEM; on failure nothing is left
 * allocated and bank_attrs stays NULL.
 */
static __init int mce_init_banks(void)
{
	int i;

	/* kcalloc zeroes and checks the n*size multiplication for overflow. */
	bank_attrs = kcalloc(banks, sizeof(struct sysdev_attribute),
				GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
	return 0;

nomem:
	/* Free the names allocated so far, then the attribute array. */
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;

	return -ENOMEM;
}

1383 1384 1385 1386 1387
static __init int mce_init_device(void)
{
	int err;
	int i = 0;

L
Linus Torvalds 已提交
1388 1389
	if (!mce_available(&boot_cpu_data))
		return -EIO;
1390

I
Ingo Molnar 已提交
1391
	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1392

1393 1394 1395 1396
	err = mce_init_banks();
	if (err)
		return err;

L
Linus Torvalds 已提交
1397
	err = sysdev_class_register(&mce_sysclass);
1398 1399
	if (err)
		return err;
1400 1401

	for_each_online_cpu(i) {
1402 1403 1404
		err = mce_create_device(i);
		if (err)
			return err;
1405 1406
	}

1407
	register_hotcpu_notifier(&mce_cpu_notifier);
L
Linus Torvalds 已提交
1408
	misc_register(&mce_log_device);
I
Ingo Molnar 已提交
1409

L
Linus Torvalds 已提交
1410 1411
	return err;
}
1412

L
Linus Torvalds 已提交
1413
device_initcall(mce_init_device);
I
Ingo Molnar 已提交
1414

1415
#else /* CONFIG_X86_OLD_MCE: */
I
Ingo Molnar 已提交
1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447

int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */

/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled == 1)
		return;

	switch (c->x86_vendor) {
	case X86_VENDOR_AMD:
		amd_mcheck_init(c);
		break;

	case X86_VENDOR_INTEL:
		if (c->x86 == 5)
			intel_p5_mcheck_init(c);
		if (c->x86 == 6)
			intel_p6_mcheck_init(c);
		if (c->x86 == 15)
			intel_p4_mcheck_init(c);
		break;

	case X86_VENDOR_CENTAUR:
		if (c->x86 == 5)
			winchip_mcheck_init(c);
		break;

	default:
		break;
	}
1448
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
I
Ingo Molnar 已提交
1449 1450 1451 1452 1453 1454 1455 1456 1457 1458
}

/*
 * Old-MCE variant of the "mce" boot option: records that it was given
 * by setting mce_disabled to -1. NOTE(review): mcheck_init() only bails
 * on mce_disabled == 1, so -1 does not disable — confirm intent.
 */
static int __init mcheck_enable(char *str)
{
	mce_disabled = -1;
	return 1;
}

__setup("mce", mcheck_enable);

1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469
#endif /* CONFIG_X86_OLD_MCE */

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	/* "nomce" disables machine checks entirely (both code paths). */
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);