traps.c 26.7 KB
Newer Older
L
Linus Torvalds 已提交
1 2
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * Handle hardware traps and faults.
 */
12 13 14

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

15
#include <linux/context_tracking.h>
I
Ingo Molnar 已提交
16 17 18 19 20 21
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
22
#include <linux/kgdb.h>
L
Linus Torvalds 已提交
23
#include <linux/kernel.h>
I
Ingo Molnar 已提交
24 25
#include <linux/module.h>
#include <linux/ptrace.h>
26
#include <linux/uprobes.h>
L
Linus Torvalds 已提交
27
#include <linux/string.h>
I
Ingo Molnar 已提交
28
#include <linux/delay.h>
L
Linus Torvalds 已提交
29
#include <linux/errno.h>
I
Ingo Molnar 已提交
30 31
#include <linux/kexec.h>
#include <linux/sched.h>
L
Linus Torvalds 已提交
32 33
#include <linux/timer.h>
#include <linux/init.h>
J
Jeremy Fitzhardinge 已提交
34
#include <linux/bug.h>
I
Ingo Molnar 已提交
35 36
#include <linux/nmi.h>
#include <linux/mm.h>
37 38
#include <linux/smp.h>
#include <linux/io.h>
L
Linus Torvalds 已提交
39 40 41 42 43 44

#ifdef CONFIG_EISA
#include <linux/ioport.h>
#include <linux/eisa.h>
#endif

D
Dave Jiang 已提交
45 46 47 48
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif

V
Vegard Nossum 已提交
49
#include <asm/kmemcheck.h>
I
Ingo Molnar 已提交
50
#include <asm/stacktrace.h>
L
Linus Torvalds 已提交
51 52
#include <asm/processor.h>
#include <asm/debugreg.h>
A
Arun Sharma 已提交
53
#include <linux/atomic.h>
54
#include <asm/ftrace.h>
55
#include <asm/traps.h>
L
Linus Torvalds 已提交
56 57
#include <asm/desc.h>
#include <asm/i387.h>
58
#include <asm/fpu-internal.h>
H
Hidetoshi Seto 已提交
59
#include <asm/mce.h>
60
#include <asm/fixmap.h>
61
#include <asm/mach_traps.h>
62
#include <asm/alternative.h>
63
#include <asm/mpx.h>
64

65
#ifdef CONFIG_X86_64
66
#include <asm/x86_init.h>
67 68
#include <asm/pgalloc.h>
#include <asm/proto.h>
K
Kees Cook 已提交
69 70 71

/* No need to be aligned, but done to keep all IDTs defined the same way. */
gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss;
72
#else
73
#include <asm/processor-flags.h>
I
Ingo Molnar 已提交
74
#include <asm/setup.h>
L
Linus Torvalds 已提交
75 76

asmlinkage int system_call(void);
77
#endif
L
Linus Torvalds 已提交
78

K
Kees Cook 已提交
79 80 81
/* Must be page-aligned because the real IDT is used in a fixmap. */
gate_desc idt_table[NR_VECTORS] __page_aligned_bss;

82 83 84
/*
 * Bitmap with one bit per interrupt vector.  trap_init() sets the bits
 * for every vector below FIRST_EXTERNAL_VECTOR and for the syscall
 * vectors it installs.
 */
DECLARE_BITMAP(used_vectors, NR_VECTORS);
EXPORT_SYMBOL_GPL(used_vectors);

85 86 87 88 89 90
/*
 * Re-enable local interrupts, but only when the saved flags in @regs
 * show that the interrupted context had them enabled.
 */
static inline void conditional_sti(struct pt_regs *regs)
{
	if (!(regs->flags & X86_EFLAGS_IF))
		return;

	local_irq_enable();
}

91 92
static inline void preempt_conditional_sti(struct pt_regs *regs)
{
93
	preempt_count_inc();
94 95 96 97
	if (regs->flags & X86_EFLAGS_IF)
		local_irq_enable();
}

T
Thomas Gleixner 已提交
98 99 100 101 102 103
/*
 * Disable local interrupts again, but only when the saved flags in
 * @regs show that the interrupted context had them enabled.
 */
static inline void conditional_cli(struct pt_regs *regs)
{
	if (!(regs->flags & X86_EFLAGS_IF))
		return;

	local_irq_disable();
}

104 105 106 107
/*
 * Disable local interrupts if the saved flags in @regs show that the
 * interrupted context had them enabled, then decrement the preempt
 * count.  Mirror image of preempt_conditional_sti().
 */
static inline void preempt_conditional_cli(struct pt_regs *regs)
{
	if (regs->flags & X86_EFLAGS_IF)
		local_irq_disable();
	preempt_count_dec();
}

111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
/*
 * Common entry work for handlers that may run on an IST stack.  Adds
 * HARDIRQ_OFFSET to the preempt count and then either enters exception
 * context tracking (exception came from user mode) or notifies RCU
 * (exception came from kernel mode).  The returned state is what the
 * caller later hands to ist_exit().
 */
enum ctx_state ist_enter(struct pt_regs *regs)
{
	/*
	 * We are atomic because we're on the IST stack (or we're on x86_32,
	 * in which case we still shouldn't schedule).
	 */
	preempt_count_add(HARDIRQ_OFFSET);

	/* Coming from user mode: other than that, we're just an exception. */
	if (user_mode_vm(regs))
		return exception_enter();

	/*
	 * We might have interrupted pretty much anything.  In
	 * fact, if we're a machine check, we can even interrupt
	 * NMI processing.  We don't want in_nmi() to return true,
	 * but we need to notify RCU.
	 */
	rcu_nmi_enter();
	return IN_KERNEL;  /* the value is irrelevant. */
}

/*
 * Undo ist_enter(): drop HARDIRQ_OFFSET from the preempt count and then
 * leave exception context tracking (user mode) or tell RCU that we are
 * done (kernel mode).
 *
 * @regs:	regs passed to the IST exception handler
 * @prev_state:	value returned by the matching ist_enter()
 */
void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
{
	preempt_count_sub(HARDIRQ_OFFSET);

	/*
	 * Plain call rather than "return exception_exit(...)": a return
	 * statement with an expression is not valid ISO C in a function
	 * returning void.
	 */
	if (user_mode_vm(regs))
		exception_exit(prev_state);
	else
		rcu_nmi_exit();
}

144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
/**
 * ist_begin_non_atomic() - begin a non-atomic section in an IST exception
 * @regs:	regs passed to the IST exception handler
 *
 * IST exception handlers normally cannot schedule.  As a special
 * exception, if the exception interrupted userspace code (i.e.
 * user_mode_vm(regs) would return true) and the exception was not
 * a double fault, it can be safe to schedule.  ist_begin_non_atomic()
 * begins a non-atomic section within an ist_enter()/ist_exit() region.
 * Callers are responsible for enabling interrupts themselves inside
 * the non-atomic section, and callers must call ist_end_non_atomic()
 * before ist_exit().
 */
void ist_begin_non_atomic(struct pt_regs *regs)
{
	unsigned long stack_diff;

	BUG_ON(!user_mode_vm(regs));

	/*
	 * Sanity check: we need to be on the normal thread stack.  This
	 * will catch asm bugs and any attempt to use
	 * ist_begin_non_atomic() from double_fault.  The XOR is zero in
	 * all bits above the THREAD_SIZE alignment only when both
	 * addresses lie in the same THREAD_SIZE-aligned region.
	 */
	stack_diff = current_stack_pointer() ^
		     this_cpu_read_stable(kernel_stack);
	BUG_ON((stack_diff & ~(THREAD_SIZE - 1)) != 0);

	preempt_count_sub(HARDIRQ_OFFSET);
}

/**
 * ist_end_non_atomic() - end a non-atomic section in an IST exception
 *
 * Ends a non-atomic section started with ist_begin_non_atomic() by
 * adding back the HARDIRQ_OFFSET that ist_begin_non_atomic() removed
 * from the preempt count.
 */
void ist_end_non_atomic(void)
{
	preempt_count_add(HARDIRQ_OFFSET);
}

182
static nokprobe_inline int
183 184
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
		  struct pt_regs *regs,	long error_code)
L
Linus Torvalds 已提交
185
{
186
#ifdef CONFIG_X86_32
187
	if (regs->flags & X86_VM_MASK) {
188
		/*
189
		 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
190 191
		 * On nmi (interrupt 2), do_trap should not be called.
		 */
192 193 194 195 196 197
		if (trapnr < X86_TRAP_UD) {
			if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
						error_code, trapnr))
				return 0;
		}
		return -1;
L
Linus Torvalds 已提交
198
	}
199
#endif
200 201 202 203 204 205 206 207
	if (!user_mode(regs)) {
		if (!fixup_exception(regs)) {
			tsk->thread.error_code = error_code;
			tsk->thread.trap_nr = trapnr;
			die(str, regs, error_code);
		}
		return 0;
	}
L
Linus Torvalds 已提交
208

209 210
	return -1;
}
L
Linus Torvalds 已提交
211

212 213
/*
 * Fill @info for the traps that carry a specific si_code (#DE, #UD and
 * #AC) and return it.  For every other trap number @info is left
 * untouched and SEND_SIG_PRIV is returned instead.
 */
static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr,
				siginfo_t *info)
{
	unsigned long siaddr;
	int sicode;

	switch (trapnr) {
	default:
		return SEND_SIG_PRIV;

	case X86_TRAP_DE:
		sicode = FPE_INTDIV;
		siaddr = uprobe_get_trap_addr(regs);
		break;
	case X86_TRAP_UD:
		sicode = ILL_ILLOPN;
		siaddr = uprobe_get_trap_addr(regs);
		break;
	case X86_TRAP_AC:
		sicode = BUS_ADRALN;
		siaddr = 0;
		break;
	}

	info->si_signo = signr;
	info->si_errno = 0;
	info->si_code = sicode;
	info->si_addr = (void __user *)siaddr;
	return info;
}

243
static void
244 245 246 247 248 249 250 251
do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
	long error_code, siginfo_t *info)
{
	struct task_struct *tsk = current;


	if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
		return;
I
Ingo Molnar 已提交
252
	/*
253
	 * We want error_code and trap_nr set for userspace faults and
I
Ingo Molnar 已提交
254 255 256 257 258 259 260 261
	 * kernelspace faults which result in die(), but not
	 * kernelspace faults which are fixed up.  die() gives the
	 * process no chance to handle the signal and notice the
	 * kernel fault information, so that won't result in polluting
	 * the information about previously queued, but not yet
	 * delivered, faults.  See also do_general_protection below.
	 */
	tsk->thread.error_code = error_code;
262
	tsk->thread.trap_nr = trapnr;
263

264 265 266
#ifdef CONFIG_X86_64
	if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
	    printk_ratelimit()) {
267 268 269
		pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
			tsk->comm, tsk->pid, str,
			regs->ip, regs->sp, error_code);
270
		print_vma_addr(" in ", regs->ip);
271
		pr_cont("\n");
272 273 274
	}
#endif

275
	force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
L
Linus Torvalds 已提交
276
}
277
NOKPROBE_SYMBOL(do_trap);
L
Linus Torvalds 已提交
278

279
static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
280
			  unsigned long trapnr, int signr)
281 282
{
	enum ctx_state prev_state = exception_enter();
283
	siginfo_t info;
284 285 286 287

	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
			NOTIFY_STOP) {
		conditional_sti(regs);
288 289
		do_trap(trapnr, signr, str, regs, error_code,
			fill_trap_info(regs, signr, trapnr, &info));
290 291 292 293 294
	}

	exception_exit(prev_state);
}

I
Ingo Molnar 已提交
295
/*
 * Generate the C entry point do_<name>() for a simple exception that
 * only needs do_error_trap() with a fixed trap number, signal and
 * message string.
 */
#define DO_ERROR(trapnr, signr, str, name)				\
dotraplinkage void do_##name(struct pt_regs *regs, long error_code)	\
{									\
	do_error_trap(regs, error_code, str, trapnr, signr);		\
}

O
Oleg Nesterov 已提交
301 302 303 304 305 306 307 308
/*
 * Exception handlers generated by DO_ERROR(): trap number, signal to
 * send, message string, and the handler name suffix (do_<name>).
 */
DO_ERROR(X86_TRAP_DE,     SIGFPE,  "divide error",		divide_error)
DO_ERROR(X86_TRAP_OF,     SIGSEGV, "overflow",			overflow)
DO_ERROR(X86_TRAP_UD,     SIGILL,  "invalid opcode",		invalid_op)
DO_ERROR(X86_TRAP_OLD_MF, SIGFPE,  "coprocessor segment overrun",coprocessor_segment_overrun)
DO_ERROR(X86_TRAP_TS,     SIGSEGV, "invalid TSS",		invalid_TSS)
DO_ERROR(X86_TRAP_NP,     SIGBUS,  "segment not present",	segment_not_present)
DO_ERROR(X86_TRAP_SS,     SIGBUS,  "stack segment",		stack_segment)
DO_ERROR(X86_TRAP_AC,     SIGBUS,  "alignment check",		alignment_check)
L
Linus Torvalds 已提交
309

310 311 312 313 314 315 316
#ifdef CONFIG_X86_64
/*
 * Double fault handler.  Runs on IST stack.
 *
 * With CONFIG_X86_ESPFIX64, a double fault raised by the IRET at
 * native_irq_return_iret on the espfix64 stack is rewritten to look
 * like a #GP(0) from user space and the handler returns.  Every other
 * double fault is fatal: the handler loops on die() and never returns.
 */
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
{
	static const char str[] = "double fault";
	struct task_struct *tsk = current;

#ifdef CONFIG_X86_ESPFIX64
	extern unsigned char native_irq_return_iret[];

	/*
	 * If IRET takes a non-IST fault on the espfix64 stack, then we
	 * end up promoting it to a doublefault.  In that case, modify
	 * the stack to make it look like we just entered the #GP
	 * handler from user space, similar to bad_iret.
	 *
	 * No need for ist_enter here because we don't use RCU.
	 */
	if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
		regs->cs == __KERNEL_CS &&
		regs->ip == (unsigned long)native_irq_return_iret)
	{
		struct pt_regs *normal_regs = task_pt_regs(current);

		/* Fake a #GP(0) from userspace. */
		memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
		normal_regs->orig_ax = 0;  /* Missing (lost) #GP error code */
		regs->ip = (unsigned long)general_protection;
		regs->sp = (unsigned long)&normal_regs->orig_ax;

		return;
	}
#endif

	ist_enter(regs);  /* Discard prev_state because we won't return. */
	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);

	tsk->thread.error_code = error_code;
	tsk->thread.trap_nr = X86_TRAP_DF;

#ifdef CONFIG_DOUBLEFAULT
	df_debug(regs, error_code);
#endif
	/*
	 * This is always a kernel trap and never fixable (and thus must
	 * never return).
	 */
	for (;;)
		die(str, regs, error_code);
}
#endif

362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406
/*
 * #BR (bound range exceeded) handler.
 *
 * When MPX is enabled, BNDSTATUS is read out of the task's xsave buffer
 * to decide whether the exception is a bound-directory fault (handled
 * by mpx_handle_bd_fault()), a bound violation (reported with a
 * decoded siginfo) or not MPX related at all.  Everything that cannot
 * be handled that way ends up at exit_trap, which sends a plain
 * SIGSEGV through do_trap().
 */
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
{
	struct task_struct *tsk = current;
	struct xsave_struct *xsave_buf;
	enum ctx_state prev_state;
	struct bndcsr *bndcsr;
	siginfo_t *info;

	prev_state = exception_enter();
	if (notify_die(DIE_TRAP, "bounds", regs, error_code,
			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
		goto exit;
	conditional_sti(regs);

	if (!user_mode(regs))
		die("bounds", regs, error_code);

	if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
		/* The exception is not from Intel MPX */
		goto exit_trap;
	}

	/*
	 * We need to look at BNDSTATUS to resolve this exception.
	 * It is not directly accessible, though, so we need to
	 * do an xsave and then pull it out of the xsave buffer.
	 */
	fpu_save_init(&tsk->thread.fpu);
	xsave_buf = &(tsk->thread.fpu.state->xsave);
	bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR);
	if (!bndcsr)
		goto exit_trap;

	/*
	 * The error code field of the BNDSTATUS register communicates status
	 * information of a bound range exception #BR or operation involving
	 * bound directory.
	 */
	switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
	case 2:	/* Bound directory has invalid entry. */
		if (mpx_handle_bd_fault(xsave_buf))
			goto exit_trap;
		break; /* Success, it was handled */
	case 1: /* Bound violation. */
		info = mpx_generate_siginfo(regs, xsave_buf);
		if (IS_ERR(info)) {
			/*
			 * We failed to decode the MPX instruction.  Act as if
			 * the exception was not caused by MPX.
			 */
			goto exit_trap;
		}
		/*
		 * Success, we decoded the instruction and retrieved
		 * an 'info' containing the address being accessed
		 * which caused the exception.  This information
		 * allows an application to possibly handle the
		 * #BR exception itself.
		 */
		do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info);
		kfree(info);
		break;
	case 0: /* No exception caused by Intel MPX operations. */
		goto exit_trap;
	default:
		die("bounds", regs, error_code);
	}

exit:
	exception_exit(prev_state);
	return;
exit_trap:
	/*
	 * This path out is for all the cases where we could not
	 * handle the exception in some way (like allocating a
	 * table or telling userspace about it).  We will also end
	 * up here if the kernel has MPX turned off at compile
	 * time.
	 */
	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
	exception_exit(prev_state);
}

445
dotraplinkage void
446
do_general_protection(struct pt_regs *regs, long error_code)
L
Linus Torvalds 已提交
447
{
448
	struct task_struct *tsk;
449
	enum ctx_state prev_state;
I
Ingo Molnar 已提交
450

451
	prev_state = exception_enter();
452 453
	conditional_sti(regs);

454
#ifdef CONFIG_X86_32
455 456 457
	if (regs->flags & X86_VM_MASK) {
		local_irq_enable();
		handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
458
		goto exit;
459
	}
460
#endif
L
Linus Torvalds 已提交
461

462
	tsk = current;
463 464
	if (!user_mode(regs)) {
		if (fixup_exception(regs))
465
			goto exit;
466 467 468

		tsk->thread.error_code = error_code;
		tsk->thread.trap_nr = X86_TRAP_GP;
469 470
		if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
			       X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
471
			die("general protection fault", regs, error_code);
472
		goto exit;
473
	}
L
Linus Torvalds 已提交
474

475
	tsk->thread.error_code = error_code;
476
	tsk->thread.trap_nr = X86_TRAP_GP;
I
Ingo Molnar 已提交
477

478 479
	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
			printk_ratelimit()) {
480
		pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
481 482
			tsk->comm, task_pid_nr(tsk),
			regs->ip, regs->sp, error_code);
483
		print_vma_addr(" in ", regs->ip);
484
		pr_cont("\n");
485
	}
486

487
	force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
488
exit:
489
	exception_exit(prev_state);
L
Linus Torvalds 已提交
490
}
491
NOKPROBE_SYMBOL(do_general_protection);
L
Linus Torvalds 已提交
492

493
/* May run on IST stack. */
494
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
L
Linus Torvalds 已提交
495
{
496 497
	enum ctx_state prev_state;

498
#ifdef CONFIG_DYNAMIC_FTRACE
499 500 501 502 503 504
	/*
	 * ftrace must be first, everything else may cause a recursive crash.
	 * See note by declaration of modifying_ftrace_code in ftrace.c
	 */
	if (unlikely(atomic_read(&modifying_ftrace_code)) &&
	    ftrace_int3_handler(regs))
505 506
		return;
#endif
507 508 509
	if (poke_int3_handler(regs))
		return;

510
	prev_state = ist_enter(regs);
511
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
512 513
	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
				SIGTRAP) == NOTIFY_STOP)
514
		goto exit;
515
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
516

517 518
#ifdef CONFIG_KPROBES
	if (kprobe_int3_handler(regs))
519
		goto exit;
520 521
#endif

522 523
	if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
			SIGTRAP) == NOTIFY_STOP)
524
		goto exit;
I
Ingo Molnar 已提交
525

526 527 528 529 530
	/*
	 * Let others (NMI) know that the debug stack is in use
	 * as we may switch to the interrupt stack.
	 */
	debug_stack_usage_inc();
531
	preempt_conditional_sti(regs);
532
	do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
533
	preempt_conditional_cli(regs);
534
	debug_stack_usage_dec();
535
exit:
536
	ist_exit(regs, prev_state);
L
Linus Torvalds 已提交
537
}
538
NOKPROBE_SYMBOL(do_int3);
L
Linus Torvalds 已提交
539

540
#ifdef CONFIG_X86_64
541
/*
542 543 544
 * Help handler running on IST stack to switch off the IST stack if the
 * interrupted code was in user mode. The actual stack switch is done in
 * entry_64.S
545
 */
546
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
547
{
548 549
	struct pt_regs *regs = task_pt_regs(current);
	*regs = *eregs;
550 551
	return regs;
}
552
NOKPROBE_SYMBOL(sync_regs);
A
Andy Lutomirski 已提交
553 554 555 556 557 558

/*
 * Stack layout that fixup_bad_iret() receives and returns: one pointer
 * slot followed by a full pt_regs.
 */
struct bad_iret_stack {
	/* NOTE(review): presumably the return address into error_entry - confirm against entry_64.S */
	void *error_entry_ret;
	struct pt_regs regs;
};

559
/*
 * Relocate the exception frame of a failed IRET to user mode onto the
 * current task's pt_regs area and return the new frame.  BUGs if the
 * rebuilt frame does not describe user mode.
 */
asmlinkage __visible notrace
struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
{
	/*
	 * This is called from entry_64.S early in handling a fault
	 * caused by a bad iret to user mode.  To handle the fault
	 * correctly, we want to move our stack frame to task_pt_regs
	 * and we want to pretend that the exception came from the
	 * iret target.
	 */
	struct bad_iret_stack *new_stack =
		container_of(task_pt_regs(current),
			     struct bad_iret_stack, regs);

	/* Copy the IRET target to the new stack. */
	memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);

	/* Copy the remainder of the stack from the current stack. */
	memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));

	BUG_ON(!user_mode_vm(&new_stack->regs));
	return new_stack;
}
NOKPROBE_SYMBOL(fixup_bad_iret);
583 584
#endif

L
Linus Torvalds 已提交
585 586 587 588 589 590 591 592 593 594
/*
 * Our handling of the processor debug registers is non-trivial.
 * We do not clear them on entry and exit from the kernel. Therefore
 * it is possible to get a watchpoint trap here from inside the kernel.
 * However, the code in ./ptrace.c has ensured that the user can
 * only set watchpoints on userspace addresses. Therefore the in-kernel
 * watchpoint trap can only occur in code which is reading/writing
 * from user space. Such code must not hold kernel locks (since it
 * can equally take a page fault), therefore it is safe to call
 * force_sig_info even though that claims and releases locks.
 *
 * Code in ./signal.c ensures that the debug control register
 * is restored before we deliver any signal, and therefore that
 * user code runs with the correct debug control register even though
 * we clear it here.
 *
 * Being careful here means that we don't have to be as careful in a
 * lot of more complicated places (task switching can be a bit lazy
 * about restoring all the debug state, and ptrace doesn't have to
 * find every occurrence of the TF bit that could be saved away even
 * by user code)
 *
 * May run on IST stack.
 */
609
dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
L
Linus Torvalds 已提交
610 611
{
	struct task_struct *tsk = current;
612
	enum ctx_state prev_state;
613
	int user_icebp = 0;
614
	unsigned long dr6;
615
	int si_code;
L
Linus Torvalds 已提交
616

617
	prev_state = ist_enter(regs);
618

619
	get_debugreg(dr6, 6);
L
Linus Torvalds 已提交
620

621 622 623
	/* Filter out all the reserved bits which are preset to 1 */
	dr6 &= ~DR6_RESERVED;

624 625 626 627 628 629 630 631
	/*
	 * If dr6 has no reason to give us about the origin of this trap,
	 * then it's very likely the result of an icebp/int01 trap.
	 * User wants a sigtrap for that.
	 */
	if (!dr6 && user_mode(regs))
		user_icebp = 1;

V
Vegard Nossum 已提交
632
	/* Catch kmemcheck conditions first of all! */
633
	if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
634
		goto exit;
V
Vegard Nossum 已提交
635

636 637
	/* DR6 may or may not be cleared by the CPU */
	set_debugreg(0, 6);
638

P
Peter Zijlstra 已提交
639 640 641 642 643
	/*
	 * The processor cleared BTF, so don't mark that we need it set.
	 */
	clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);

644 645 646
	/* Store the virtualized DR6 value */
	tsk->thread.debugreg6 = dr6;

647 648 649 650 651
#ifdef CONFIG_KPROBES
	if (kprobe_debug_handler(regs))
		goto exit;
#endif

652
	if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
653
							SIGTRAP) == NOTIFY_STOP)
654
		goto exit;
655

656 657 658 659 660 661
	/*
	 * Let others (NMI) know that the debug stack is in use
	 * as we may switch to the interrupt stack.
	 */
	debug_stack_usage_inc();

L
Linus Torvalds 已提交
662
	/* It's safe to allow irq's after DR6 has been saved */
663
	preempt_conditional_sti(regs);
L
Linus Torvalds 已提交
664

665
	if (regs->flags & X86_VM_MASK) {
666 667
		handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
					X86_TRAP_DB);
668
		preempt_conditional_cli(regs);
669
		debug_stack_usage_dec();
670
		goto exit;
L
Linus Torvalds 已提交
671 672 673
	}

	/*
674 675 676 677 678
	 * Single-stepping through system calls: ignore any exceptions in
	 * kernel space, but re-enable TF when returning to user mode.
	 *
	 * We already checked v86 mode above, so we can check for kernel mode
	 * by just checking the CPL of CS.
L
Linus Torvalds 已提交
679
	 */
680 681 682 683
	if ((dr6 & DR_STEP) && !user_mode(regs)) {
		tsk->thread.debugreg6 &= ~DR_STEP;
		set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
		regs->flags &= ~X86_EFLAGS_TF;
L
Linus Torvalds 已提交
684
	}
685
	si_code = get_si_code(tsk->thread.debugreg6);
686
	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
687
		send_sigtrap(tsk, regs, error_code, si_code);
688
	preempt_conditional_cli(regs);
689
	debug_stack_usage_dec();
L
Linus Torvalds 已提交
690

691
exit:
692
	ist_exit(regs, prev_state);
L
Linus Torvalds 已提交
693
}
694
NOKPROBE_SYMBOL(do_debug);
L
Linus Torvalds 已提交
695 696 697 698 699 700

/*
 * Note that we play around with the 'TS' bit in an attempt to get
 * the correct behaviour even in the presence of the asynchronous
 * IRQ13 behaviour
 */
701
static void math_error(struct pt_regs *regs, int error_code, int trapnr)
L
Linus Torvalds 已提交
702
{
703
	struct task_struct *task = current;
L
Linus Torvalds 已提交
704
	siginfo_t info;
705
	unsigned short err;
706 707
	char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
						"simd exception";
708 709 710 711 712 713 714 715 716

	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
		return;
	conditional_sti(regs);

	if (!user_mode_vm(regs))
	{
		if (!fixup_exception(regs)) {
			task->thread.error_code = error_code;
717
			task->thread.trap_nr = trapnr;
718 719 720 721
			die(str, regs, error_code);
		}
		return;
	}
L
Linus Torvalds 已提交
722 723 724 725 726

	/*
	 * Save the info for the exception handler and clear the error.
	 */
	save_init_fpu(task);
727
	task->thread.trap_nr = trapnr;
728
	task->thread.error_code = error_code;
L
Linus Torvalds 已提交
729 730
	info.si_signo = SIGFPE;
	info.si_errno = 0;
731
	info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
732
	if (trapnr == X86_TRAP_MF) {
733 734 735 736 737 738 739 740 741 742 743 744 745
		unsigned short cwd, swd;
		/*
		 * (~cwd & swd) will mask out exceptions that are not set to unmasked
		 * status.  0x3f is the exception bits in these regs, 0x200 is the
		 * C1 reg you need in case of a stack fault, 0x040 is the stack
		 * fault bit.  We should only be taking one exception at a time,
		 * so if this combination doesn't produce any single exception,
		 * then we have a bad program that isn't synchronizing its FPU usage
		 * and it will suffer the consequences since we won't be able to
		 * fully reproduce the context of the exception
		 */
		cwd = get_fpu_cwd(task);
		swd = get_fpu_swd(task);
746

747 748 749 750 751 752 753 754 755 756 757
		err = swd & ~cwd;
	} else {
		/*
		 * The SIMD FPU exceptions are handled a little differently, as there
		 * is only a single status/control register.  Thus, to determine which
		 * unmasked exception was caught we must mask the exception mask bits
		 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
		 */
		unsigned short mxcsr = get_fpu_mxcsr(task);
		err = ~(mxcsr >> 7) & mxcsr;
	}
758 759

	if (err & 0x001) {	/* Invalid op */
I
Ingo Molnar 已提交
760 761 762 763 764 765
		/*
		 * swd & 0x240 == 0x040: Stack Underflow
		 * swd & 0x240 == 0x240: Stack Overflow
		 * User must clear the SF bit (0x40) if set
		 */
		info.si_code = FPE_FLTINV;
766
	} else if (err & 0x004) { /* Divide by Zero */
I
Ingo Molnar 已提交
767
		info.si_code = FPE_FLTDIV;
768
	} else if (err & 0x008) { /* Overflow */
I
Ingo Molnar 已提交
769
		info.si_code = FPE_FLTOVF;
770 771 772
	} else if (err & 0x012) { /* Denormal, Underflow */
		info.si_code = FPE_FLTUND;
	} else if (err & 0x020) { /* Precision */
I
Ingo Molnar 已提交
773
		info.si_code = FPE_FLTRES;
774
	} else {
775
		/*
776 777 778
		 * If we're using IRQ 13, or supposedly even some trap
		 * X86_TRAP_MF implementations, it's possible
		 * we get a spurious trap, which is not an error.
779
		 */
780
		return;
L
Linus Torvalds 已提交
781 782 783 784
	}
	force_sig_info(SIGFPE, &info, task);
}

785
dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
L
Linus Torvalds 已提交
786
{
787 788 789
	enum ctx_state prev_state;

	prev_state = exception_enter();
790
	math_error(regs, error_code, X86_TRAP_MF);
791
	exception_exit(prev_state);
L
Linus Torvalds 已提交
792 793
}

794 795
dotraplinkage void
do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
L
Linus Torvalds 已提交
796
{
797 798 799
	enum ctx_state prev_state;

	prev_state = exception_enter();
800
	math_error(regs, error_code, X86_TRAP_XF);
801
	exception_exit(prev_state);
L
Linus Torvalds 已提交
802 803
}

804 805
/*
 * Handler for the X86_TRAP_SPURIOUS vector (the P6 local APIC spurious
 * interrupt bug).  The event is deliberately ignored without a
 * warning; the handler only restores the interrupt-enable state.
 */
dotraplinkage void
do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
{
	conditional_sti(regs);
}

814
/*
 * Weak, empty default.
 * NOTE(review): presumably overridden by the real thermal interrupt
 * handler when that code is built in - confirm.
 */
asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void)
{
}
817

818
/*
 * Weak, empty default.
 * NOTE(review): presumably overridden by the real threshold interrupt
 * handler when that code is built in - confirm.
 */
asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void)
{
}

L
Linus Torvalds 已提交
822
/*
I
Ingo Molnar 已提交
823
 * 'math_state_restore()' saves the current math information in the
L
Linus Torvalds 已提交
824 825 826 827 828
 * old math state array, and gets the new ones from the current task
 *
 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
 * Don't touch unless you *really* know how it works.
 *
829 830
 * Must be called with kernel preemption disabled (eg with local
 * local interrupts as in the case of do_device_not_available).
L
Linus Torvalds 已提交
831
 */
832
void math_state_restore(void)
L
Linus Torvalds 已提交
833
{
834
	struct task_struct *tsk = current;
L
Linus Torvalds 已提交
835

836 837 838 839 840 841 842 843 844 845 846 847 848 849 850
	if (!tsk_used_math(tsk)) {
		local_irq_enable();
		/*
		 * does a slab alloc which can sleep
		 */
		if (init_fpu(tsk)) {
			/*
			 * ran out of memory!
			 */
			do_group_exit(SIGKILL);
			return;
		}
		local_irq_disable();
	}

851
	__thread_fpu_begin(tsk);
852

853 854 855 856
	/*
	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
	 */
	if (unlikely(restore_fpu_checking(tsk))) {
857
		drop_init_fpu(tsk);
858
		force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
859 860
		return;
	}
861

862
	tsk->thread.fpu_counter++;
L
Linus Torvalds 已提交
863
}
864
EXPORT_SYMBOL_GPL(math_state_restore);
L
Linus Torvalds 已提交
865

866
dotraplinkage void
867
do_device_not_available(struct pt_regs *regs, long error_code)
868
{
869 870 871
	enum ctx_state prev_state;

	prev_state = exception_enter();
872
	BUG_ON(use_eager_fpu());
873

874
#ifdef CONFIG_MATH_EMULATION
875
	if (read_cr0() & X86_CR0_EM) {
876 877
		struct math_emu_info info = { };

878
		conditional_sti(regs);
879

880
		info.regs = regs;
881
		math_emulate(&info);
882
		exception_exit(prev_state);
883
		return;
884
	}
885 886 887 888
#endif
	math_state_restore(); /* interrupts still off */
#ifdef CONFIG_X86_32
	conditional_sti(regs);
889
#endif
890
	exception_exit(prev_state);
891
}
892
NOKPROBE_SYMBOL(do_device_not_available);
893

894
#ifdef CONFIG_X86_32
/*
 * IRET exception handler (x86_32 only): unless a die notifier returns
 * NOTIFY_STOP, deliver SIGILL with si_code ILL_BADSTK through
 * do_trap().
 */
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
{
	siginfo_t info;
	enum ctx_state prev_state;

	prev_state = exception_enter();
	local_irq_enable();

	info.si_signo = SIGILL;
	info.si_errno = 0;
	info.si_code = ILL_BADSTK;
	info.si_addr = NULL;
	if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
			X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
		do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
			&info);
	}
	exception_exit(prev_state);
}
#endif
915

916 917 918
/*
 * Set of traps needed for early debugging: installs the #DB and #BP
 * gates (plus #PF on x86_32) and loads the IDT.
 */
void __init early_trap_init(void)
{
	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
	/* int3 can be called from all */
	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
#ifdef CONFIG_X86_32
	set_intr_gate(X86_TRAP_PF, page_fault);
#endif
	load_idt(&idt_descr);
}

928 929 930
/*
 * Install the page fault gate on x86_64; on x86_32 this is a no-op
 * because early_trap_init() installs it there.
 */
void __init early_trap_pf_init(void)
{
#ifdef CONFIG_X86_64
	set_intr_gate(X86_TRAP_PF, page_fault);
#endif
}

L
Linus Torvalds 已提交
935 936
/*
 * Install the remaining exception gates, reserve the built-in and
 * syscall vectors in used_vectors, map the IDT read-only through the
 * fixmap, and run the per-CPU and platform trap initialisation.
 */
void __init trap_init(void)
{
	int i;

#ifdef CONFIG_EISA
	void __iomem *p = early_ioremap(0x0FFFD9, 4);

	/* Look for the "EISA" signature at this fixed address. */
	if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
		EISA_bus = 1;
	early_iounmap(p, 4);
#endif

	set_intr_gate(X86_TRAP_DE, divide_error);
	set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
	/* int4 can be called from all */
	set_system_intr_gate(X86_TRAP_OF, &overflow);
	set_intr_gate(X86_TRAP_BR, bounds);
	set_intr_gate(X86_TRAP_UD, invalid_op);
	set_intr_gate(X86_TRAP_NM, device_not_available);
#ifdef CONFIG_X86_32
	set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
#else
	set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
#endif
	set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
	set_intr_gate(X86_TRAP_TS, invalid_TSS);
	set_intr_gate(X86_TRAP_NP, segment_not_present);
	set_intr_gate(X86_TRAP_SS, stack_segment);
	set_intr_gate(X86_TRAP_GP, general_protection);
	set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
	set_intr_gate(X86_TRAP_MF, coprocessor_error);
	set_intr_gate(X86_TRAP_AC, alignment_check);
#ifdef CONFIG_X86_MCE
	set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
#endif
	set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);

	/* Reserve all the builtin and the syscall vector: */
	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
		set_bit(i, used_vectors);

#ifdef CONFIG_IA32_EMULATION
	set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif

#ifdef CONFIG_X86_32
	set_system_trap_gate(SYSCALL_VECTOR, &system_call);
	set_bit(SYSCALL_VECTOR, used_vectors);
#endif

	/*
	 * Set the IDT descriptor to a fixed read-only location, so that the
	 * "sidt" instruction will not leak the location of the kernel, and
	 * to defend the IDT against arbitrary memory write vulnerabilities.
	 * It will be reloaded in cpu_init()
	 */
	__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
	idt_descr.address = fix_to_virt(FIX_RO_IDT);

	/*
	 * Should be a barrier for any external CPU state:
	 */
	cpu_init();

	x86_init.irqs.trap_init();

#ifdef CONFIG_X86_64
	/*
	 * Both tables are declared in this file as gate_desc[NR_VECTORS],
	 * so copy by the destination's size instead of a hard-coded
	 * entry size.
	 */
	memcpy(&debug_idt_table, &idt_table, sizeof(debug_idt_table));
	set_nmi_gate(X86_TRAP_DB, &debug);
	set_nmi_gate(X86_TRAP_BP, &int3);
#endif
}