/*
 *  Copyright (C) 1991,1992  Linus Torvalds
 *
 * entry_32.S contains the system-call and low-level fault and trap handling routines.
 *
 * Stack layout while running C code:
 *	ptrace needs to have all registers on the stack.
 *	If the order here is changed, it needs to be
 *	updated in fork.c:copy_process(), signal.c:do_signal(),
 *	ptrace.c and ptrace.h
 *
 *	 0(%esp) - %ebx
 *	 4(%esp) - %ecx
 *	 8(%esp) - %edx
 *	 C(%esp) - %esi
 *	10(%esp) - %edi
 *	14(%esp) - %ebp
 *	18(%esp) - %eax
 *	1C(%esp) - %ds
 *	20(%esp) - %es
 *	24(%esp) - %fs
 *	28(%esp) - %gs		saved iff !CONFIG_X86_32_LAZY_GS
 *	2C(%esp) - orig_eax
 *	30(%esp) - %eip
 *	34(%esp) - %cs
 *	38(%esp) - %eflags
 *	3C(%esp) - %oldesp
 *	40(%esp) - %oldss
 */

#include <linux/linkage.h>
#include <linux/err.h>
#include <asm/thread_info.h>
#include <asm/irqflags.h>
#include <asm/errno.h>
#include <asm/segment.h>
#include <asm/smp.h>
#include <asm/page_types.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/ftrace.h>
#include <asm/irq_vectors.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/export.h>
#include <asm/frame.h>

	.section .entry.text, "ax"

/*
 * We use macros for low-level operations which need to be overridden
 * for paravirtualization.  The following will never clobber any registers:
 *   INTERRUPT_RETURN (aka. "iret")
 *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
 *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
 *
 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
 * Allowing a register to be clobbered can shrink the paravirt replacement
 * enough to patch inline, increasing performance.
 */

#ifdef CONFIG_PREEMPT
# define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
# define preempt_stop(clobbers)
# define resume_kernel		restore_all
#endif

.macro TRACE_IRQS_IRET
#ifdef CONFIG_TRACE_IRQFLAGS
	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)     # interrupts off?
	jz	1f
	TRACE_IRQS_ON
1:
#endif
.endm

/*
 * User gs save/restore
 *
 * %gs is used for userland TLS and the kernel only uses it for the
 * stack canary, which gcc requires to be at %gs:20.  Read the comment
 * at the top of stackprotector.h for more info.
 *
 * Local labels 98 and 99 are used.
 */
#ifdef CONFIG_X86_32_LAZY_GS

 /* unfortunately push/pop can't be no-op */
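 /*
  * With lazy GS, pt_regs only gets a placeholder slot here; the real
  * user %gs is left untouched on kernel entry and is saved/restored
  * lazily at context-switch time.
  */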
.macro PUSH_GS
	pushl	$0
.endm
.macro POP_GS pop=0
	addl	$(4 + \pop), %esp
.endm
.macro POP_GS_EX
.endm

 /* all the rest are no-op */
.macro PTGS_TO_GS
.endm
.macro PTGS_TO_GS_EX
.endm
.macro GS_TO_REG reg
.endm
.macro REG_TO_PTGS reg
.endm
.macro SET_KERNEL_GS reg
.endm

#else	/* CONFIG_X86_32_LAZY_GS */

.macro PUSH_GS
	pushl	%gs
.endm

.macro POP_GS pop=0
98:	popl	%gs
  .if \pop <> 0
	add	$\pop, %esp
  .endif
.endm
.macro POP_GS_EX
.pushsection .fixup, "ax"
99:	movl	$0, (%esp)
	jmp	98b
.popsection
	_ASM_EXTABLE(98b, 99b)
.endm

.macro PTGS_TO_GS
98:	mov	PT_GS(%esp), %gs
.endm
.macro PTGS_TO_GS_EX
.pushsection .fixup, "ax"
99:	movl	$0, PT_GS(%esp)
	jmp	98b
.popsection
	_ASM_EXTABLE(98b, 99b)
.endm

.macro GS_TO_REG reg
	movl	%gs, \reg
.endm
.macro REG_TO_PTGS reg
	movl	\reg, PT_GS(%esp)
.endm
.macro SET_KERNEL_GS reg
	movl	$(__KERNEL_STACK_CANARY), \reg
	movl	\reg, %gs
.endm

#endif /* CONFIG_X86_32_LAZY_GS */

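/*
 * SAVE_ALL builds the kernel-mode pt_regs frame.  The push order below
 * must match the stack layout documented at the top of this file.
 */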
.macro SAVE_ALL pt_regs_ax=%eax
	cld
	PUSH_GS
	pushl	%fs
	pushl	%es
	pushl	%ds
	pushl	\pt_regs_ax
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%edx
	pushl	%ecx
	pushl	%ebx
	movl	$(__USER_DS), %edx
	movl	%edx, %ds
	movl	%edx, %es
	movl	$(__KERNEL_PERCPU), %edx
	movl	%edx, %fs
	SET_KERNEL_GS %edx
.endm

.macro RESTORE_INT_REGS
	popl	%ebx
	popl	%ecx
	popl	%edx
	popl	%esi
	popl	%edi
	popl	%ebp
	popl	%eax
.endm

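/*
 * The segment-register pops below can fault if userspace left a bogus
 * selector in pt_regs.  Each pop has a .fixup entry that zeroes the
 * saved selector on the stack and retries, so a bad user segment
 * cannot take down the kernel on the way out.
 */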
.macro RESTORE_REGS pop=0
	RESTORE_INT_REGS
1:	popl	%ds
2:	popl	%es
3:	popl	%fs
	POP_GS \pop
.pushsection .fixup, "ax"
4:	movl	$0, (%esp)
	jmp	1b
5:	movl	$0, (%esp)
	jmp	2b
6:	movl	$0, (%esp)
	jmp	3b
.popsection
	_ASM_EXTABLE(1b, 4b)
	_ASM_EXTABLE(2b, 5b)
	_ASM_EXTABLE(3b, 6b)
	POP_GS_EX
.endm

/*
 * %eax: prev task
 * %edx: next task
 */
ENTRY(__switch_to_asm)
	/*
	 * Save callee-saved registers
	 * This must match the order in struct inactive_task_frame
	 */
	pushl	%ebp
	pushl	%ebx
	pushl	%edi
	pushl	%esi

	/* switch stack */
	movl	%esp, TASK_threadsp(%eax)
	movl	TASK_threadsp(%edx), %esp

#ifdef CONFIG_CC_STACKPROTECTOR
	movl	TASK_stack_canary(%edx), %ebx
	movl	%ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
#endif

	/* restore callee-saved registers */
	popl	%esi
	popl	%edi
	popl	%ebx
	popl	%ebp

	jmp	__switch_to
END(__switch_to_asm)

/*
 * The unwinder expects the last frame on the stack to always be at the same
 * offset from the end of the page, which allows it to validate the stack.
 * Calling schedule_tail() directly would break that convention because it's an
 * asmlinkage function so its argument has to be pushed on the stack.  This
 * wrapper creates a proper "end of stack" frame header before the call.
 */
ENTRY(schedule_tail_wrapper)
	FRAME_BEGIN

	pushl	%eax
	call	schedule_tail
	popl	%eax

	FRAME_END
	ret
ENDPROC(schedule_tail_wrapper)
/*
 * A newly forked process directly context switches into this address.
 *
 * eax: prev task we switched from
 * ebx: kernel thread func (NULL for user thread)
 * edi: kernel thread arg
 */
ENTRY(ret_from_fork)
	call	schedule_tail_wrapper

	testl	%ebx, %ebx
	jnz	1f		/* kernel threads are uncommon */

2:
	/* When we fork, we trace the syscall return in the child, too. */
	movl    %esp, %eax
	call    syscall_return_slowpath
	jmp     restore_all

	/* kernel thread */
1:	movl	%edi, %eax
	call	*%ebx
	/*
	 * A kernel thread is allowed to return here after successfully
	 * calling do_execve().  Exit to userspace to complete the execve()
	 * syscall.
	 */
	movl	$0, PT_EAX(%esp)
	jmp	2b
END(ret_from_fork)

/*
 * Return to user mode is not as complex as all this looks,
 * but we want the default path for a system call return to
 * go as quickly as possible, which is why some of this is
 * less clear than it otherwise should be.
 */

	# userspace resumption stub bypassing syscall exit tracing
	ALIGN
ret_from_exception:
	preempt_stop(CLBR_ANY)
ret_from_intr:
#ifdef CONFIG_VM86
	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS and CS
	movb	PT_CS(%esp), %al
	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
#else
	/*
	 * We can be coming here from a child spawned by kernel_thread().
	 */
	movl	PT_CS(%esp), %eax
	andl	$SEGMENT_RPL_MASK, %eax
#endif
	cmpl	$USER_RPL, %eax
	jb	resume_kernel			# not returning to v8086 or userspace

ENTRY(resume_userspace)
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_OFF
	movl	%esp, %eax
	call	prepare_exit_to_usermode
	jmp	restore_all
END(ret_from_exception)

#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
	DISABLE_INTERRUPTS(CLBR_ANY)
.Lneed_resched:
	cmpl	$0, PER_CPU_VAR(__preempt_count)
	jnz	restore_all
	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)	# interrupts off (exception path) ?
	jz	restore_all
	call	preempt_schedule_irq
	jmp	.Lneed_resched
END(resume_kernel)
#endif

GLOBAL(__begin_SYSENTER_singlestep_region)
/*
 * All code from here through __end_SYSENTER_singlestep_region is subject
 * to being single-stepped if a user program sets TF and executes SYSENTER.
 * There is absolutely nothing that we can do to prevent this from happening
 * (thanks Intel!).  To keep our handling of this situation as simple as
 * possible, we handle TF just like AC and NT, except that our #DB handler
 * will ignore all of the single-step traps generated in this range.
 */

#ifdef CONFIG_XEN
/*
 * Xen doesn't set %esp to be precisely what the normal SYSENTER
 * entry point expects, so fix it up before using the normal path.
 */
ENTRY(xen_sysenter_target)
	addl	$5*4, %esp			/* remove xen-provided frame */
	jmp	.Lsysenter_past_esp
#endif

/*
 * 32-bit SYSENTER entry.
 *
 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
 * if X86_FEATURE_SEP is available.  This is the preferred system call
 * entry on 32-bit systems.
 *
 * The SYSENTER instruction, in principle, should *only* occur in the
 * vDSO.  In practice, a small number of Android devices were shipped
 * with a copy of Bionic that inlined a SYSENTER instruction.  This
 * never happened in any of Google's Bionic versions -- it only happened
 * in a narrow range of Intel-provided versions.
 *
 * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
 * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
 * SYSENTER does not save anything on the stack,
 * and does not save old EIP (!!!), ESP, or EFLAGS.
 *
 * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
 * user and/or vm86 state), we explicitly disable the SYSENTER
 * instruction in vm86 mode by reprogramming the MSRs.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  user stack
 * 0(%ebp) arg6
 */
ENTRY(entry_SYSENTER_32)
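	/*
	 * SYSENTER left %esp pointing at the tiny per-CPU SYSENTER stack
	 * inside the TSS (programmed via MSR_IA32_SYSENTER_ESP); switch
	 * to the task's real kernel stack from tss.sp0 before building
	 * the pt_regs frame.
	 */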
	movl	TSS_sysenter_sp0(%esp), %esp
.Lsysenter_past_esp:
	pushl	$__USER_DS		/* pt_regs->ss */
	pushl	%ebp			/* pt_regs->sp (stashed in bp) */
	pushfl				/* pt_regs->flags (except IF = 0) */
	orl	$X86_EFLAGS_IF, (%esp)	/* Fix IF */
	pushl	$__USER_CS		/* pt_regs->cs */
	pushl	$0			/* pt_regs->ip = 0 (placeholder) */
	pushl	%eax			/* pt_regs->orig_ax */
	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest */

	/*
	 * SYSENTER doesn't filter flags, so we need to clear NT, AC
	 * and TF ourselves.  To save a few cycles, we can check whether
	 * either was set instead of doing an unconditional popfl.
	 * This needs to happen before enabling interrupts so that
	 * we don't get preempted with NT set.
	 *
	 * If TF is set, we will single-step all the way to here -- do_debug
	 * will ignore all the traps.  (Yes, this is slow, but so is
	 * single-stepping in general.  This allows us to avoid having
	 * more complicated code to handle the case where a user program
	 * forces us to single-step through the SYSENTER entry code.)
	 *
	 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
	 * out-of-line as an optimization: NT is unlikely to be set in the
	 * majority of the cases and instead of polluting the I$ unnecessarily,
	 * we're keeping that code behind a branch which will predict as
	 * not-taken and therefore its instructions won't be fetched.
	 */
	testl	$X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
	jnz	.Lsysenter_fix_flags
.Lsysenter_flags_fixed:

	/*
	 * User mode is traced as though IRQs are on, and SYSENTER
	 * turned them off.
	 */
	TRACE_IRQS_OFF

	movl	%esp, %eax
	call	do_fast_syscall_32
	/* XEN PV guests always use IRET path */
	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV

/* Opportunistic SYSEXIT */
	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
	movl	PT_EIP(%esp), %edx	/* pt_regs->ip */
	movl	PT_OLDESP(%esp), %ecx	/* pt_regs->sp */
1:	mov	PT_FS(%esp), %fs
	PTGS_TO_GS
	popl	%ebx			/* pt_regs->bx */
	addl	$2*4, %esp		/* skip pt_regs->cx and pt_regs->dx */
	popl	%esi			/* pt_regs->si */
	popl	%edi			/* pt_regs->di */
	popl	%ebp			/* pt_regs->bp */
	popl	%eax			/* pt_regs->ax */

	/*
	 * Restore all flags except IF. (We restore IF separately because
	 * STI gives a one-instruction window in which we won't be interrupted,
	 * whereas POPF does not.)
	 */
	addl	$PT_EFLAGS-PT_DS, %esp	/* point esp at pt_regs->flags */
	btr	$X86_EFLAGS_IF_BIT, (%esp)
	popfl

	/*
	 * Return back to the vDSO, which will pop ecx and edx.
	 * Don't bother with DS and ES (they already contain __USER_DS).
	 */
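	/*
	 * SYSEXIT resumes userspace at %edx with %esp = %ecx; CS and SS
	 * are derived from the SYSENTER_CS MSR, which is why EIP and ESP
	 * were loaded into %edx/%ecx above.
	 */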
	sti
	sysexit

.pushsection .fixup, "ax"
2:	movl	$0, PT_FS(%esp)
	jmp	1b
.popsection
	_ASM_EXTABLE(1b, 2b)
	PTGS_TO_GS_EX

.Lsysenter_fix_flags:
	pushl	$X86_EFLAGS_FIXED
	popfl
	jmp	.Lsysenter_flags_fixed
GLOBAL(__end_SYSENTER_singlestep_region)
ENDPROC(entry_SYSENTER_32)

/*
 * 32-bit legacy system call entry.
 *
 * 32-bit x86 Linux system calls traditionally used the INT $0x80
 * instruction.  INT $0x80 lands here.
 *
 * This entry point can be used by any 32-bit program to perform
 * system calls.  Instances of INT $0x80 can be found inline in
 * various programs and libraries.  It is also used by the vDSO's
 * __kernel_vsyscall fallback for hardware that doesn't support a
 * faster entry method.  Restarted 32-bit system calls also fall
 * back to INT $0x80 regardless of what instruction was originally
 * used to do the system call.  (64-bit programs can use INT $0x80
 * as well, but they can only run on 64-bit kernels and therefore
 * land in entry_INT80_compat.)
 *
 * This is considered a slow path.  It is not used by most libc
 * implementations on modern hardware except during process startup.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  arg6
 */
ENTRY(entry_INT80_32)
	ASM_CLAC
	pushl	%eax			/* pt_regs->orig_ax */
	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest */

	/*
	 * User mode is traced as though IRQs are on, and the interrupt gate
	 * turned them off.
	 */
	TRACE_IRQS_OFF

	movl	%esp, %eax
	call	do_int80_syscall_32
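	/*
	 * .Lsyscall_32_done is shared with entry_SYSENTER_32 above, which
	 * jumps here when the fast SYSEXIT return path cannot be used.
	 */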
.Lsyscall_32_done:

restore_all:
	TRACE_IRQS_IRET
.Lrestore_all_notrace:
#ifdef CONFIG_X86_ESPFIX32
	ALTERNATIVE	"jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX

	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS, SS and CS
	/*
	 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
	 * are returning to the kernel.
	 * See comments in process.c:copy_thread() for details.
	 */
	movb	PT_OLDSS(%esp), %ah
	movb	PT_CS(%esp), %al
	andl	$(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
	cmpl	$((SEGMENT_LDT << 8) | USER_RPL), %eax
	je .Lldt_ss				# returning to user-space with LDT SS
#endif
.Lrestore_nocheck:
	RESTORE_REGS 4				# skip orig_eax/error_code
.Lirq_return:
	INTERRUPT_RETURN

.section .fixup, "ax"
ENTRY(iret_exc)
	pushl	$0				# no error code
	pushl	$do_iret_error
	jmp	common_exception
.previous
	_ASM_EXTABLE(.Lirq_return, iret_exc)

#ifdef CONFIG_X86_ESPFIX32
.Lldt_ss:
/*
 * Set up and switch to the ESPFIX stack
 *
 * We're returning to userspace with a 16-bit stack. The CPU will not
 * restore the high word of ESP for us on executing iret... This is an
 * "official" bug of all the x86-compatible CPUs, which we can work
 * around to make dosemu and wine happy. We do this by preloading the
 * high word of ESP with the high word of the userspace ESP while
 * compensating for the offset by changing to the ESPFIX segment with
 * a base address that accounts for the difference.
 */
#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
	mov	%esp, %edx			/* load kernel esp */
	mov	PT_OLDESP(%esp), %eax		/* load userspace esp */
	mov	%dx, %ax			/* eax: new kernel esp */
	sub	%eax, %edx			/* offset (low word is 0) */
	shr	$16, %edx
	mov	%dl, GDT_ESPFIX_SS + 4		/* bits 16..23 */
	mov	%dh, GDT_ESPFIX_SS + 7		/* bits 24..31 */
	pushl	$__ESPFIX_SS
	pushl	%eax				/* new kernel esp */
	/*
	 * Disable interrupts, but do not irqtrace this section: we
	 * will soon execute iret and the tracer was already set to
	 * the irqstate after the IRET:
	 */
	DISABLE_INTERRUPTS(CLBR_EAX)
	lss	(%esp), %esp			/* switch to espfix segment */
	jmp	.Lrestore_nocheck
#endif
ENDPROC(entry_INT80_32)

.macro FIXUP_ESPFIX_STACK
/*
 * Switch back from the ESPFIX stack to the normal zero-based stack
 *
 * We can't call C functions using the ESPFIX stack. This code reads
 * the high word of the segment base from the GDT and switches to the
 * normal stack and adjusts ESP with the matching offset.
 */
#ifdef CONFIG_X86_ESPFIX32
	/* fixup the stack */
	mov	GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
	mov	GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
	shl	$16, %eax
	addl	%esp, %eax			/* the adjusted stack pointer */
	pushl	$__KERNEL_DS
	pushl	%eax
	lss	(%esp), %esp			/* switch to the normal stack segment */
#endif
.endm
.macro UNWIND_ESPFIX_STACK
#ifdef CONFIG_X86_ESPFIX32
	movl	%ss, %eax
	/* see if on espfix stack */
	cmpw	$__ESPFIX_SS, %ax
	jne	27f
	movl	$__KERNEL_DS, %eax
	movl	%eax, %ds
	movl	%eax, %es
	/* switch to normal stack */
	FIXUP_ESPFIX_STACK
27:
#endif
.endm

/*
 * Build the entry stubs with some assembler magic.
 * We pack 1 stub into every 8-byte block.
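 * Each stub pushes its vector one's-complemented and biased by 0x80
 * (so the push immediate fits in a signed byte) and jumps to
 * common_interrupt, which removes the bias before calling do_IRQ.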
 */
	.align 8
ENTRY(irq_entries_start)
    vector=FIRST_EXTERNAL_VECTOR
    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
	pushl	$(~vector+0x80)			/* Note: always in signed byte range */
    vector=vector+1
	jmp	common_interrupt
	.align	8
    .endr
END(irq_entries_start)

/*
 * the CPU automatically disables interrupts when executing an IRQ vector,
 * so IRQ-flags tracing has to follow that:
 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	ASM_CLAC
	addl	$-0x80, (%esp)			/* Adjust vector into the [-256, -1] range */
	SAVE_ALL
	TRACE_IRQS_OFF
	movl	%esp, %eax
	call	do_IRQ
	jmp	ret_from_intr
ENDPROC(common_interrupt)

#define BUILD_INTERRUPT3(name, nr, fn)	\
ENTRY(name)				\
	ASM_CLAC;			\
	pushl	$~(nr);			\
	SAVE_ALL;			\
	TRACE_IRQS_OFF			\
	movl	%esp, %eax;		\
	call	fn;			\
	jmp	ret_from_intr;		\
ENDPROC(name)


#ifdef CONFIG_TRACING
# define TRACE_BUILD_INTERRUPT(name, nr)	BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
#else
# define TRACE_BUILD_INTERRUPT(name, nr)
#endif

#define BUILD_INTERRUPT(name, nr)		\
	BUILD_INTERRUPT3(name, nr, smp_##name);	\
	TRACE_BUILD_INTERRUPT(name, nr)

/* The include is where all of the SMP etc. interrupts come from */
#include <asm/entry_arch.h>

ENTRY(coprocessor_error)
	ASM_CLAC
	pushl	$0
	pushl	$do_coprocessor_error
	jmp	common_exception
END(coprocessor_error)

ENTRY(simd_coprocessor_error)
	ASM_CLAC
	pushl	$0
#ifdef CONFIG_X86_INVD_BUG
	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
	ALTERNATIVE "pushl	$do_general_protection",	\
		    "pushl	$do_simd_coprocessor_error",	\
		    X86_FEATURE_XMM
#else
	pushl	$do_simd_coprocessor_error
#endif
	jmp	common_exception
END(simd_coprocessor_error)

ENTRY(device_not_available)
	ASM_CLAC
	pushl	$-1				# mark this as an int
	pushl	$do_device_not_available
	jmp	common_exception
END(device_not_available)

#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
	iret
	_ASM_EXTABLE(native_iret, iret_exc)
END(native_iret)
#endif

ENTRY(overflow)
	ASM_CLAC
	pushl	$0
	pushl	$do_overflow
	jmp	common_exception
END(overflow)

ENTRY(bounds)
	ASM_CLAC
	pushl	$0
	pushl	$do_bounds
	jmp	common_exception
END(bounds)

ENTRY(invalid_op)
	ASM_CLAC
	pushl	$0
	pushl	$do_invalid_op
	jmp	common_exception
END(invalid_op)

ENTRY(coprocessor_segment_overrun)
	ASM_CLAC
	pushl	$0
	pushl	$do_coprocessor_segment_overrun
	jmp	common_exception
END(coprocessor_segment_overrun)

ENTRY(invalid_TSS)
	ASM_CLAC
	pushl	$do_invalid_TSS
	jmp	common_exception
END(invalid_TSS)

ENTRY(segment_not_present)
	ASM_CLAC
	pushl	$do_segment_not_present
	jmp	common_exception
END(segment_not_present)

ENTRY(stack_segment)
	ASM_CLAC
	pushl	$do_stack_segment
	jmp	common_exception
END(stack_segment)

ENTRY(alignment_check)
	ASM_CLAC
	pushl	$do_alignment_check
	jmp	common_exception
END(alignment_check)

ENTRY(divide_error)
	ASM_CLAC
	pushl	$0				# no error code
	pushl	$do_divide_error
	jmp	common_exception
END(divide_error)

#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
	ASM_CLAC
	pushl	$0
	pushl	machine_check_vector
	jmp	common_exception
END(machine_check)
#endif

ENTRY(spurious_interrupt_bug)
	ASM_CLAC
	pushl	$0
	pushl	$do_spurious_interrupt_bug
	jmp	common_exception
END(spurious_interrupt_bug)

#ifdef CONFIG_XEN
ENTRY(xen_hypervisor_callback)
	pushl	$-1				/* orig_ax = -1 => not a system call */
	SAVE_ALL
	TRACE_IRQS_OFF

	/*
	 * Check to see if we got the event in the critical
	 * region in xen_iret_direct, after we've reenabled
	 * events and checked for pending events.  This simulates
	 * the iret instruction's behaviour where it delivers a
	 * pending interrupt when enabling interrupts:
	 */
	movl	PT_EIP(%esp), %eax
	cmpl	$xen_iret_start_crit, %eax
	jb	1f
	cmpl	$xen_iret_end_crit, %eax
	jae	1f

	jmp	xen_iret_crit_fixup

ENTRY(xen_do_upcall)
1:	mov	%esp, %eax
	call	xen_evtchn_do_upcall
#ifndef CONFIG_PREEMPT
	call	xen_maybe_preempt_hcall
#endif
	jmp	ret_from_intr
ENDPROC(xen_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we fix up by reattempting the load, and zeroing the segment
 * register if the load fails.
 * Category 2 we fix up by jumping to do_iret_error. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by maintaining a status value in EAX.
 */
ENTRY(xen_failsafe_callback)
	pushl	%eax
	movl	$1, %eax
1:	mov	4(%esp), %ds
2:	mov	8(%esp), %es
3:	mov	12(%esp), %fs
4:	mov	16(%esp), %gs
	/* EAX == 0 => Category 1 (Bad segment)
	   EAX != 0 => Category 2 (Bad IRET) */
	testl	%eax, %eax
	popl	%eax
	lea	16(%esp), %esp
	jz	5f
	jmp	iret_exc
5:	pushl	$-1				/* orig_ax = -1 => not a system call */
	SAVE_ALL
	jmp	ret_from_exception

.section .fixup, "ax"
6:	xorl	%eax, %eax
	movl	%eax, 4(%esp)
	jmp	1b
7:	xorl	%eax, %eax
	movl	%eax, 8(%esp)
	jmp	2b
8:	xorl	%eax, %eax
	movl	%eax, 12(%esp)
	jmp	3b
9:	xorl	%eax, %eax
	movl	%eax, 16(%esp)
	jmp	4b
.previous
	_ASM_EXTABLE(1b, 6b)
	_ASM_EXTABLE(2b, 7b)
	_ASM_EXTABLE(3b, 8b)
	_ASM_EXTABLE(4b, 9b)
ENDPROC(xen_failsafe_callback)

BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
		xen_evtchn_do_upcall)

#endif /* CONFIG_XEN */

#if IS_ENABLED(CONFIG_HYPERV)

BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
	hyperv_vector_handler)

#endif /* CONFIG_HYPERV */

#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE

ENTRY(mcount)
	ret
END(mcount)

ENTRY(ftrace_caller)
	pushl	%eax
	pushl	%ecx
	pushl	%edx
	pushl	$0				/* Pass NULL as regs pointer */
	movl	4*4(%esp), %eax
	movl	0x4(%ebp), %edx
	movl	function_trace_op, %ecx
	subl	$MCOUNT_INSN_SIZE, %eax

.globl ftrace_call
ftrace_call:
	call	ftrace_stub

	addl	$4, %esp			/* skip NULL pointer */
	popl	%edx
	popl	%ecx
	popl	%eax
902
.Lftrace_ret:
903 904 905
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.globl ftrace_graph_call
ftrace_graph_call:
906
	jmp	ftrace_stub
907
#endif
908 909 910 911 912 913

.globl ftrace_stub
ftrace_stub:
	ret
END(ftrace_caller)

914 915 916 917 918 919 920 921 922 923
ENTRY(ftrace_regs_caller)
	pushf	/* push flags before compare (in cs location) */

	/*
	 * i386 does not save SS and ESP when coming from kernel.
	 * Instead, to get sp, &regs->sp is used (see ptrace.h).
	 * Unfortunately, that means eflags must be at the same location
	 * as the current return ip is. We move the return ip into the
	 * ip location, and move flags into the return ip location.
	 */
	pushl	4(%esp)				/* save return ip into ip slot */

	pushl	$0				/* Load 0 into orig_ax */
	pushl	%gs
	pushl	%fs
	pushl	%es
	pushl	%ds
	pushl	%eax
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%edx
	pushl	%ecx
	pushl	%ebx

	movl	13*4(%esp), %eax		/* Get the saved flags */
	movl	%eax, 14*4(%esp)		/* Move saved flags into regs->flags location */
						/* clobbering return ip */
	movl	$__KERNEL_CS, 13*4(%esp)

	movl	12*4(%esp), %eax		/* Load ip (1st parameter) */
	subl	$MCOUNT_INSN_SIZE, %eax		/* Adjust ip */
	movl	0x4(%ebp), %edx			/* Load parent ip (2nd parameter) */
	movl	function_trace_op, %ecx		/* Save ftrace_pos in 3rd parameter */
	pushl	%esp				/* Save pt_regs as 4th parameter */

GLOBAL(ftrace_regs_call)
	call	ftrace_stub

	addl	$4, %esp			/* Skip pt_regs */
	movl	14*4(%esp), %eax		/* Move flags back into cs */
	movl	%eax, 13*4(%esp)		/* Needed to keep addl	from modifying flags */
	movl	12*4(%esp), %eax		/* Get return ip from regs->ip */
	movl	%eax, 14*4(%esp)		/* Put return ip back for ret */

	popl	%ebx
	popl	%ecx
	popl	%edx
	popl	%esi
	popl	%edi
	popl	%ebp
	popl	%eax
	popl	%ds
	popl	%es
	popl	%fs
	popl	%gs
	addl	$8, %esp			/* Skip orig_ax and ip */
	popf					/* Pop flags at end (no addl to corrupt flags) */
	jmp	.Lftrace_ret

	popf
	jmp	ftrace_stub
#else /* ! CONFIG_DYNAMIC_FTRACE */

ENTRY(mcount)
	cmpl	$__PAGE_OFFSET, %esp
	jb	ftrace_stub			/* Paging not enabled yet? */

	cmpl	$ftrace_stub, ftrace_trace_function
	jnz	.Ltrace
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	cmpl	$ftrace_stub, ftrace_graph_return
	jnz	ftrace_graph_caller

	cmpl	$ftrace_graph_entry_stub, ftrace_graph_entry
	jnz	ftrace_graph_caller
#endif
.globl ftrace_stub
ftrace_stub:
	ret

	/* taken from glibc */
.Ltrace:
	pushl	%eax
	pushl	%ecx
	pushl	%edx
	movl	0xc(%esp), %eax
	movl	0x4(%ebp), %edx
	subl	$MCOUNT_INSN_SIZE, %eax

	call	*ftrace_trace_function

	popl	%edx
	popl	%ecx
	popl	%eax
	jmp	ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
EXPORT_SYMBOL(mcount)
#endif /* CONFIG_FUNCTION_TRACER */

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
	pushl	%eax
	pushl	%ecx
	pushl	%edx
	movl	0xc(%esp), %eax
	lea	0x4(%ebp), %edx
	movl	(%ebp), %ecx
	subl	$MCOUNT_INSN_SIZE, %eax
	call	prepare_ftrace_return
	popl	%edx
	popl	%ecx
	popl	%eax
	ret
END(ftrace_graph_caller)

.globl return_to_handler
return_to_handler:
	pushl	%eax
	pushl	%edx
	movl	%ebp, %eax
	call	ftrace_return_to_handler
	movl	%eax, %ecx
	popl	%edx
	popl	%eax
	jmp	*%ecx
#endif

#ifdef CONFIG_TRACING
ENTRY(trace_page_fault)
	ASM_CLAC
	pushl	$trace_do_page_fault
	jmp	common_exception
END(trace_page_fault)
#endif

ENTRY(page_fault)
	ASM_CLAC
	pushl	$do_page_fault
	ALIGN
	jmp common_exception
END(page_fault)

common_exception:
	/* the function address is in %gs's slot on the stack */
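	/*
	 * Each exception stub above pushed an optional error code and the
	 * address of its C handler, so the handler address now sits where
	 * SAVE_ALL would have put %gs.  The pushes below complete a pt_regs
	 * frame by hand; the handler address is then read out of the PT_GS
	 * slot into %edi and the slot is given a real value.
	 */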
	pushl	%fs
	pushl	%es
	pushl	%ds
	pushl	%eax
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%edx
	pushl	%ecx
	pushl	%ebx
	cld
	movl	$(__KERNEL_PERCPU), %ecx
	movl	%ecx, %fs
	UNWIND_ESPFIX_STACK
	GS_TO_REG %ecx
	movl	PT_GS(%esp), %edi		# get the function address
	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
	REG_TO_PTGS %ecx
	SET_KERNEL_GS %ecx
	movl	$(__USER_DS), %ecx
	movl	%ecx, %ds
	movl	%ecx, %es
	TRACE_IRQS_OFF
	movl	%esp, %eax			# pt_regs pointer
	call	*%edi
	jmp	ret_from_exception
END(common_exception)

ENTRY(debug)
	/*
	 * #DB can happen at the first instruction of
	 * entry_SYSENTER_32 or in Xen's SYSENTER prologue.  If this
	 * happens, then we will be running on a very small stack.  We
	 * need to detect this condition and switch to the thread
	 * stack before calling any C code at all.
	 *
	 * If you edit this code, keep in mind that NMIs can happen in here.
	 */
	ASM_CLAC
	pushl	$-1				# mark this as an int
	SAVE_ALL
	xorl	%edx, %edx			# error code 0
	movl	%esp, %eax			# pt_regs pointer

	/* Are we currently on the SYSENTER stack? */
	PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
	cmpl	$SIZEOF_SYSENTER_stack, %ecx
	jb	.Ldebug_from_sysenter_stack

	TRACE_IRQS_OFF
	call	do_debug
	jmp	ret_from_exception

.Ldebug_from_sysenter_stack:
	/* We're on the SYSENTER stack.  Switch off. */
	movl	%esp, %ebp
	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esp
	TRACE_IRQS_OFF
	call	do_debug
	movl	%ebp, %esp
	jmp	ret_from_exception
END(debug)

/*
 * NMI is doubly nasty.  It can happen on the first instruction of
 * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
 * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
 * switched stacks.  We handle both conditions by simply checking whether we
 * interrupted kernel code running on the SYSENTER stack.
1131 1132
 */
ENTRY(nmi)
1133
	ASM_CLAC
1134
#ifdef CONFIG_X86_ESPFIX32
1135 1136 1137 1138
	pushl	%eax
	movl	%ss, %eax
	cmpw	$__ESPFIX_SS, %ax
	popl	%eax
1139
	je	.Lnmi_espfix_stack
1140
#endif
1141 1142

	pushl	%eax				# pt_regs->orig_ax
1143
	SAVE_ALL
1144 1145
	xorl	%edx, %edx			# zero error code
	movl	%esp, %eax			# pt_regs pointer
1146 1147 1148 1149 1150 1151 1152 1153

	/* Are we currently on the SYSENTER stack? */
	PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
	cmpl	$SIZEOF_SYSENTER_stack, %ecx
	jb	.Lnmi_from_sysenter_stack

	/* Not on SYSENTER stack. */
	call	do_nmi
	jmp	.Lrestore_all_notrace

.Lnmi_from_sysenter_stack:
	/*
	 * We're on the SYSENTER stack.  Switch off.  No one (not even debug)
	 * is using the thread stack right now, so it's safe for us to use it.
	 */
	movl	%esp, %ebp
	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esp
	call	do_nmi
	movl	%ebp, %esp
	jmp	.Lrestore_all_notrace

#ifdef CONFIG_X86_ESPFIX32
.Lnmi_espfix_stack:
	/*
	 * Create an ss:esp pair pointing back at the espfix stack so that
	 * we can lss back to it once the NMI has been handled.
	 */
	pushl	%ss
	pushl	%esp
	addl	$4, (%esp)
	/* copy the iret frame of 12 bytes */
	.rept 3
	pushl	16(%esp)
	.endr
	pushl	%eax
	SAVE_ALL
	FIXUP_ESPFIX_STACK			# %eax == %esp
	xorl	%edx, %edx			# zero error code
	call	do_nmi
	RESTORE_REGS
	lss	12+4(%esp), %esp		# back to espfix stack
	jmp	.Lirq_return
#endif
END(nmi)

ENTRY(int3)
	ASM_CLAC
	pushl	$-1				# mark this as an int
	SAVE_ALL
	TRACE_IRQS_OFF
	xorl	%edx, %edx			# zero error code
	movl	%esp, %eax			# pt_regs pointer
	call	do_int3
	jmp	ret_from_exception
END(int3)

ENTRY(general_protection)
	pushl	$do_general_protection
	jmp	common_exception
END(general_protection)

#ifdef CONFIG_KVM_GUEST
ENTRY(async_page_fault)
	ASM_CLAC
	pushl	$do_async_page_fault
	jmp	common_exception
END(async_page_fault)
#endif

ENTRY(rewind_stack_do_exit)
	/* Prevent any naive code from trying to unwind to our caller. */
	xorl	%ebp, %ebp

	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esi
	leal	-TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp

	call	do_exit
1:	jmp 1b
END(rewind_stack_do_exit)