entry_32.S 33.3 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/*
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * entry.S contains the system-call and fault low-level handling routines.
 * This also contains the timer-interrupt handler, as well as all interrupts
 * and faults that can result in a task-switch.
 *
 * NOTE: This code handles signal-recognition, which happens every time
 * after a timer-interrupt and after each system call.
 *
 * I changed all the .align's to 4 (16 byte alignment), as that's faster
 * on a 486.
 *
17
 * Stack layout in 'syscall_exit':
L
Linus Torvalds 已提交
18 19 20 21 22 23 24 25 26 27 28 29 30 31
 * 	ptrace needs to have all regs on the stack.
 *	if the order here is changed, it needs to be
 *	updated in fork.c:copy_process, signal.c:do_signal,
 *	ptrace.c and ptrace.h
 *
 *	 0(%esp) - %ebx
 *	 4(%esp) - %ecx
 *	 8(%esp) - %edx
 *       C(%esp) - %esi
 *	10(%esp) - %edi
 *	14(%esp) - %ebp
 *	18(%esp) - %eax
 *	1C(%esp) - %ds
 *	20(%esp) - %es
32
 *	24(%esp) - %fs
33 34 35 36 37 38 39
 *	28(%esp) - %gs		saved iff !CONFIG_X86_32_LAZY_GS
 *	2C(%esp) - orig_eax
 *	30(%esp) - %eip
 *	34(%esp) - %cs
 *	38(%esp) - %eflags
 *	3C(%esp) - %oldesp
 *	40(%esp) - %oldss
L
Linus Torvalds 已提交
40 41 42 43 44 45
 *
 * "current" is in register %ebx during any slow entries.
 */

#include <linux/linkage.h>
#include <asm/thread_info.h>
46
#include <asm/irqflags.h>
L
Linus Torvalds 已提交
47 48 49
#include <asm/errno.h>
#include <asm/segment.h>
#include <asm/smp.h>
50
#include <asm/page_types.h>
S
Stas Sergeev 已提交
51
#include <asm/percpu.h>
52
#include <asm/dwarf2.h>
53
#include <asm/processor-flags.h>
54
#include <asm/ftrace.h>
55
#include <asm/irq_vectors.h>
56
#include <asm/cpufeature.h>
L
Linus Torvalds 已提交
57

R
Roland McGrath 已提交
58 59 60 61 62 63 64 65 66 67
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
#include <linux/elf-em.h>
#define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_LE	   0x40000000

#ifndef CONFIG_AUDITSYSCALL
#define sysenter_audit	syscall_trace_entry
#define sysexit_audit	syscall_exit_work
#endif

68 69 70 71 72
/*
 * We use macros for low-level operations which need to be overridden
 * for paravirtualization.  The following will never clobber any registers:
 *   INTERRUPT_RETURN (aka. "iret")
 *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
73
 *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
74 75 76 77 78 79 80
 *
 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
 * Allowing a register to be clobbered can shrink the paravirt replacement
 * enough to patch inline, increasing performance.
 */

L
Linus Torvalds 已提交
81 82 83
#define nr_syscalls ((syscall_table_size)/4)

#ifdef CONFIG_PREEMPT
84
#define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
L
Linus Torvalds 已提交
85
#else
86
#define preempt_stop(clobbers)
87
#define resume_kernel		restore_all
L
Linus Torvalds 已提交
88 89
#endif

90 91
.macro TRACE_IRQS_IRET
#ifdef CONFIG_TRACE_IRQFLAGS
92
	testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)     # interrupts off?
93 94 95 96 97 98
	jz 1f
	TRACE_IRQS_ON
1:
#endif
.endm

99 100 101 102 103 104
#ifdef CONFIG_VM86
#define resume_userspace_sig	check_userspace
#else
#define resume_userspace_sig	resume_userspace
#endif

105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
/*
 * User gs save/restore
 *
 * %gs is used for userland TLS and kernel only uses it for stack
 * canary which is required to be at %gs:20 by gcc.  Read the comment
 * at the top of stackprotector.h for more info.
 *
 * Local labels 98 and 99 are used.
 */
#ifdef CONFIG_X86_32_LAZY_GS

 /* unfortunately push/pop can't be no-op */
.macro PUSH_GS
	pushl $0
	CFI_ADJUST_CFA_OFFSET 4
.endm
.macro POP_GS pop=0
	addl $(4 + \pop), %esp
	CFI_ADJUST_CFA_OFFSET -(4 + \pop)
.endm
.macro POP_GS_EX
.endm

 /* all the rest are no-op */
.macro PTGS_TO_GS
.endm
.macro PTGS_TO_GS_EX
.endm
.macro GS_TO_REG reg
.endm
.macro REG_TO_PTGS reg
.endm
.macro SET_KERNEL_GS reg
.endm

#else	/* CONFIG_X86_32_LAZY_GS */

.macro PUSH_GS
	pushl %gs
	CFI_ADJUST_CFA_OFFSET 4
	/*CFI_REL_OFFSET gs, 0*/
.endm

.macro POP_GS pop=0
98:	popl %gs
	CFI_ADJUST_CFA_OFFSET -4
	/*CFI_RESTORE gs*/
  .if \pop <> 0
	add $\pop, %esp
	CFI_ADJUST_CFA_OFFSET -\pop
  .endif
.endm
.macro POP_GS_EX
.pushsection .fixup, "ax"
99:	movl $0, (%esp)
	jmp 98b
.section __ex_table, "a"
	.align 4
	.long 98b, 99b
.popsection
.endm

.macro PTGS_TO_GS
98:	mov PT_GS(%esp), %gs
.endm
.macro PTGS_TO_GS_EX
.pushsection .fixup, "ax"
99:	movl $0, PT_GS(%esp)
	jmp 98b
.section __ex_table, "a"
	.align 4
	.long 98b, 99b
.popsection
.endm

.macro GS_TO_REG reg
	movl %gs, \reg
	/*CFI_REGISTER gs, \reg*/
.endm
.macro REG_TO_PTGS reg
	movl \reg, PT_GS(%esp)
	/*CFI_REL_OFFSET gs, PT_GS*/
.endm
.macro SET_KERNEL_GS reg
189
	movl $(__KERNEL_STACK_CANARY), \reg
190 191 192 193 194
	movl \reg, %gs
.endm

#endif	/* CONFIG_X86_32_LAZY_GS */

195 196
.macro SAVE_ALL
	cld
197
	PUSH_GS
198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
	pushl %fs
	CFI_ADJUST_CFA_OFFSET 4
	/*CFI_REL_OFFSET fs, 0;*/
	pushl %es
	CFI_ADJUST_CFA_OFFSET 4
	/*CFI_REL_OFFSET es, 0;*/
	pushl %ds
	CFI_ADJUST_CFA_OFFSET 4
	/*CFI_REL_OFFSET ds, 0;*/
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET eax, 0
	pushl %ebp
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET ebp, 0
	pushl %edi
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET edi, 0
	pushl %esi
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET esi, 0
	pushl %edx
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET edx, 0
	pushl %ecx
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET ecx, 0
	pushl %ebx
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET ebx, 0
	movl $(__USER_DS), %edx
	movl %edx, %ds
	movl %edx, %es
	movl $(__KERNEL_PERCPU), %edx
232
	movl %edx, %fs
233
	SET_KERNEL_GS %edx
234
.endm
L
Linus Torvalds 已提交
235

236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256
.macro RESTORE_INT_REGS
	popl %ebx
	CFI_ADJUST_CFA_OFFSET -4
	CFI_RESTORE ebx
	popl %ecx
	CFI_ADJUST_CFA_OFFSET -4
	CFI_RESTORE ecx
	popl %edx
	CFI_ADJUST_CFA_OFFSET -4
	CFI_RESTORE edx
	popl %esi
	CFI_ADJUST_CFA_OFFSET -4
	CFI_RESTORE esi
	popl %edi
	CFI_ADJUST_CFA_OFFSET -4
	CFI_RESTORE edi
	popl %ebp
	CFI_ADJUST_CFA_OFFSET -4
	CFI_RESTORE ebp
	popl %eax
	CFI_ADJUST_CFA_OFFSET -4
257
	CFI_RESTORE eax
258
.endm
L
Linus Torvalds 已提交
259

260
.macro RESTORE_REGS pop=0
261 262 263 264 265 266 267 268 269 270
	RESTORE_INT_REGS
1:	popl %ds
	CFI_ADJUST_CFA_OFFSET -4
	/*CFI_RESTORE ds;*/
2:	popl %es
	CFI_ADJUST_CFA_OFFSET -4
	/*CFI_RESTORE es;*/
3:	popl %fs
	CFI_ADJUST_CFA_OFFSET -4
	/*CFI_RESTORE fs;*/
271
	POP_GS \pop
272 273 274 275 276 277 278 279 280 281 282 283
.pushsection .fixup, "ax"
4:	movl $0, (%esp)
	jmp 1b
5:	movl $0, (%esp)
	jmp 2b
6:	movl $0, (%esp)
	jmp 3b
.section __ex_table, "a"
	.align 4
	.long 1b, 4b
	.long 2b, 5b
	.long 3b, 6b
284
.popsection
285
	POP_GS_EX
286
.endm
L
Linus Torvalds 已提交
287

288 289 290 291 292
.macro RING0_INT_FRAME
	CFI_STARTPROC simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA esp, 3*4
	/*CFI_OFFSET cs, -2*4;*/
293
	CFI_OFFSET eip, -3*4
294
.endm
295

296 297 298 299 300
.macro RING0_EC_FRAME
	CFI_STARTPROC simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA esp, 4*4
	/*CFI_OFFSET cs, -2*4;*/
301
	CFI_OFFSET eip, -3*4
302
.endm
303

304 305 306 307 308 309 310 311 312 313 314 315 316 317
.macro RING0_PTREGS_FRAME
	CFI_STARTPROC simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
	/*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
	CFI_OFFSET eip, PT_EIP-PT_OLDESP
	/*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
	/*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
	CFI_OFFSET eax, PT_EAX-PT_OLDESP
	CFI_OFFSET ebp, PT_EBP-PT_OLDESP
	CFI_OFFSET edi, PT_EDI-PT_OLDESP
	CFI_OFFSET esi, PT_ESI-PT_OLDESP
	CFI_OFFSET edx, PT_EDX-PT_OLDESP
	CFI_OFFSET ecx, PT_ECX-PT_OLDESP
318
	CFI_OFFSET ebx, PT_EBX-PT_OLDESP
319
.endm
L
Linus Torvalds 已提交
320 321

ENTRY(ret_from_fork)
322
	CFI_STARTPROC
L
Linus Torvalds 已提交
323
	pushl %eax
324
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
325 326 327
	call schedule_tail
	GET_THREAD_INFO(%ebp)
	popl %eax
328
	CFI_ADJUST_CFA_OFFSET -4
329 330 331 332
	pushl $0x0202			# Reset kernel eflags
	CFI_ADJUST_CFA_OFFSET 4
	popfl
	CFI_ADJUST_CFA_OFFSET -4
L
Linus Torvalds 已提交
333
	jmp syscall_exit
334
	CFI_ENDPROC
335
END(ret_from_fork)
L
Linus Torvalds 已提交
336

337 338 339 340
/*
 * Interrupt exit functions should be protected against kprobes
 */
	.pushsection .kprobes.text, "ax"
L
Linus Torvalds 已提交
341 342 343 344 345 346 347 348 349
/*
 * Return to user mode is not as complex as all this looks,
 * but we want the default path for a system call return to
 * go as quickly as possible which is why some of this is
 * less clear than it otherwise should be.
 */

	# userspace resumption stub bypassing syscall exit tracing
	ALIGN
350
	RING0_PTREGS_FRAME
L
Linus Torvalds 已提交
351
ret_from_exception:
352
	preempt_stop(CLBR_ANY)
L
Linus Torvalds 已提交
353 354
ret_from_intr:
	GET_THREAD_INFO(%ebp)
355
check_userspace:
356 357
	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS and CS
	movb PT_CS(%esp), %al
358
	andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
359 360
	cmpl $USER_RPL, %eax
	jb resume_kernel		# not returning to v8086 or userspace
361

L
Linus Torvalds 已提交
362
ENTRY(resume_userspace)
363
	LOCKDEP_SYS_EXIT
364
 	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
L
Linus Torvalds 已提交
365 366
					# setting need_resched or sigpending
					# between sampling and the iret
367
	TRACE_IRQS_OFF
L
Linus Torvalds 已提交
368 369 370 371 372
	movl TI_flags(%ebp), %ecx
	andl $_TIF_WORK_MASK, %ecx	# is there any work to be done on
					# int/exception return?
	jne work_pending
	jmp restore_all
373
END(ret_from_exception)
L
Linus Torvalds 已提交
374 375 376

#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
377
	DISABLE_INTERRUPTS(CLBR_ANY)
L
Linus Torvalds 已提交
378
	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
379
	jnz restore_all
L
Linus Torvalds 已提交
380 381 382 383
need_resched:
	movl TI_flags(%ebp), %ecx	# need_resched set ?
	testb $_TIF_NEED_RESCHED, %cl
	jz restore_all
384
	testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)	# interrupts off (exception path) ?
L
Linus Torvalds 已提交
385 386 387
	jz restore_all
	call preempt_schedule_irq
	jmp need_resched
388
END(resume_kernel)
L
Linus Torvalds 已提交
389
#endif
390
	CFI_ENDPROC
391 392 393 394
/*
 * End of kprobes section
 */
	.popsection
L
Linus Torvalds 已提交
395 396 397 398 399

/* SYSENTER_RETURN points to after the "sysenter" instruction in
   the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */

	# sysenter call handler stub
R
Roland McGrath 已提交
400
ENTRY(ia32_sysenter_target)
401
	CFI_STARTPROC simple
402
	CFI_SIGNAL_FRAME
403 404
	CFI_DEF_CFA esp, 0
	CFI_REGISTER esp, ebp
405
	movl TSS_sysenter_sp0(%esp),%esp
L
Linus Torvalds 已提交
406
sysenter_past_esp:
407
	/*
408 409 410
	 * Interrupts are disabled here, but we can't trace it until
	 * enough kernel state to call TRACE_IRQS_OFF can be called - but
	 * we immediately enable interrupts at that point anyway.
411
	 */
L
Linus Torvalds 已提交
412
	pushl $(__USER_DS)
413 414
	CFI_ADJUST_CFA_OFFSET 4
	/*CFI_REL_OFFSET ss, 0*/
L
Linus Torvalds 已提交
415
	pushl %ebp
416 417
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET esp, 0
L
Linus Torvalds 已提交
418
	pushfl
419
	orl $X86_EFLAGS_IF, (%esp)
420
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
421
	pushl $(__USER_CS)
422 423
	CFI_ADJUST_CFA_OFFSET 4
	/*CFI_REL_OFFSET cs, 0*/
424 425 426 427 428 429
	/*
	 * Push current_thread_info()->sysenter_return to the stack.
	 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
	 * pushed above; +8 corresponds to copy_thread's esp0 setting.
	 */
	pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
430 431
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET eip, 0
L
Linus Torvalds 已提交
432

433 434 435 436 437
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	SAVE_ALL
	ENABLE_INTERRUPTS(CLBR_NONE)

L
Linus Torvalds 已提交
438 439 440 441 442 443 444
/*
 * Load the potential sixth argument from user stack.
 * Careful about security.
 */
	cmpl $__PAGE_OFFSET-3,%ebp
	jae syscall_fault
1:	movl (%ebp),%ebp
445
	movl %ebp,PT_EBP(%esp)
L
Linus Torvalds 已提交
446 447 448 449 450 451 452
.section __ex_table,"a"
	.align 4
	.long 1b,syscall_fault
.previous

	GET_THREAD_INFO(%ebp)

453
	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
R
Roland McGrath 已提交
454 455
	jnz sysenter_audit
sysenter_do_call:
L
Linus Torvalds 已提交
456 457 458
	cmpl $(nr_syscalls), %eax
	jae syscall_badsys
	call *sys_call_table(,%eax,4)
459
	movl %eax,PT_EAX(%esp)
460
	LOCKDEP_SYS_EXIT
461
	DISABLE_INTERRUPTS(CLBR_ANY)
462
	TRACE_IRQS_OFF
L
Linus Torvalds 已提交
463
	movl TI_flags(%ebp), %ecx
464
	testl $_TIF_ALLWORK_MASK, %ecx
R
Roland McGrath 已提交
465 466
	jne sysexit_audit
sysenter_exit:
L
Linus Torvalds 已提交
467
/* if something modifies registers it must also disable sysexit */
468 469
	movl PT_EIP(%esp), %edx
	movl PT_OLDESP(%esp), %ecx
L
Linus Torvalds 已提交
470
	xorl %ebp,%ebp
471
	TRACE_IRQS_ON
472
1:	mov  PT_FS(%esp), %fs
473
	PTGS_TO_GS
474
	ENABLE_INTERRUPTS_SYSEXIT
R
Roland McGrath 已提交
475 476 477

#ifdef CONFIG_AUDITSYSCALL
sysenter_audit:
478
	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
R
Roland McGrath 已提交
479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494
	jnz syscall_trace_entry
	addl $4,%esp
	CFI_ADJUST_CFA_OFFSET -4
	/* %esi already in 8(%esp)	   6th arg: 4th syscall arg */
	/* %edx already in 4(%esp)	   5th arg: 3rd syscall arg */
	/* %ecx already in 0(%esp)	   4th arg: 2nd syscall arg */
	movl %ebx,%ecx			/* 3rd arg: 1st syscall arg */
	movl %eax,%edx			/* 2nd arg: syscall number */
	movl $AUDIT_ARCH_I386,%eax	/* 1st arg: audit arch */
	call audit_syscall_entry
	pushl %ebx
	CFI_ADJUST_CFA_OFFSET 4
	movl PT_EAX(%esp),%eax		/* reload syscall number */
	jmp sysenter_do_call

sysexit_audit:
495
	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
R
Roland McGrath 已提交
496 497 498 499 500 501 502 503 504 505 506 507
	jne syscall_exit_work
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_ANY)
	movl %eax,%edx		/* second arg, syscall return value */
	cmpl $0,%eax		/* is it < 0? */
	setl %al		/* 1 if so, 0 if not */
	movzbl %al,%eax		/* zero-extend that */
	inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
	call audit_syscall_exit
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_OFF
	movl TI_flags(%ebp), %ecx
508
	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
R
Roland McGrath 已提交
509 510 511 512 513
	jne syscall_exit_work
	movl PT_EAX(%esp),%eax	/* reload syscall return value */
	jmp sysenter_exit
#endif

514
	CFI_ENDPROC
515
.pushsection .fixup,"ax"
516
2:	movl $0,PT_FS(%esp)
517 518 519 520 521
	jmp 1b
.section __ex_table,"a"
	.align 4
	.long 1b,2b
.popsection
522
	PTGS_TO_GS_EX
R
Roland McGrath 已提交
523
ENDPROC(ia32_sysenter_target)
L
Linus Torvalds 已提交
524

525 526 527 528
/*
 * syscall stub including irq exit should be protected against kprobes
 */
	.pushsection .kprobes.text, "ax"
L
Linus Torvalds 已提交
529 530
	# system call handler stub
ENTRY(system_call)
531
	RING0_INT_FRAME			# can't unwind into user space anyway
L
Linus Torvalds 已提交
532
	pushl %eax			# save orig_eax
533
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
534 535
	SAVE_ALL
	GET_THREAD_INFO(%ebp)
536
					# system call tracing in operation / emulation
537
	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
L
Linus Torvalds 已提交
538 539 540 541 542
	jnz syscall_trace_entry
	cmpl $(nr_syscalls), %eax
	jae syscall_badsys
syscall_call:
	call *sys_call_table(,%eax,4)
543
	movl %eax,PT_EAX(%esp)		# store the return value
L
Linus Torvalds 已提交
544
syscall_exit:
545
	LOCKDEP_SYS_EXIT
546
	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
L
Linus Torvalds 已提交
547 548
					# setting need_resched or sigpending
					# between sampling and the iret
549
	TRACE_IRQS_OFF
L
Linus Torvalds 已提交
550
	movl TI_flags(%ebp), %ecx
551
	testl $_TIF_ALLWORK_MASK, %ecx	# current->work
L
Linus Torvalds 已提交
552 553 554
	jne syscall_exit_work

restore_all:
555 556
	TRACE_IRQS_IRET
restore_all_notrace:
557 558
	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS, SS and CS
	# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
559 560
	# are returning to the kernel.
	# See comments in process.c:copy_thread() for details.
561 562
	movb PT_OLDSS(%esp), %ah
	movb PT_CS(%esp), %al
563
	andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
564
	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
565
	CFI_REMEMBER_STATE
L
Linus Torvalds 已提交
566 567
	je ldt_ss			# returning to user-space with LDT SS
restore_nocheck:
568
	RESTORE_REGS 4			# skip orig_eax/error_code
569
	CFI_ADJUST_CFA_OFFSET -4
A
Adrian Bunk 已提交
570
irq_return:
I
Ingo Molnar 已提交
571
	INTERRUPT_RETURN
L
Linus Torvalds 已提交
572
.section .fixup,"ax"
573
ENTRY(iret_exc)
574 575 576
	pushl $0			# no error code
	pushl $do_iret_error
	jmp error_code
L
Linus Torvalds 已提交
577 578 579
.previous
.section __ex_table,"a"
	.align 4
I
Ingo Molnar 已提交
580
	.long irq_return,iret_exc
L
Linus Torvalds 已提交
581 582
.previous

583
	CFI_RESTORE_STATE
L
Linus Torvalds 已提交
584
ldt_ss:
585
	larl PT_OLDSS(%esp), %eax
L
Linus Torvalds 已提交
586 587 588
	jnz restore_nocheck
	testl $0x00400000, %eax		# returning to 32bit stack?
	jnz restore_nocheck		# allright, normal return
589 590 591 592 593 594 595 596 597 598

#ifdef CONFIG_PARAVIRT
	/*
	 * The kernel can't run on a non-flat stack if paravirt mode
	 * is active.  Rather than try to fixup the high bits of
	 * ESP, bypass this code entirely.  This may break DOSemu
	 * and/or Wine support in a paravirt VM, although the option
	 * is still available to implement the setting of the high
	 * 16-bits in the INTERRUPT_RETURN paravirt-op.
	 */
599
	cmpl $0, pv_info+PARAVIRT_enabled
600 601 602
	jne restore_nocheck
#endif

603 604 605 606 607 608 609 610 611 612 613
/*
 * Setup and switch to ESPFIX stack
 *
 * We're returning to userspace with a 16 bit stack. The CPU will not
 * restore the high word of ESP for us on executing iret... This is an
 * "official" bug of all the x86-compatible CPUs, which we can work
 * around to make dosemu and wine happy. We do this by preloading the
 * high word of ESP with the high word of the userspace ESP while
 * compensating for the offset by changing to the ESPFIX segment with
 * a base address that matches for the difference.
 */
614
#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
615 616 617 618 619
	mov %esp, %edx			/* load kernel esp */
	mov PT_OLDESP(%esp), %eax	/* load userspace esp */
	mov %dx, %ax			/* eax: new kernel esp */
	sub %eax, %edx			/* offset (low word is 0) */
	shr $16, %edx
620 621
	mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
	mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
S
Stas Sergeev 已提交
622 623
	pushl $__ESPFIX_SS
	CFI_ADJUST_CFA_OFFSET 4
624
	push %eax			/* new kernel esp */
S
Stas Sergeev 已提交
625
	CFI_ADJUST_CFA_OFFSET 4
626 627 628
	/* Disable interrupts, but do not irqtrace this section: we
	 * will soon execute iret and the tracer was already set to
	 * the irqstate after the iret */
629
	DISABLE_INTERRUPTS(CLBR_EAX)
630
	lss (%esp), %esp		/* switch to espfix segment */
S
Stas Sergeev 已提交
631 632
	CFI_ADJUST_CFA_OFFSET -8
	jmp restore_nocheck
633
	CFI_ENDPROC
634
ENDPROC(system_call)
L
Linus Torvalds 已提交
635 636 637

	# perform work that needs to be done immediately before resumption
	ALIGN
638
	RING0_PTREGS_FRAME		# can't unwind into user space anyway
L
Linus Torvalds 已提交
639 640 641 642 643
work_pending:
	testb $_TIF_NEED_RESCHED, %cl
	jz work_notifysig
work_resched:
	call schedule
644
	LOCKDEP_SYS_EXIT
645
	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
L
Linus Torvalds 已提交
646 647
					# setting need_resched or sigpending
					# between sampling and the iret
648
	TRACE_IRQS_OFF
L
Linus Torvalds 已提交
649 650 651 652 653 654 655 656 657
	movl TI_flags(%ebp), %ecx
	andl $_TIF_WORK_MASK, %ecx	# is there any work to be done other
					# than syscall tracing?
	jz restore_all
	testb $_TIF_NEED_RESCHED, %cl
	jnz work_resched

work_notifysig:				# deal with pending signals and
					# notify-resume requests
658
#ifdef CONFIG_VM86
659
	testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
L
Linus Torvalds 已提交
660 661 662 663 664
	movl %esp, %eax
	jne work_notifysig_v86		# returning to kernel-space or
					# vm86-space
	xorl %edx, %edx
	call do_notify_resume
665
	jmp resume_userspace_sig
L
Linus Torvalds 已提交
666 667 668 669

	ALIGN
work_notifysig_v86:
	pushl %ecx			# save ti_flags for do_notify_resume
670
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
671 672
	call save_v86_state		# %eax contains pt_regs pointer
	popl %ecx
673
	CFI_ADJUST_CFA_OFFSET -4
L
Linus Torvalds 已提交
674
	movl %eax, %esp
675 676 677
#else
	movl %esp, %eax
#endif
L
Linus Torvalds 已提交
678 679
	xorl %edx, %edx
	call do_notify_resume
680
	jmp resume_userspace_sig
681
END(work_pending)
L
Linus Torvalds 已提交
682 683 684 685

	# perform syscall exit tracing
	ALIGN
syscall_trace_entry:
686
	movl $-ENOSYS,PT_EAX(%esp)
L
Linus Torvalds 已提交
687
	movl %esp, %eax
688 689
	call syscall_trace_enter
	/* What it returned is what we'll actually use.  */
L
Linus Torvalds 已提交
690 691 692
	cmpl $(nr_syscalls), %eax
	jnae syscall_call
	jmp syscall_exit
693
END(syscall_trace_entry)
L
Linus Torvalds 已提交
694 695 696 697

	# perform syscall exit tracing
	ALIGN
syscall_exit_work:
698
	testl $_TIF_WORK_SYSCALL_EXIT, %ecx
L
Linus Torvalds 已提交
699
	jz work_pending
700
	TRACE_IRQS_ON
701
	ENABLE_INTERRUPTS(CLBR_ANY)	# could let syscall_trace_leave() call
L
Linus Torvalds 已提交
702 703
					# schedule() instead
	movl %esp, %eax
704
	call syscall_trace_leave
L
Linus Torvalds 已提交
705
	jmp resume_userspace
706
END(syscall_exit_work)
707
	CFI_ENDPROC
L
Linus Torvalds 已提交
708

709
	RING0_INT_FRAME			# can't unwind into user space anyway
L
Linus Torvalds 已提交
710 711
syscall_fault:
	GET_THREAD_INFO(%ebp)
712
	movl $-EFAULT,PT_EAX(%esp)
L
Linus Torvalds 已提交
713
	jmp resume_userspace
714
END(syscall_fault)
L
Linus Torvalds 已提交
715 716

syscall_badsys:
717
	movl $-ENOSYS,PT_EAX(%esp)
L
Linus Torvalds 已提交
718
	jmp resume_userspace
719
END(syscall_badsys)
720
	CFI_ENDPROC
721 722 723 724
/*
 * End of kprobes section
 */
	.popsection
L
Linus Torvalds 已提交
725

726 727 728
/*
 * System calls that need a pt_regs pointer.
 */
B
Brian Gerst 已提交
729
#define PTREGSCALL0(name) \
730 731 732 733 734
	ALIGN; \
ptregs_##name: \
	leal 4(%esp),%eax; \
	jmp sys_##name;

B
Brian Gerst 已提交
735 736 737 738
#define PTREGSCALL1(name) \
	ALIGN; \
ptregs_##name: \
	leal 4(%esp),%edx; \
739
	movl (PT_EBX+4)(%esp),%eax; \
B
Brian Gerst 已提交
740 741 742 743 744 745
	jmp sys_##name;

#define PTREGSCALL2(name) \
	ALIGN; \
ptregs_##name: \
	leal 4(%esp),%ecx; \
746 747
	movl (PT_ECX+4)(%esp),%edx; \
	movl (PT_EBX+4)(%esp),%eax; \
B
Brian Gerst 已提交
748 749 750 751 752 753 754 755 756 757 758 759 760 761
	jmp sys_##name;

#define PTREGSCALL3(name) \
	ALIGN; \
ptregs_##name: \
	leal 4(%esp),%eax; \
	pushl %eax; \
	movl PT_EDX(%eax),%ecx; \
	movl PT_ECX(%eax),%edx; \
	movl PT_EBX(%eax),%eax; \
	call sys_##name; \
	addl $4,%esp; \
	ret

B
Brian Gerst 已提交
762
PTREGSCALL1(iopl)
B
Brian Gerst 已提交
763 764
PTREGSCALL0(fork)
PTREGSCALL0(vfork)
B
Brian Gerst 已提交
765
PTREGSCALL3(execve)
B
Brian Gerst 已提交
766
PTREGSCALL2(sigaltstack)
B
Brian Gerst 已提交
767 768
PTREGSCALL0(sigreturn)
PTREGSCALL0(rt_sigreturn)
769 770
PTREGSCALL2(vm86)
PTREGSCALL1(vm86old)
771

B
Brian Gerst 已提交
772 773 774 775 776 777 778 779 780 781 782 783 784
/* Clone is an oddball.  The 4th arg is in %edi */
	ALIGN;
ptregs_clone:
	leal 4(%esp),%eax
	pushl %eax
	pushl PT_EDI(%eax)
	movl PT_EDX(%eax),%ecx
	movl PT_ECX(%eax),%edx
	movl PT_EBX(%eax),%eax
	call sys_clone
	addl $8,%esp
	ret

785
.macro FIXUP_ESPFIX_STACK
786 787 788 789 790 791 792 793
/*
 * Switch back for ESPFIX stack to the normal zerobased stack
 *
 * We can't call C functions using the ESPFIX stack. This code reads
 * the high word of the segment base from the GDT and swiches to the
 * normal stack and adjusts ESP with the matching offset.
 */
	/* fixup the stack */
794 795
	mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
	mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
796 797
	shl $16, %eax
	addl %esp, %eax			/* the adjusted stack pointer */
798 799 800 801
	pushl $__KERNEL_DS
	CFI_ADJUST_CFA_OFFSET 4
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
802
	lss (%esp), %esp		/* switch to the normal stack segment */
803 804 805 806 807 808 809 810 811 812 813 814 815 816
	CFI_ADJUST_CFA_OFFSET -8
.endm
.macro UNWIND_ESPFIX_STACK
	movl %ss, %eax
	/* see if on espfix stack */
	cmpw $__ESPFIX_SS, %ax
	jne 27f
	movl $__KERNEL_DS, %eax
	movl %eax, %ds
	movl %eax, %es
	/* switch to normal stack */
	FIXUP_ESPFIX_STACK
27:
.endm
L
Linus Torvalds 已提交
817 818

/*
819 820 821
 * Build the entry stubs and pointer table with some assembler magic.
 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
 * single cache line on all modern x86 implementations.
L
Linus Torvalds 已提交
822
 */
823
.section .init.rodata,"a"
L
Linus Torvalds 已提交
824 825
ENTRY(interrupt)
.text
826 827
	.p2align 5
	.p2align CONFIG_X86_L1_CACHE_SHIFT
L
Linus Torvalds 已提交
828
ENTRY(irq_entries_start)
829
	RING0_INT_FRAME
830
vector=FIRST_EXTERNAL_VECTOR
831 832 833 834
.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
	.balign 32
  .rept	7
    .if vector < NR_VECTORS
835
      .if vector <> FIRST_EXTERNAL_VECTOR
836
	CFI_ADJUST_CFA_OFFSET -4
837 838
      .endif
1:	pushl $(~vector+0x80)	/* Note: always in signed byte range */
839
	CFI_ADJUST_CFA_OFFSET 4
840
      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
841 842 843
	jmp 2f
      .endif
      .previous
L
Linus Torvalds 已提交
844
	.long 1b
845
      .text
L
Linus Torvalds 已提交
846
vector=vector+1
847 848 849
    .endif
  .endr
2:	jmp common_interrupt
L
Linus Torvalds 已提交
850
.endr
851 852 853 854 855
END(irq_entries_start)

.previous
END(interrupt)
.previous
L
Linus Torvalds 已提交
856

857 858 859 860
/*
 * the CPU automatically disables interrupts when executing an IRQ vector,
 * so IRQ-flags tracing has to follow that:
 */
861
	.p2align CONFIG_X86_L1_CACHE_SHIFT
L
Linus Torvalds 已提交
862
common_interrupt:
863
	addl $-0x80,(%esp)	/* Adjust vector into the [-256,-1] range */
L
Linus Torvalds 已提交
864
	SAVE_ALL
865
	TRACE_IRQS_OFF
L
Linus Torvalds 已提交
866 867 868
	movl %esp,%eax
	call do_IRQ
	jmp ret_from_intr
869
ENDPROC(common_interrupt)
870
	CFI_ENDPROC
L
Linus Torvalds 已提交
871

872 873 874 875
/*
 *  Irq entries should be protected against kprobes
 */
	.pushsection .kprobes.text, "ax"
T
Tejun Heo 已提交
876
#define BUILD_INTERRUPT3(name, nr, fn)	\
L
Linus Torvalds 已提交
877
ENTRY(name)				\
878
	RING0_INT_FRAME;		\
879
	pushl $~(nr);			\
880 881
	CFI_ADJUST_CFA_OFFSET 4;	\
	SAVE_ALL;			\
882
	TRACE_IRQS_OFF			\
L
Linus Torvalds 已提交
883
	movl %esp,%eax;			\
T
Tejun Heo 已提交
884
	call fn;			\
885
	jmp ret_from_intr;		\
886 887
	CFI_ENDPROC;			\
ENDPROC(name)
L
Linus Torvalds 已提交
888

T
Tejun Heo 已提交
889 890
#define BUILD_INTERRUPT(name, nr)	BUILD_INTERRUPT3(name, nr, smp_##name)

L
Linus Torvalds 已提交
891
/* The include is where all of the SMP etc. interrupts come from */
892
#include <asm/entry_arch.h>
L
Linus Torvalds 已提交
893 894

ENTRY(coprocessor_error)
895
	RING0_INT_FRAME
L
Linus Torvalds 已提交
896
	pushl $0
897
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
898
	pushl $do_coprocessor_error
899
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
900
	jmp error_code
901
	CFI_ENDPROC
902
END(coprocessor_error)
L
Linus Torvalds 已提交
903 904

ENTRY(simd_coprocessor_error)
905
	RING0_INT_FRAME
L
Linus Torvalds 已提交
906
	pushl $0
907
	CFI_ADJUST_CFA_OFFSET 4
908 909 910 911 912 913 914 915
#ifdef CONFIG_X86_INVD_BUG
	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
661:	pushl $do_general_protection
662:
.section .altinstructions,"a"
	.balign 4
	.long 661b
	.long 663f
916
	.word X86_FEATURE_XMM
917 918 919 920 921 922 923 924
	.byte 662b-661b
	.byte 664f-663f
.previous
.section .altinstr_replacement,"ax"
663:	pushl $do_simd_coprocessor_error
664:
.previous
#else
L
Linus Torvalds 已提交
925
	pushl $do_simd_coprocessor_error
926
#endif
927
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
928
	jmp error_code
929
	CFI_ENDPROC
930
END(simd_coprocessor_error)
L
Linus Torvalds 已提交
931 932

ENTRY(device_not_available)
933
	RING0_INT_FRAME
L
Linus Torvalds 已提交
934
	pushl $-1			# mark this as an int
935
	CFI_ADJUST_CFA_OFFSET 4
936
	pushl $do_device_not_available
937
	CFI_ADJUST_CFA_OFFSET 4
938
	jmp error_code
939
	CFI_ENDPROC
940
END(device_not_available)
L
Linus Torvalds 已提交
941

942 943
#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
I
Ingo Molnar 已提交
944
	iret
945 946
.section __ex_table,"a"
	.align 4
I
Ingo Molnar 已提交
947
	.long native_iret, iret_exc
948
.previous
949
END(native_iret)
950

951
ENTRY(native_irq_enable_sysexit)
952 953
	sti
	sysexit
954
END(native_irq_enable_sysexit)
955 956
#endif

L
Linus Torvalds 已提交
957
ENTRY(overflow)
958
	RING0_INT_FRAME
L
Linus Torvalds 已提交
959
	pushl $0
960
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
961
	pushl $do_overflow
962
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
963
	jmp error_code
964
	CFI_ENDPROC
965
END(overflow)
L
Linus Torvalds 已提交
966 967

ENTRY(bounds)
968
	RING0_INT_FRAME
L
Linus Torvalds 已提交
969
	pushl $0
970
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
971
	pushl $do_bounds
972
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
973
	jmp error_code
974
	CFI_ENDPROC
975
END(bounds)
L
Linus Torvalds 已提交
976 977

ENTRY(invalid_op)
978
	RING0_INT_FRAME
L
Linus Torvalds 已提交
979
	pushl $0
980
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
981
	pushl $do_invalid_op
982
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
983
	jmp error_code
984
	CFI_ENDPROC
985
END(invalid_op)
L
Linus Torvalds 已提交
986 987

ENTRY(coprocessor_segment_overrun)
988
	RING0_INT_FRAME
L
Linus Torvalds 已提交
989
	pushl $0
990
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
991
	pushl $do_coprocessor_segment_overrun
992
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
993
	jmp error_code
994
	CFI_ENDPROC
995
END(coprocessor_segment_overrun)
L
Linus Torvalds 已提交
996 997

ENTRY(invalid_TSS)
998
	RING0_EC_FRAME
L
Linus Torvalds 已提交
999
	pushl $do_invalid_TSS
1000
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
1001
	jmp error_code
1002
	CFI_ENDPROC
1003
END(invalid_TSS)
L
Linus Torvalds 已提交
1004 1005

ENTRY(segment_not_present)
1006
	RING0_EC_FRAME
L
Linus Torvalds 已提交
1007
	pushl $do_segment_not_present
1008
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
1009
	jmp error_code
1010
	CFI_ENDPROC
1011
END(segment_not_present)
L
Linus Torvalds 已提交
1012 1013

ENTRY(stack_segment)
1014
	RING0_EC_FRAME
L
Linus Torvalds 已提交
1015
	pushl $do_stack_segment
1016
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
1017
	jmp error_code
1018
	CFI_ENDPROC
1019
END(stack_segment)
L
Linus Torvalds 已提交
1020 1021

ENTRY(alignment_check)
1022
	RING0_EC_FRAME
L
Linus Torvalds 已提交
1023
	pushl $do_alignment_check
1024
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
1025
	jmp error_code
1026
	CFI_ENDPROC
1027
END(alignment_check)
L
Linus Torvalds 已提交
1028

1029 1030 1031 1032 1033
ENTRY(divide_error)
	RING0_INT_FRAME
	pushl $0			# no error code
	CFI_ADJUST_CFA_OFFSET 4
	pushl $do_divide_error
1034
	CFI_ADJUST_CFA_OFFSET 4
L
Linus Torvalds 已提交
1035
	jmp error_code
1036
	CFI_ENDPROC
1037
END(divide_error)
L
Linus Torvalds 已提交
1038 1039 1040

#ifdef CONFIG_X86_MCE
/*
 * Machine check: no CPU error code, so push a zero placeholder.
 * Note the handler is pushed *indirectly* from the machine_check_vector
 * variable (no '$'), so the installed vector can be switched at runtime.
 */
ENTRY(machine_check)
	RING0_INT_FRAME
	pushl $0			# fake error code
	CFI_ADJUST_CFA_OFFSET 4
	pushl machine_check_vector	# push *contents* of the vector variable
	CFI_ADJUST_CFA_OFFSET 4
	jmp error_code			# common register-save/dispatch path
	CFI_ENDPROC
END(machine_check)
#endif

/*
 * Spurious interrupt bug: no CPU error code, so push a zero
 * placeholder before the handler address.
 */
ENTRY(spurious_interrupt_bug)
	RING0_INT_FRAME
	pushl $0			# fake error code
	CFI_ADJUST_CFA_OFFSET 4
	pushl $do_spurious_interrupt_bug	# C handler to invoke
	CFI_ADJUST_CFA_OFFSET 4
	jmp error_code			# common register-save/dispatch path
	CFI_ENDPROC
END(spurious_interrupt_bug)
/*
 * End of kprobes section
 */
	.popsection

/*
 * Trampoline for newly created kernel threads: call the thread
 * function held in %esi with the argument from %edi, and terminate
 * the task via do_exit() if the function ever returns.
 * NOTE(review): the %edi/%esi register contract is set up by the
 * thread-creation code elsewhere — confirm against copy_thread().
 */
ENTRY(kernel_thread_helper)
	pushl $0		# fake return address for unwinder
	CFI_STARTPROC
	movl %edi,%eax		# thread argument
	call *%esi		# run the thread function
	call do_exit		# function returned: kill the thread
	ud2			# padding for call trace
	CFI_ENDPROC
ENDPROC(kernel_thread_helper)
#ifdef CONFIG_XEN
/* Xen doesn't set %esp to be precisely what the normal sysenter
   entrypoint expects, so fix it up before using the normal path. */
ENTRY(xen_sysenter_target)
	RING0_INT_FRAME
	addl $5*4, %esp		/* remove xen-provided frame */
	CFI_ADJUST_CFA_OFFSET -5*4
	jmp sysenter_past_esp
	CFI_ENDPROC

/*
 * Event-channel upcall from the hypervisor: build a pt_regs frame
 * with a zero (fake) error code and hand it to xen_evtchn_do_upcall().
 */
ENTRY(xen_hypervisor_callback)
	CFI_STARTPROC
	pushl $0			# fake error code slot
	CFI_ADJUST_CFA_OFFSET 4
	SAVE_ALL
	TRACE_IRQS_OFF

	/* Check to see if we got the event in the critical
	   region in xen_iret_direct, after we've reenabled
	   events and checked for pending events.  This simulates
	   iret instruction's behaviour where it delivers a
	   pending interrupt when enabling interrupts. */
	movl PT_EIP(%esp),%eax
	cmpl $xen_iret_start_crit,%eax
	jb   1f
	cmpl $xen_iret_end_crit,%eax
	jae  1f

	jmp  xen_iret_crit_fixup

ENTRY(xen_do_upcall)
1:	mov %esp, %eax			# pt_regs pointer for the C handler
	call xen_evtchn_do_upcall
	jmp  ret_from_intr
	CFI_ENDPROC
ENDPROC(xen_hypervisor_callback)

# Hypervisor uses this for application faults while it executes.
# We get here for two reasons:
#  1. Fault while reloading DS, ES, FS or GS
#  2. Fault while executing IRET
# Category 1 we fix up by reattempting the load, and zeroing the segment
# register if the load fails.
# Category 2 we fix up by jumping to do_iret_error. We cannot use the
# normal Linux return path in this case because if we use the IRET hypercall
# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
# We distinguish between categories by maintaining a status value in EAX.
ENTRY(xen_failsafe_callback)
	CFI_STARTPROC
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	movl $1,%eax			# EAX = 1: assume Category 2 (bad IRET)
1:	mov 4(%esp),%ds
2:	mov 8(%esp),%es
3:	mov 12(%esp),%fs
4:	mov 16(%esp),%gs
	testl %eax,%eax
	popl %eax
	CFI_ADJUST_CFA_OFFSET -4
	lea 16(%esp),%esp		# drop the four saved segment slots
	CFI_ADJUST_CFA_OFFSET -16
	jz 5f
	addl $16,%esp
	jmp iret_exc		# EAX != 0 => Category 2 (Bad IRET)
5:	pushl $0		# EAX == 0 => Category 1 (Bad segment)
	CFI_ADJUST_CFA_OFFSET 4
	SAVE_ALL
	jmp ret_from_exception
	CFI_ENDPROC

/* Fixups: a faulting segment reload zeroes EAX (-> Category 1) and the
   corresponding stack slot, then retries the load with the NUL selector. */
.section .fixup,"ax"
6:	xorl %eax,%eax
	movl %eax,4(%esp)
	jmp 1b
7:	xorl %eax,%eax
	movl %eax,8(%esp)
	jmp 2b
8:	xorl %eax,%eax
	movl %eax,12(%esp)
	jmp 3b
9:	xorl %eax,%eax
	movl %eax,16(%esp)
	jmp 4b
.previous
.section __ex_table,"a"
	.align 4
	.long 1b,6b
	.long 2b,7b
	.long 3b,8b
	.long 4b,9b
.previous
ENDPROC(xen_failsafe_callback)

BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
		xen_evtchn_do_upcall)

#endif	/* CONFIG_XEN */

#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE

/* With dynamic ftrace, mcount is a pure no-op; call sites are patched
   at runtime to call ftrace_caller instead. */
ENTRY(mcount)
	ret
END(mcount)

ENTRY(ftrace_caller)
	cmpl $0, function_trace_stop
	jne  ftrace_stub		# tracing disabled: bail out

	pushl %eax			# save scratch regs around the C call
	pushl %ecx
	pushl %edx
	movl 0xc(%esp), %eax		# ip of the instrumented function
	movl 0x4(%ebp), %edx		# parent (caller) return address
	subl $MCOUNT_INSN_SIZE, %eax	# point at the call site itself

.globl ftrace_call
ftrace_call:
	call ftrace_stub		# patched at runtime to the tracer

	popl %edx
	popl %ecx
	popl %eax
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.globl ftrace_graph_call
ftrace_graph_call:
	jmp ftrace_stub			# patched to ftrace_graph_caller
#endif

.globl ftrace_stub
ftrace_stub:
	ret
END(ftrace_caller)

#else /* ! CONFIG_DYNAMIC_FTRACE */

ENTRY(mcount)
	cmpl $0, function_trace_stop
	jne  ftrace_stub		# tracing disabled: bail out

	cmpl $ftrace_stub, ftrace_trace_function
	jnz trace			# a real tracer is installed
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	cmpl $ftrace_stub, ftrace_graph_return
	jnz ftrace_graph_caller

	cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
	jnz ftrace_graph_caller
#endif
.globl ftrace_stub
ftrace_stub:
	ret

	/* taken from glibc */
trace:
	pushl %eax			# save scratch regs around the C call
	pushl %ecx
	pushl %edx
	movl 0xc(%esp), %eax		# ip of the instrumented function
	movl 0x4(%ebp), %edx		# parent (caller) return address
	subl $MCOUNT_INSN_SIZE, %eax	# point at the call site itself

	call *ftrace_trace_function

	popl %edx
	popl %ecx
	popl %eax
	jmp ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
	cmpl $0, function_trace_stop
	jne ftrace_stub			# tracing disabled: bail out

	pushl %eax			# save scratch regs around the C call
	pushl %ecx
	pushl %edx
	movl 0xc(%esp), %edx		# ip of the instrumented function
	lea 0x4(%ebp), %eax		# &parent return address (to be hooked)
	movl (%ebp), %ecx		# frame pointer of the caller
	subl $MCOUNT_INSN_SIZE, %edx	# point at the call site itself
	call prepare_ftrace_return
	popl %edx
	popl %ecx
	popl %eax
	ret
END(ftrace_graph_caller)

/* Return trampoline: installed in place of the real return address by
   prepare_ftrace_return(); fetches the saved address and jumps to it. */
.globl return_to_handler
return_to_handler:
	pushl %eax			# preserve the function's return value
	pushl %edx
	movl %ebp, %eax			# frame pointer for the C helper
	call ftrace_return_to_handler
	movl %eax, %ecx			# real return address
	popl %edx
	popl %eax
	jmp *%ecx
#endif

/* The system-call table lives in read-only data. */
.section .rodata,"a"
#include "syscall_table_32.S"

syscall_table_size=(.-sys_call_table)

/*
 * Some functions should be protected against kprobes
 */
	.pushsection .kprobes.text, "ax"

/*
 * Page fault entry: pushes the handler address on top of the
 * CPU-supplied error code, then falls into the shared error_code
 * path, which builds a full pt_regs frame and dispatches.
 */
ENTRY(page_fault)
	RING0_EC_FRAME
	pushl $do_page_fault
	CFI_ADJUST_CFA_OFFSET 4
	ALIGN
error_code:
	/* the function address is in %gs's slot on the stack */
	pushl %fs
	CFI_ADJUST_CFA_OFFSET 4
	/*CFI_REL_OFFSET fs, 0*/
	pushl %es
	CFI_ADJUST_CFA_OFFSET 4
	/*CFI_REL_OFFSET es, 0*/
	pushl %ds
	CFI_ADJUST_CFA_OFFSET 4
	/*CFI_REL_OFFSET ds, 0*/
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET eax, 0
	pushl %ebp
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET ebp, 0
	pushl %edi
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET edi, 0
	pushl %esi
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET esi, 0
	pushl %edx
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET edx, 0
	pushl %ecx
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET ecx, 0
	pushl %ebx
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET ebx, 0
	cld
	movl $(__KERNEL_PERCPU), %ecx	# kernel per-cpu segment into %fs
	movl %ecx, %fs
	UNWIND_ESPFIX_STACK
	GS_TO_REG %ecx
	movl PT_GS(%esp), %edi		# get the function address
	movl PT_ORIG_EAX(%esp), %edx	# get the error code
	movl $-1, PT_ORIG_EAX(%esp)	# no syscall to restart
	REG_TO_PTGS %ecx
	SET_KERNEL_GS %ecx
	movl $(__USER_DS), %ecx		# kernel-mode data segments
	movl %ecx, %ds
	movl %ecx, %es
	TRACE_IRQS_OFF
	movl %esp,%eax			# pt_regs pointer
	call *%edi			# invoke the exception's C handler
	jmp ret_from_exception
	CFI_ENDPROC
END(page_fault)

/*
 * Debug traps and NMI can happen at the one SYSENTER instruction
 * that sets up the real kernel stack. Check here, since we can't
 * allow the wrong stack to be used.
 *
 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
 * already pushed 3 words if it hits on the sysenter instruction:
 * eflags, cs and eip.
 *
 * We just load the right stack, and push the three (known) values
 * by hand onto the new stack - while updating the return eip past
 * the instruction that would have done it for sysenter.
 */
.macro FIX_STACK offset ok label
	cmpw $__KERNEL_CS, 4(%esp)
	jne \ok				# fault was not in kernel: stack is fine
\label:
	movl TSS_sysenter_sp0 + \offset(%esp), %esp
	CFI_DEF_CFA esp, 0
	CFI_UNDEFINED eip
	pushfl				# rebuild eflags/cs/eip by hand
	CFI_ADJUST_CFA_OFFSET 4
	pushl $__KERNEL_CS
	CFI_ADJUST_CFA_OFFSET 4
	pushl $sysenter_past_esp	# resume past the sysenter stack setup
	CFI_ADJUST_CFA_OFFSET 4
	CFI_REL_OFFSET eip, 0
.endm
1373 1374 1375 1376 1377

/*
 * Debug trap (#DB).  If it hit on the sysenter instruction itself the
 * kernel stack is not yet set up, so repair it first via FIX_STACK.
 */
ENTRY(debug)
	RING0_INT_FRAME
	cmpl $ia32_sysenter_target,(%esp)
	jne debug_stack_correct
	FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
debug_stack_correct:
	pushl $-1			# mark this as an int
	CFI_ADJUST_CFA_OFFSET 4
	SAVE_ALL
	TRACE_IRQS_OFF
	xorl %edx,%edx			# error code 0
	movl %esp,%eax			# pt_regs pointer
	call do_debug
	jmp ret_from_exception
	CFI_ENDPROC
END(debug)

/*
 * NMI is doubly nasty. It can happen _while_ we're handling
 * a debug fault, and the debug fault hasn't yet been able to
 * clear up the stack. So we first check whether we got  an
 * NMI on the sysenter entry path, but after that we need to
 * check whether we got an NMI on the debug path where the debug
 * fault happened on the sysenter path.
 */
ENTRY(nmi)
	RING0_INT_FRAME
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	movl %ss, %eax
	cmpw $__ESPFIX_SS, %ax		# running on the espfix stack?
	popl %eax
	CFI_ADJUST_CFA_OFFSET -4
	je nmi_espfix_stack
	cmpl $ia32_sysenter_target,(%esp)
	je nmi_stack_fixup
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	movl %esp,%eax
	/* Do not access memory above the end of our stack page,
	 * it might not exist.
	 */
	andl $(THREAD_SIZE-1),%eax
	cmpl $(THREAD_SIZE-20),%eax
	popl %eax
	CFI_ADJUST_CFA_OFFSET -4
	jae nmi_stack_correct
	cmpl $ia32_sysenter_target,12(%esp)
	je nmi_debug_stack_check
nmi_stack_correct:
	/* We have a RING0_INT_FRAME here */
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	SAVE_ALL
	xorl %edx,%edx		# zero error code
	movl %esp,%eax		# pt_regs pointer
	call do_nmi
	jmp restore_all_notrace	# NMI return must not trace/schedule
	CFI_ENDPROC

nmi_stack_fixup:
	RING0_INT_FRAME
	FIX_STACK 12, nmi_stack_correct, 1
	jmp nmi_stack_correct

nmi_debug_stack_check:
	/* We have a RING0_INT_FRAME here */
	cmpw $__KERNEL_CS,16(%esp)
	jne nmi_stack_correct
	cmpl $debug,(%esp)
	jb nmi_stack_correct
	cmpl $debug_esp_fix_insn,(%esp)
	ja nmi_stack_correct
	FIX_STACK 24, nmi_stack_correct, 1
	jmp nmi_stack_correct

nmi_espfix_stack:
	/* We have a RING0_INT_FRAME here.
	 *
	 * create the pointer to lss back
	 */
	pushl %ss
	CFI_ADJUST_CFA_OFFSET 4
	pushl %esp
	CFI_ADJUST_CFA_OFFSET 4
	addl $4, (%esp)
	/* copy the iret frame of 12 bytes */
	.rept 3
	pushl 16(%esp)
	CFI_ADJUST_CFA_OFFSET 4
	.endr
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	SAVE_ALL
	FIXUP_ESPFIX_STACK		# %eax == %esp
	xorl %edx,%edx			# zero error code
	call do_nmi
	RESTORE_REGS
	lss 12+4(%esp), %esp		# back to espfix stack
	CFI_ADJUST_CFA_OFFSET -24
	jmp irq_return
	CFI_ENDPROC
END(nmi)

/*
 * Breakpoint trap (int3): no CPU error code; -1 marks the frame as a
 * software interrupt rather than a syscall.
 */
ENTRY(int3)
	RING0_INT_FRAME
	pushl $-1			# mark this as an int
	CFI_ADJUST_CFA_OFFSET 4
	SAVE_ALL
	TRACE_IRQS_OFF
	xorl %edx,%edx		# zero error code
	movl %esp,%eax		# pt_regs pointer
	call do_int3
	jmp ret_from_exception
	CFI_ENDPROC
END(int3)

/*
 * General protection fault: CPU pushes an error code (RING0_EC_FRAME);
 * push only the handler address and take the common error_code path.
 */
ENTRY(general_protection)
	RING0_EC_FRAME
	pushl $do_general_protection
	CFI_ADJUST_CFA_OFFSET 4
	jmp error_code
	CFI_ENDPROC
END(general_protection)

/*
 * End of kprobes section
 */
	.popsection