entry_64.S 35.8 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
/*
 *  linux/arch/x86_64/entry.S
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 */

/*
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * NOTE: This code handles signal-recognition, which happens every time
 * after an interrupt and after each system call.
14 15
 *
 * Normal syscalls and interrupts don't save a full stack frame, this is
L
Linus Torvalds 已提交
16
 * only done for syscall tracing, signals or fork/exec et.al.
17 18 19 20
 *
 * A note on terminology:
 * - top of stack: Architecture defined interrupt frame from SS to RIP
 * at the top of the kernel process stack.
L
Linus Torvalds 已提交
21
 * - partial stack frame: partially saved registers up to R11.
22
 * - full stack frame: Like partial stack frame, but all registers saved.
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
 *
 * Some macro usage:
 * - CFI macros are used to generate dwarf2 unwind information for better
 * backtraces. They don't change any code.
 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
 * There are unfortunately lots of special cases where some registers are
 * not touched. The macro is a big mess that should be cleaned up.
 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
 * Gives a full stack frame.
 * - ENTRY/END Define functions in the symbol table.
 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
 * frame that is otherwise undefined after a SYSCALL
 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
L
Linus Torvalds 已提交
38 39 40 41 42 43 44 45
 */

#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/dwarf2.h>
#include <asm/calling.h>
46
#include <asm/asm-offsets.h>
L
Linus Torvalds 已提交
47 48 49 50
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
51
#include <asm/page.h>
52
#include <asm/irqflags.h>
53
#include <asm/paravirt.h>
54
#include <asm/ftrace.h>
L
Linus Torvalds 已提交
55

R
Roland McGrath 已提交
56 57 58 59 60 61
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
#include <linux/elf-em.h>
#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_64BIT 0x80000000
#define __AUDIT_ARCH_LE	   0x40000000

L
Linus Torvalds 已提交
62 63
	.code64

64
#ifdef CONFIG_FUNCTION_TRACER
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
#ifdef CONFIG_DYNAMIC_FTRACE
/*
 * mcount (CONFIG_DYNAMIC_FTRACE variant): does no work of its own and
 * returns straight to its caller.
 */
ENTRY(mcount)
	retq
END(mcount)

/*
 * ftrace_caller - tracing trampoline (CONFIG_DYNAMIC_FTRACE variant).
 *
 * Saves %rax, %rcx, %rdx, %rsi, %rdi, %r8 and %r9 in a 0x38-byte area on
 * the stack, sets up two arguments, calls through the ftrace_call site
 * (which initially targets ftrace_stub), restores the saved registers and
 * returns through ftrace_stub.
 *
 * Arguments passed to the callee:
 *   %rdi = quadword that was on top of the stack on entry, minus
 *          MCOUNT_INSN_SIZE
 *   %rsi = quadword at 8(%rbp)
 *          NOTE(review): presumably the return address of the traced
 *          function's caller, which assumes frame pointers - confirm.
 */
ENTRY(ftrace_caller)

	/* taken from glibc */
	subq $0x38, %rsp
	movq %rax, (%rsp)
	movq %rcx, 8(%rsp)
	movq %rdx, 16(%rsp)
	movq %rsi, 24(%rsp)
	movq %rdi, 32(%rsp)
	movq %r8, 40(%rsp)
	movq %r9, 48(%rsp)

	/* 0x38(%rsp) is the slot just above the save area */
	movq 0x38(%rsp), %rdi
	movq 8(%rbp), %rsi
	subq $MCOUNT_INSN_SIZE, %rdi

.globl ftrace_call
ftrace_call:
	call ftrace_stub

	/* restore in the reverse order of the saves above */
	movq 48(%rsp), %r9
	movq 40(%rsp), %r8
	movq 32(%rsp), %rdi
	movq 24(%rsp), %rsi
	movq 16(%rsp), %rdx
	movq 8(%rsp), %rcx
	movq (%rsp), %rax
	addq $0x38, %rsp

.globl ftrace_stub
ftrace_stub:
	retq
END(ftrace_caller)

#else /* ! CONFIG_DYNAMIC_FTRACE */
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
/*
 * mcount (non-dynamic ftrace variant).
 *
 * Fast path: if ftrace_trace_function still equals ftrace_stub, return
 * immediately.  Otherwise save %rax, %rcx, %rdx, %rsi, %rdi, %r8 and %r9
 * in a 0x38-byte stack area, call *ftrace_trace_function and restore the
 * registers before returning through ftrace_stub.
 *
 * Arguments passed to the tracer:
 *   %rdi = quadword that was on top of the stack on entry, minus
 *          MCOUNT_INSN_SIZE
 *   %rsi = quadword at 8(%rbp)
 *          NOTE(review): presumably the return address of the traced
 *          function's caller, which assumes frame pointers - confirm.
 */
ENTRY(mcount)
	cmpq $ftrace_stub, ftrace_trace_function
	jnz trace
.globl ftrace_stub
ftrace_stub:
	retq

trace:
	/* taken from glibc */
	subq $0x38, %rsp
	movq %rax, (%rsp)
	movq %rcx, 8(%rsp)
	movq %rdx, 16(%rsp)
	movq %rsi, 24(%rsp)
	movq %rdi, 32(%rsp)
	movq %r8, 40(%rsp)
	movq %r9, 48(%rsp)

	/* 0x38(%rsp) is the slot just above the save area */
	movq 0x38(%rsp), %rdi
	movq 8(%rbp), %rsi
	subq $MCOUNT_INSN_SIZE, %rdi

	call   *ftrace_trace_function

	/* restore in the reverse order of the saves above */
	movq 48(%rsp), %r9
	movq 40(%rsp), %r8
	movq 32(%rsp), %rdi
	movq 24(%rsp), %rsi
	movq 16(%rsp), %rdx
	movq 8(%rsp), %rcx
	movq (%rsp), %rax
	addq $0x38, %rsp

	jmp ftrace_stub
END(mcount)
140
#endif /* CONFIG_DYNAMIC_FTRACE */
141
#endif /* CONFIG_FUNCTION_TRACER */
142

143
#ifndef CONFIG_PREEMPT
L
Linus Torvalds 已提交
144
#define retint_kernel retint_restore_args
145
#endif
146

147
#ifdef CONFIG_PARAVIRT
/*
 * native_usergs_sysret64 - swap the GS base with swapgs, then leave via
 * sysretq.  Only built when CONFIG_PARAVIRT is set.
 */
ENTRY(native_usergs_sysret64)
	swapgs
	sysretq
#endif /* CONFIG_PARAVIRT */

153 154 155 156 157 158 159 160 161 162

/*
 * TRACE_IRQS_IRETQ offset=ARGOFFSET
 *
 * With CONFIG_TRACE_IRQFLAGS: test bit 9 of the EFLAGS image saved at
 * EFLAGS-\offset(%rsp) and invoke TRACE_IRQS_ON only when that bit is set.
 * Without CONFIG_TRACE_IRQFLAGS the macro expands to nothing.
 *
 * \offset: subtracted from the EFLAGS frame offset, for callers whose
 *          %rsp does not point at the base of the full frame.
 */
.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
#ifdef CONFIG_TRACE_IRQFLAGS
	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
	jnc  1f				/* bit clear: skip the trace call */
	TRACE_IRQS_ON
1:
#endif
.endm

L
Linus Torvalds 已提交
163
/*
164 165
 * C code is not supposed to know about undefined top of stack. Every time
 * a C function with a pt_regs argument is called from the SYSCALL based
L
Linus Torvalds 已提交
166 167 168
 * fast path FIXUP_TOP_OF_STACK is needed.
 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
 * manipulation.
169 170 171
 */

	/* %rsp:at FRAMEEND */
L
Linus Torvalds 已提交
172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
	/*
	 * FIXUP_TOP_OF_STACK tmp
	 *
	 * Fill in the top-of-stack slots of the frame at %rsp:
	 *   RSP    <- %gs:pda_oldrsp
	 *   SS     <- __USER_DS
	 *   CS     <- __USER_CS
	 *   RCX    <- -1
	 *   EFLAGS <- copy of the R11 slot
	 * \tmp is a scratch register and is clobbered.
	 */
	.macro FIXUP_TOP_OF_STACK tmp
	movq	%gs:pda_oldrsp,\tmp
	movq  	\tmp,RSP(%rsp)
	movq    $__USER_DS,SS(%rsp)
	movq    $__USER_CS,CS(%rsp)
	movq 	$-1,RCX(%rsp)
	movq	R11(%rsp),\tmp  /* get eflags */
	movq	\tmp,EFLAGS(%rsp)
	.endm

	/*
	 * RESTORE_TOP_OF_STACK tmp,offset=0
	 *
	 * Copy state from the frame back to where the fast path reads it:
	 *   %gs:pda_oldrsp <- RSP slot
	 *   R11 slot       <- EFLAGS slot
	 * \offset is subtracted from every frame offset, for callers whose
	 * %rsp does not point at the base of the full frame.
	 * \tmp is a scratch register and is clobbered.
	 */
	.macro RESTORE_TOP_OF_STACK tmp,offset=0
	movq   RSP-\offset(%rsp),\tmp
	movq   \tmp,%gs:pda_oldrsp
	movq   EFLAGS-\offset(%rsp),\tmp
	movq   \tmp,R11-\offset(%rsp)
	.endm

	/*
	 * FAKE_STACK_FRAME child_rip
	 *
	 * Push six quadwords that mimic a hardware interrupt frame followed
	 * by an orig-rax slot: ss, rsp, eflags, cs, rip, orig rax.
	 * The rsp and orig-rax slots are zero; eflags has only bit 9 set.
	 * \child_rip is pushed as the rip slot.
	 * Clobbers %rax (zeroed).  Undone by UNFAKE_STACK_FRAME.
	 */
	.macro FAKE_STACK_FRAME child_rip
	/* push in order ss, rsp, eflags, cs, rip */
	xorl %eax, %eax
	pushq $__KERNEL_DS /* ss */
	CFI_ADJUST_CFA_OFFSET	8
	/*CFI_REL_OFFSET	ss,0*/
	pushq %rax /* rsp */
	CFI_ADJUST_CFA_OFFSET	8
	CFI_REL_OFFSET	rsp,0
	pushq $(1<<9) /* eflags - interrupts on */
	CFI_ADJUST_CFA_OFFSET	8
	/*CFI_REL_OFFSET	rflags,0*/
	pushq $__KERNEL_CS /* cs */
	CFI_ADJUST_CFA_OFFSET	8
	/*CFI_REL_OFFSET	cs,0*/
	pushq \child_rip /* rip */
	CFI_ADJUST_CFA_OFFSET	8
	CFI_REL_OFFSET	rip,0
	pushq	%rax /* orig rax */
	CFI_ADJUST_CFA_OFFSET	8
	.endm

	/*
	 * UNFAKE_STACK_FRAME - drop six quadwords from the stack, matching
	 * the six pushes made by FAKE_STACK_FRAME, and adjust the CFA
	 * offset to match.
	 */
	.macro UNFAKE_STACK_FRAME
	addq $8*6, %rsp
	CFI_ADJUST_CFA_OFFSET	-(6*8)
	.endm

216 217 218
	/*
	 * CFI_DEFAULT_STACK start=1
	 *
	 * Emit unwind annotations describing a full register frame at %rsp.
	 * With \start non-zero a new CFI procedure (simple, signal frame) is
	 * opened and the CFA is defined as %rsp + SS+8; with \start zero only
	 * the CFA offset is reset to SS+8.  Every general register plus rip
	 * and rsp is then described at its frame offset.  The cs, rflags and
	 * ss annotations are left commented out.
	 * Generates no instructions.
	 */
	.macro	CFI_DEFAULT_STACK start=1
	.if \start
	CFI_STARTPROC	simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA	rsp,SS+8
	.else
	CFI_DEF_CFA_OFFSET SS+8
	.endif
	CFI_REL_OFFSET	r15,R15
	CFI_REL_OFFSET	r14,R14
	CFI_REL_OFFSET	r13,R13
	CFI_REL_OFFSET	r12,R12
	CFI_REL_OFFSET	rbp,RBP
	CFI_REL_OFFSET	rbx,RBX
	CFI_REL_OFFSET	r11,R11
	CFI_REL_OFFSET	r10,R10
	CFI_REL_OFFSET	r9,R9
	CFI_REL_OFFSET	r8,R8
	CFI_REL_OFFSET	rax,RAX
	CFI_REL_OFFSET	rcx,RCX
	CFI_REL_OFFSET	rdx,RDX
	CFI_REL_OFFSET	rsi,RSI
	CFI_REL_OFFSET	rdi,RDI
	CFI_REL_OFFSET	rip,RIP
	/*CFI_REL_OFFSET	cs,CS*/
	/*CFI_REL_OFFSET	rflags,EFLAGS*/
	CFI_REL_OFFSET	rsp,RSP
	/*CFI_REL_OFFSET	ss,SS*/
	.endm
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316

/*
 * initial frame state for interrupts and exceptions
 */
	.macro _frame ref
	/*
	 * \ref is the frame offset that %rsp currently points at; every
	 * offset below is expressed relative to it.  Opens a simple
	 * signal-frame CFI procedure, defines the CFA as %rsp + SS+8-\ref
	 * and records where rsp and rip are saved.  The ss, rflags and cs
	 * annotations are left commented out.
	 */
	CFI_STARTPROC simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA rsp,SS+8-\ref
	/*CFI_REL_OFFSET ss,SS-\ref*/
	CFI_REL_OFFSET rsp,RSP-\ref
	/*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
	/*CFI_REL_OFFSET cs,CS-\ref*/
	CFI_REL_OFFSET rip,RIP-\ref
	.endm

/*
 * initial frame state for interrupts (and exceptions without error code)
 */
#define INTR_FRAME _frame RIP
/*
 * initial frame state for exceptions with error code (and interrupts
 * with vector already pushed)
 */
#define XCPT_FRAME _frame ORIG_RAX

/* save partial stack frame */
/*
 * save_args - store the argument/scratch registers into the area reserved
 * by the caller and get ready to run an interrupt handler.
 *
 * On entry (%rsp) holds this routine's return address.  The slot at
 * 8(%rsp) receives the old %rbp and the nine register slots (r11, r10,
 * r9, r8, rax, rcx, rdx, rsi, rdi) start at 16(%rsp); the interrupt macro
 * reserves 10*8 bytes for them before calling here.
 *
 * On return:
 *   %rdi = entry %rsp + 16 - ARGOFFSET (arg1 for the handler)
 *   %rbp = address of the saved-%rbp slot
 *   %rsp = switched to %gs:pda_irqstackptr when the incremented
 *          pda_irqcount is zero, otherwise unchanged
 * %rax is clobbered when the stack is switched.  Direction flag cleared.
 */
ENTRY(save_args)
	XCPT_FRAME
	cld
	movq  %rdi, 8*8+16(%rsp)
	CFI_REL_OFFSET rdi, 8*8+16
	movq  %rsi, 7*8+16(%rsp)
	CFI_REL_OFFSET rsi, 7*8+16
	movq  %rdx, 6*8+16(%rsp)
	CFI_REL_OFFSET rdx, 6*8+16
	movq  %rcx, 5*8+16(%rsp)
	CFI_REL_OFFSET rcx, 5*8+16
	movq  %rax, 4*8+16(%rsp)
	CFI_REL_OFFSET rax, 4*8+16
	movq  %r8, 3*8+16(%rsp)
	CFI_REL_OFFSET r8, 3*8+16
	movq  %r9, 2*8+16(%rsp)
	CFI_REL_OFFSET r9, 2*8+16
	movq  %r10, 1*8+16(%rsp)
	CFI_REL_OFFSET r10, 1*8+16
	movq  %r11, 0*8+16(%rsp)
	CFI_REL_OFFSET r11, 0*8+16
	leaq -ARGOFFSET+16(%rsp),%rdi	/* arg1 for handler */
	movq %rbp, 8(%rsp)		/* push %rbp */
	leaq 8(%rsp), %rbp		/* mov %rsp, %ebp */
	/* low two bits of the saved CS clear: skip the SWAPGS */
	testl $3, CS(%rdi)
	je 1f
	SWAPGS
	/*
	 * irqcount is used to check if a CPU is already on an interrupt stack
	 * or not. While this is essentially redundant with preempt_count it is
	 * a little cheaper to use a separate counter in the PDA (short of
	 * moving irq_enter into assembly, which would be too much work)
	 */
1:	incl %gs:pda_irqcount
	/*
	 * Non-zero result: stay on the current stack.
	 * NOTE(review): presumably pda_irqcount idles at -1 so that only
	 * the outermost entry yields zero - confirm against PDA setup.
	 */
	jne 2f
	pop %rax			/* move return address... */
	mov %gs:pda_irqstackptr,%rsp
	push %rax			/* ... to the new stack */
	/*
	 * We entered an interrupt context - irqs are off:
	 */
2:	TRACE_IRQS_OFF
	ret
	CFI_ENDPROC
END(save_args)

L
Linus Torvalds 已提交
317 318
/*
 * A newly forked process directly context switches into this.
319 320
 */
/* rdi:	prev */
L
Linus Torvalds 已提交
321 322
/*
 * ret_from_fork
 *
 * Reloads the flags register from kernel_eflags, calls schedule_tail, and
 * then picks a return path:
 *   - _TIF_SYSCALL_TRACE or _TIF_SYSCALL_AUDIT set: call
 *     syscall_trace_leave with %rsp as its argument first (rff_trace);
 *   - low two bits of the saved CS clear (kernel_thread case): leave via
 *     int_ret_from_sys_call;
 *   - _TIF_IA32 set: leave via int_ret_from_sys_call;
 *   - otherwise: RESTORE_TOP_OF_STACK and leave via ret_from_sys_call.
 * %rcx holds the thread_info pointer while the flags are tested.
 */
ENTRY(ret_from_fork)
	CFI_DEFAULT_STACK
	push kernel_eflags(%rip)
	CFI_ADJUST_CFA_OFFSET 8
	popf				# reset kernel eflags
	CFI_ADJUST_CFA_OFFSET -8
	call schedule_tail
	GET_THREAD_INFO(%rcx)
	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
	jnz rff_trace
rff_action:
	RESTORE_REST
	testl $3,CS-ARGOFFSET(%rsp)	# from kernel_thread?
	je   int_ret_from_sys_call
	testl $_TIF_IA32,TI_flags(%rcx)
	jnz  int_ret_from_sys_call
	RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
	jmp ret_from_sys_call
rff_trace:
	movq %rsp,%rdi
	call syscall_trace_leave
	/* reload thread_info into %rcx: rff_action tests TI_flags(%rcx) */
	GET_THREAD_INFO(%rcx)
	jmp rff_action
	CFI_ENDPROC
END(ret_from_fork)
L
Linus Torvalds 已提交
346 347 348 349 350 351 352

/*
 * System call entry. Up to 6 arguments in registers are supported.
 *
 * SYSCALL does not save anything on the stack and does not change the
 * stack pointer.
 */
353

L
Linus Torvalds 已提交
354
/*
355
 * Register setup:
L
Linus Torvalds 已提交
356 357
 * rax  system call number
 * rdi  arg0
358
 * rcx  return address for syscall/sysret, C arg3
L
Linus Torvalds 已提交
359
 * rsi  arg1
360
 * rdx  arg2
L
Linus Torvalds 已提交
361 362 363 364
 * r10  arg3 	(--> moved to rcx for C)
 * r8   arg4
 * r9   arg5
 * r11  eflags for syscall/sysret, temporary for C
365 366
 * r12-r15,rbp,rbx saved by C code, not touched.
 *
L
Linus Torvalds 已提交
367 368 369 370 371
 * Interrupts are off on entry.
 * Only called from user space.
 *
 * XXX	if we had a free scratch register we could save the RSP into the stack frame
 *      and report it properly in ps. Unfortunately we haven't.
372 373 374 375
 *
 * When user can change the frames always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
376
 */
L
Linus Torvalds 已提交
377 378

ENTRY(system_call)
379
	CFI_STARTPROC	simple
380
	CFI_SIGNAL_FRAME
381
	CFI_DEF_CFA	rsp,PDA_STACKOFFSET
382 383
	CFI_REGISTER	rip,rcx
	/*CFI_REGISTER	rflags,r11*/
384 385 386 387 388 389 390 391
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
ENTRY(system_call_after_swapgs)

392
	movq	%rsp,%gs:pda_oldrsp
L
Linus Torvalds 已提交
393
	movq	%gs:pda_kernelstack,%rsp
394 395 396 397
	/*
	 * No need to follow this irqs off/on section - it's straight
	 * and short:
	 */
398
	ENABLE_INTERRUPTS(CLBR_NONE)
L
Linus Torvalds 已提交
399
	SAVE_ARGS 8,1
400
	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
401 402
	movq  %rcx,RIP-ARGOFFSET(%rsp)
	CFI_REL_OFFSET rip,RIP-ARGOFFSET
L
Linus Torvalds 已提交
403
	GET_THREAD_INFO(%rcx)
404
	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
L
Linus Torvalds 已提交
405
	jnz tracesys
R
Roland McGrath 已提交
406
system_call_fastpath:
L
Linus Torvalds 已提交
407 408 409 410 411 412 413
	cmpq $__NR_syscall_max,%rax
	ja badsys
	movq %r10,%rcx
	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
	movq %rax,RAX-ARGOFFSET(%rsp)
/*
 * Syscall return path ending with SYSRET (fast path)
414 415
 * Has incomplete stack frame and undefined top of stack.
 */
L
Linus Torvalds 已提交
416
ret_from_sys_call:
417
	movl $_TIF_ALLWORK_MASK,%edi
L
Linus Torvalds 已提交
418
	/* edi:	flagmask */
419
sysret_check:
420
	LOCKDEP_SYS_EXIT
L
Linus Torvalds 已提交
421
	GET_THREAD_INFO(%rcx)
422
	DISABLE_INTERRUPTS(CLBR_NONE)
423
	TRACE_IRQS_OFF
G
Glauber Costa 已提交
424
	movl TI_flags(%rcx),%edx
L
Linus Torvalds 已提交
425
	andl %edi,%edx
426
	jnz  sysret_careful
427
	CFI_REMEMBER_STATE
428 429 430 431
	/*
	 * sysretq will re-enable interrupts:
	 */
	TRACE_IRQS_ON
L
Linus Torvalds 已提交
432
	movq RIP-ARGOFFSET(%rsp),%rcx
433
	CFI_REGISTER	rip,rcx
L
Linus Torvalds 已提交
434
	RESTORE_ARGS 0,-ARG_SKIP,1
435
	/*CFI_REGISTER	rflags,r11*/
436
	movq	%gs:pda_oldrsp, %rsp
437
	USERGS_SYSRET64
L
Linus Torvalds 已提交
438

439
	CFI_RESTORE_STATE
L
Linus Torvalds 已提交
440
	/* Handle reschedules */
441
	/* edx:	work, edi: workmask */
L
Linus Torvalds 已提交
442 443 444
sysret_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc sysret_signal
445
	TRACE_IRQS_ON
446
	ENABLE_INTERRUPTS(CLBR_NONE)
L
Linus Torvalds 已提交
447
	pushq %rdi
448
	CFI_ADJUST_CFA_OFFSET 8
L
Linus Torvalds 已提交
449 450
	call schedule
	popq  %rdi
451
	CFI_ADJUST_CFA_OFFSET -8
L
Linus Torvalds 已提交
452 453
	jmp sysret_check

454
	/* Handle a signal */
L
Linus Torvalds 已提交
455
sysret_signal:
456
	TRACE_IRQS_ON
457
	ENABLE_INTERRUPTS(CLBR_NONE)
R
Roland McGrath 已提交
458 459 460 461
#ifdef CONFIG_AUDITSYSCALL
	bt $TIF_SYSCALL_AUDIT,%edx
	jc sysret_audit
#endif
462
	/* edx:	work flags (arg3) */
L
Linus Torvalds 已提交
463 464 465 466
	leaq do_notify_resume(%rip),%rax
	leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
	xorl %esi,%esi # oldset -> arg2
	call ptregscall_common
467
	movl $_TIF_WORK_MASK,%edi
468 469
	/* Use IRET because user could have changed frame. This
	   works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
470
	DISABLE_INTERRUPTS(CLBR_NONE)
471
	TRACE_IRQS_OFF
472
	jmp int_with_check
473

474 475 476 477
badsys:
	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
	jmp ret_from_sys_call

R
Roland McGrath 已提交
478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510
#ifdef CONFIG_AUDITSYSCALL
	/*
	 * Fast path for syscall audit without full syscall trace.
	 * We just call audit_syscall_entry() directly, and then
	 * jump back to the normal fast path.
	 */
auditsys:
	/*
	 * The moves are ordered so that every source register is read
	 * before it is overwritten as a destination.
	 */
	movq %r10,%r9			/* 6th arg: 4th syscall arg */
	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
	movq %rax,%rsi			/* 2nd arg: syscall number */
	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
	call audit_syscall_entry
	LOAD_ARGS 0		/* reload call-clobbered registers */
	jmp system_call_fastpath

	/*
	 * Return fast path for syscall audit.  Call audit_syscall_exit()
	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
	 * masked off.
	 */
sysret_audit:
	/* copy the return value first: setl below overwrites %al */
	movq %rax,%rsi		/* second arg, syscall return value */
	cmpq $0,%rax		/* is it < 0? */
	setl %al		/* 1 if so, 0 if not */
	movzbl %al,%edi		/* zero-extend that into %edi */
	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
	call audit_syscall_exit
	/* new work mask for sysret_check, without the audit bit */
	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
	jmp sysret_check
#endif	/* CONFIG_AUDITSYSCALL */

L
Linus Torvalds 已提交
511
	/* Do syscall tracing */
512
tracesys:
R
Roland McGrath 已提交
513 514 515 516
#ifdef CONFIG_AUDITSYSCALL
	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
	jz auditsys
#endif
L
Linus Torvalds 已提交
517
	SAVE_REST
R
Roland McGrath 已提交
518
	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
L
Linus Torvalds 已提交
519 520 521
	FIXUP_TOP_OF_STACK %rdi
	movq %rsp,%rdi
	call syscall_trace_enter
522 523 524 525 526 527
	/*
	 * Reload arg registers from stack in case ptrace changed them.
	 * We don't reload %rax because syscall_trace_enter() returned
	 * the value it wants us to use in the table lookup.
	 */
	LOAD_ARGS ARGOFFSET, 1
L
Linus Torvalds 已提交
528 529
	RESTORE_REST
	cmpq $__NR_syscall_max,%rax
R
Roland McGrath 已提交
530
	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
L
Linus Torvalds 已提交
531 532
	movq %r10,%rcx	/* fixup for C */
	call *sys_call_table(,%rax,8)
R
Roland McGrath 已提交
533
	movq %rax,RAX-ARGOFFSET(%rsp)
534
	/* Use IRET because user could have changed frame */
535 536

/*
L
Linus Torvalds 已提交
537 538
 * Syscall return path ending with IRET.
 * Has correct top of stack, but partial stack frame.
539 540
 */
	.globl int_ret_from_sys_call
541
	.globl int_with_check
542
int_ret_from_sys_call:
543
	DISABLE_INTERRUPTS(CLBR_NONE)
544
	TRACE_IRQS_OFF
L
Linus Torvalds 已提交
545 546 547 548 549
	testl $3,CS-ARGOFFSET(%rsp)
	je retint_restore_args
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	mask to check */
int_with_check:
550
	LOCKDEP_SYS_EXIT_IRQ
L
Linus Torvalds 已提交
551
	GET_THREAD_INFO(%rcx)
G
Glauber Costa 已提交
552
	movl TI_flags(%rcx),%edx
L
Linus Torvalds 已提交
553 554
	andl %edi,%edx
	jnz   int_careful
G
Glauber Costa 已提交
555
	andl    $~TS_COMPAT,TI_status(%rcx)
L
Linus Torvalds 已提交
556 557 558 559 560 561 562 563
	jmp   retint_swapgs

	/* Either reschedule or signal or syscall exit tracking needed. */
	/* First do a reschedule test. */
	/* edx:	work, edi: workmask */
int_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc  int_very_careful
564
	TRACE_IRQS_ON
565
	ENABLE_INTERRUPTS(CLBR_NONE)
L
Linus Torvalds 已提交
566
	pushq %rdi
567
	CFI_ADJUST_CFA_OFFSET 8
L
Linus Torvalds 已提交
568 569
	call schedule
	popq %rdi
570
	CFI_ADJUST_CFA_OFFSET -8
571
	DISABLE_INTERRUPTS(CLBR_NONE)
572
	TRACE_IRQS_OFF
L
Linus Torvalds 已提交
573 574 575 576
	jmp int_with_check

	/* handle signals and tracing -- both require a full stack frame */
int_very_careful:
577
	TRACE_IRQS_ON
578
	ENABLE_INTERRUPTS(CLBR_NONE)
L
Linus Torvalds 已提交
579
	SAVE_REST
580
	/* Check for syscall exit trace */
581
	testl $_TIF_WORK_SYSCALL_EXIT,%edx
L
Linus Torvalds 已提交
582 583
	jz int_signal
	pushq %rdi
584
	CFI_ADJUST_CFA_OFFSET 8
585
	leaq 8(%rsp),%rdi	# &ptregs -> arg1
L
Linus Torvalds 已提交
586 587
	call syscall_trace_leave
	popq %rdi
588
	CFI_ADJUST_CFA_OFFSET -8
589
	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
L
Linus Torvalds 已提交
590
	jmp int_restore_rest
591

L
Linus Torvalds 已提交
592
int_signal:
P
Peter Zijlstra 已提交
593
	testl $_TIF_DO_NOTIFY_MASK,%edx
L
Linus Torvalds 已提交
594 595 596 597
	jz 1f
	movq %rsp,%rdi		# &ptregs -> arg1
	xorl %esi,%esi		# oldset -> arg2
	call do_notify_resume
R
Roland McGrath 已提交
598
1:	movl $_TIF_WORK_MASK,%edi
L
Linus Torvalds 已提交
599 600
int_restore_rest:
	RESTORE_REST
601
	DISABLE_INTERRUPTS(CLBR_NONE)
602
	TRACE_IRQS_OFF
L
Linus Torvalds 已提交
603 604
	jmp int_with_check
	CFI_ENDPROC
605
END(system_call)
606 607

/*
L
Linus Torvalds 已提交
608
 * Certain special system calls that need to save a complete full stack frame.
609 610
 */

L
Linus Torvalds 已提交
611 612 613 614 615 616
	/*
	 * PTREGSCALL label,func,arg
	 *
	 * Define the global stub \label, which loads the address of \func
	 * into %rax, loads -ARGOFFSET+8(%rsp) into the register \arg and
	 * jumps to ptregscall_common.
	 *
	 * \label: name of the stub to define
	 * \func:  function whose address is handed over in %rax
	 * \arg:   register that receives the computed frame address
	 */
	.macro PTREGSCALL label,func,arg
	.globl \label
\label:
	leaq	\func(%rip),%rax
	leaq    -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
	jmp	ptregscall_common
END(\label)
	.endm

620 621
	CFI_STARTPROC

L
Linus Torvalds 已提交
622 623 624 625 626 627 628 629
	PTREGSCALL stub_clone, sys_clone, %r8
	PTREGSCALL stub_fork, sys_fork, %rdi
	PTREGSCALL stub_vfork, sys_vfork, %rdi
	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
	PTREGSCALL stub_iopl, sys_iopl, %rsi

/*
 * ptregscall_common - shared tail of the PTREGSCALL stubs.
 *
 * In:  %rax = address of the function to call
 *      (%rsp) = return address
 *
 * Pops the return address, builds a full frame with SAVE_REST and
 * FIXUP_TOP_OF_STACK, calls *%rax, then undoes both and returns to the
 * popped address.  The return address is parked in %r15 across the call
 * (after SAVE_REST has run, and before RESTORE_REST runs again);
 * %r11 is used as scratch.
 */
ENTRY(ptregscall_common)
	popq %r11
	CFI_ADJUST_CFA_OFFSET -8
	CFI_REGISTER rip, r11
	SAVE_REST
	movq %r11, %r15
	CFI_REGISTER rip, r15
	FIXUP_TOP_OF_STACK %r11
	call *%rax
	RESTORE_TOP_OF_STACK %r11
	movq %r15, %r11
	CFI_REGISTER rip, r11
	RESTORE_REST
	/* put the return address back for the ret below */
	pushq %r11
	CFI_ADJUST_CFA_OFFSET 8
	CFI_REL_OFFSET rip, 0
	ret
	CFI_ENDPROC
END(ptregscall_common)
647

L
Linus Torvalds 已提交
648 649 650
/*
 * stub_execve - execve entry needing a full frame.
 *
 * Discards its own return address (popped into %r11 and not pushed
 * back), builds a full frame with SAVE_REST and FIXUP_TOP_OF_STACK,
 * passes %rsp in %rcx and calls sys_execve.  The result is stored in
 * the frame's RAX slot and the stub leaves through
 * int_ret_from_sys_call.
 */
ENTRY(stub_execve)
	CFI_STARTPROC
	popq %r11
	CFI_ADJUST_CFA_OFFSET -8
	CFI_REGISTER rip, r11
	SAVE_REST
	FIXUP_TOP_OF_STACK %r11
	movq %rsp, %rcx
	call sys_execve
	RESTORE_TOP_OF_STACK %r11
	movq %rax,RAX(%rsp)
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_execve)
663

L
Linus Torvalds 已提交
664 665 666
/*
 * sigreturn is special because it needs to restore all registers on return.
 * This cannot be done with SYSRET, so use the IRET return path instead.
667
 */
L
Linus Torvalds 已提交
668 669
/*
 * stub_rt_sigreturn
 *
 * Drops its own return address from the stack, builds a full frame with
 * SAVE_REST and FIXUP_TOP_OF_STACK, and calls sys_rt_sigreturn with %rsp
 * in %rdi.  The result is stored in the frame's RAX slot and the stub
 * leaves through int_ret_from_sys_call.  %r11 is used as scratch.
 */
ENTRY(stub_rt_sigreturn)
	CFI_STARTPROC
	addq $8, %rsp
	CFI_ADJUST_CFA_OFFSET	-8
	SAVE_REST
	movq %rsp,%rdi
	FIXUP_TOP_OF_STACK %r11
	call sys_rt_sigreturn
	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_rt_sigreturn)
L
Linus Torvalds 已提交
681

682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698
/*
 * Build the entry stubs and pointer table with some assembler magic.
 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
 * single cache line on all modern x86 implementations.
 */
	/*
	 * Two things are generated in lock-step, one entry per vector from
	 * FIRST_EXTERNAL_VECTOR up to (not including) NR_VECTORS:
	 *   - in .text (irq_entries_start): a stub that pushes
	 *     ~vector+0x80 and reaches "jmp common_interrupt";
	 *   - in .init.rodata (interrupt): a .quad holding that stub's
	 *     address.
	 * Stubs are emitted in 32-byte aligned groups of up to seven.  The
	 * first six of a group jump to the group's shared
	 * "2: jmp common_interrupt"; the seventh falls through into it.
	 */
	.section .init.rodata,"a"
ENTRY(interrupt)
	.text
	.p2align 5
	.p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
	INTR_FRAME
vector=FIRST_EXTERNAL_VECTOR
.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
	.balign 32
  .rept	7
    .if vector < NR_VECTORS
      .if vector <> FIRST_EXTERNAL_VECTOR
	/* cancel the CFA adjustment made for the previous stub's push */
	CFI_ADJUST_CFA_OFFSET -8
      .endif
1:	pushq $(~vector+0x80)	/* Note: always in signed byte range */
	CFI_ADJUST_CFA_OFFSET 8
      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
	jmp 2f
      .endif
      /* back to .init.rodata to record this stub's address */
      .previous
	.quad 1b
      .text
vector=vector+1
    .endif
  .endr
2:	jmp common_interrupt
.endr
	CFI_ENDPROC
END(irq_entries_start)

.previous
END(interrupt)
.previous

722
/*
L
Linus Torvalds 已提交
723 724 725
 * Interrupt entry/exit.
 *
 * Interrupt entry points save only callee clobbered registers in fast path.
726 727 728
 *
 * Entry runs with interrupts off.
 */
L
Linus Torvalds 已提交
729

730
/* 0(%rsp): ~(interrupt number) */
L
Linus Torvalds 已提交
731
	/*
	 * interrupt func
	 *
	 * Reserve 10*8 bytes of stack, call save_args to store the
	 * registers there, then call the handler \func.
	 */
	.macro interrupt func
	subq $10*8, %rsp
	CFI_ADJUST_CFA_OFFSET 10*8
	call save_args
	call \func
	.endm

738 739 740 741
	/*
	 * The interrupt stubs push (~vector+0x80) onto the stack and
	 * then jump to common_interrupt.
	 */
742 743
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
744
	XCPT_FRAME
745
	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
L
Linus Torvalds 已提交
746 747
	interrupt do_IRQ
	/* 0(%rsp): oldrsp-ARGOFFSET */
748
ret_from_intr:
749
	DISABLE_INTERRUPTS(CLBR_NONE)
750
	TRACE_IRQS_OFF
751
	decl %gs:pda_irqcount
752
	leaveq
753
	CFI_DEF_CFA_REGISTER	rsp
754
	CFI_ADJUST_CFA_OFFSET	-8
755
exit_intr:
L
Linus Torvalds 已提交
756 757 758
	GET_THREAD_INFO(%rcx)
	testl $3,CS-ARGOFFSET(%rsp)
	je retint_kernel
759

L
Linus Torvalds 已提交
760 761 762 763
	/* Interrupt came from user space */
	/*
	 * Has a correct top of stack, but a partial stack frame
	 * %rcx: thread info. Interrupts off.
764
	 */
L
Linus Torvalds 已提交
765 766
retint_with_reschedule:
	movl $_TIF_WORK_MASK,%edi
767
retint_check:
768
	LOCKDEP_SYS_EXIT_IRQ
G
Glauber Costa 已提交
769
	movl TI_flags(%rcx),%edx
L
Linus Torvalds 已提交
770
	andl %edi,%edx
771
	CFI_REMEMBER_STATE
L
Linus Torvalds 已提交
772
	jnz  retint_careful
773 774

retint_swapgs:		/* return to user-space */
775 776 777
	/*
	 * The iretq could re-enable interrupts:
	 */
778
	DISABLE_INTERRUPTS(CLBR_ANY)
779
	TRACE_IRQS_IRETQ
780
	SWAPGS
781 782
	jmp restore_args

783
retint_restore_args:	/* return to kernel space */
784
	DISABLE_INTERRUPTS(CLBR_ANY)
785 786 787 788 789
	/*
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ
restore_args:
I
Ingo Molnar 已提交
790 791
	RESTORE_ARGS 0,8,0

A
Adrian Bunk 已提交
792
irq_return:
793
	INTERRUPT_RETURN
I
Ingo Molnar 已提交
794 795 796 797 798 799

	.section __ex_table, "a"
	.quad irq_return, bad_iret
	.previous

#ifdef CONFIG_PARAVIRT
800
ENTRY(native_iret)
L
Linus Torvalds 已提交
801 802 803
	iretq

	.section __ex_table,"a"
804
	.quad native_iret, bad_iret
L
Linus Torvalds 已提交
805
	.previous
I
Ingo Molnar 已提交
806 807
#endif

L
Linus Torvalds 已提交
808 809
	.section .fixup,"ax"
bad_iret:
810 811 812 813 814 815 816 817 818 819 820 821 822 823 824
	/*
	 * The iret traps when the %cs or %ss being restored is bogus.
	 * We've lost the original trap vector and error code.
	 * #GPF is the most likely one to get for an invalid selector.
	 * So pretend we completed the iret and took the #GPF in user mode.
	 *
	 * We are now running with the kernel GS after exception recovery.
	 * But error_entry expects us to have user GS to match the user %cs,
	 * so swap back.
	 */
	pushq $0

	SWAPGS
	jmp general_protection

825 826
	.previous

827
	/* edi: workmask, edx: work */
L
Linus Torvalds 已提交
828
retint_careful:
829
	CFI_RESTORE_STATE
L
Linus Torvalds 已提交
830 831
	bt    $TIF_NEED_RESCHED,%edx
	jnc   retint_signal
832
	TRACE_IRQS_ON
833
	ENABLE_INTERRUPTS(CLBR_NONE)
L
Linus Torvalds 已提交
834
	pushq %rdi
835
	CFI_ADJUST_CFA_OFFSET	8
L
Linus Torvalds 已提交
836
	call  schedule
837
	popq %rdi
838
	CFI_ADJUST_CFA_OFFSET	-8
L
Linus Torvalds 已提交
839
	GET_THREAD_INFO(%rcx)
840
	DISABLE_INTERRUPTS(CLBR_NONE)
841
	TRACE_IRQS_OFF
L
Linus Torvalds 已提交
842
	jmp retint_check
843

L
Linus Torvalds 已提交
844
retint_signal:
P
Peter Zijlstra 已提交
845
	testl $_TIF_DO_NOTIFY_MASK,%edx
846
	jz    retint_swapgs
847
	TRACE_IRQS_ON
848
	ENABLE_INTERRUPTS(CLBR_NONE)
L
Linus Torvalds 已提交
849
	SAVE_REST
850
	movq $-1,ORIG_RAX(%rsp)
851
	xorl %esi,%esi		# oldset
L
Linus Torvalds 已提交
852 853 854
	movq %rsp,%rdi		# &pt_regs
	call do_notify_resume
	RESTORE_REST
855
	DISABLE_INTERRUPTS(CLBR_NONE)
856
	TRACE_IRQS_OFF
857
	GET_THREAD_INFO(%rcx)
R
Roland McGrath 已提交
858
	jmp retint_with_reschedule
L
Linus Torvalds 已提交
859 860 861 862

#ifdef CONFIG_PREEMPT
	/* Returning to kernel space. Check if we need preemption */
	/* rcx:	 threadinfo. interrupts off. */
863
ENTRY(retint_kernel)
G
Glauber Costa 已提交
864
	cmpl $0,TI_preempt_count(%rcx)
L
Linus Torvalds 已提交
865
	jnz  retint_restore_args
G
Glauber Costa 已提交
866
	bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
L
Linus Torvalds 已提交
867 868 869 870 871
	jnc  retint_restore_args
	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
	jnc  retint_restore_args
	call preempt_schedule_irq
	jmp exit_intr
872
#endif
873

L
Linus Torvalds 已提交
874
	CFI_ENDPROC
875
END(common_interrupt)
876

L
Linus Torvalds 已提交
877 878
/*
 * APIC interrupts.
879
 */
880 881
	.p2align 5

L
Linus Torvalds 已提交
882
	/*
	 * apicinterrupt num,func
	 *
	 * Body of an APIC interrupt entry point: push ~\num, run the
	 * interrupt macro with handler \func, then leave through
	 * ret_from_intr.  Opens and closes its own CFI procedure.
	 *
	 * \num:  vector number; its bitwise complement is pushed
	 * \func: handler called by the interrupt macro
	 */
	.macro apicinterrupt num,func
	INTR_FRAME
	pushq $~(\num)
	CFI_ADJUST_CFA_OFFSET 8
	interrupt \func
	jmp ret_from_intr
	CFI_ENDPROC
	.endm

ENTRY(thermal_interrupt)
	apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
893
END(thermal_interrupt)
L
Linus Torvalds 已提交
894

895 896
ENTRY(threshold_interrupt)
	apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
897
END(threshold_interrupt)
898

899
#ifdef CONFIG_SMP
L
Linus Torvalds 已提交
900 901
ENTRY(reschedule_interrupt)
	apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
902
END(reschedule_interrupt)
L
Linus Torvalds 已提交
903

904 905
	.macro INVALIDATE_ENTRY num
ENTRY(invalidate_interrupt\num)
906
	apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
907
END(invalidate_interrupt\num)
908 909 910 911 912 913 914 915 916 917
	.endm

	INVALIDATE_ENTRY 0
	INVALIDATE_ENTRY 1
	INVALIDATE_ENTRY 2
	INVALIDATE_ENTRY 3
	INVALIDATE_ENTRY 4
	INVALIDATE_ENTRY 5
	INVALIDATE_ENTRY 6
	INVALIDATE_ENTRY 7
L
Linus Torvalds 已提交
918 919 920

ENTRY(call_function_interrupt)
	apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
921
END(call_function_interrupt)
922 923 924
ENTRY(call_function_single_interrupt)
	apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
END(call_function_single_interrupt)
925 926 927
ENTRY(irq_move_cleanup_interrupt)
	apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
END(irq_move_cleanup_interrupt)
L
Linus Torvalds 已提交
928 929 930 931
#endif

ENTRY(apic_timer_interrupt)
	apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
932
END(apic_timer_interrupt)
L
Linus Torvalds 已提交
933

934 935 936 937
/*
 * Entry stub that hands vector 220 to uv_bau_message_interrupt.
 * NOTE(review): 220 is the only literal vector number among the
 * apicinterrupt users in this file; a named constant would match the
 * other stubs -- confirm whether one exists.
 */
ENTRY(uv_bau_message_intr1)
	apicinterrupt 220,uv_bau_message_interrupt
END(uv_bau_message_intr1)

L
Linus Torvalds 已提交
938 939
ENTRY(error_interrupt)
	apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
940
END(error_interrupt)
L
Linus Torvalds 已提交
941 942 943

ENTRY(spurious_interrupt)
	apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
944
END(spurious_interrupt)
945

L
Linus Torvalds 已提交
946 947
/*
 * Exception entry points.
948
 */
L
Linus Torvalds 已提交
949
	/*
	 * zeroentry sym - entry stub for exceptions without an error code.
	 *
	 * Pushes -1 into the ORIG_RAX slot, reserves room for 15 registers,
	 * lets error_entry fill that room, then calls \sym with
	 *   %rdi = pointer to the saved registers (pt_regs)
	 *   %esi = 0 (no error code)
	 * and leaves through error_exit.
	 */
	.macro zeroentry sym
950
	INTR_FRAME
951
	PARAVIRT_ADJUST_EXCEPTION_FRAME
952
	pushq $-1		/* ORIG_RAX: no syscall to restart */
953
	CFI_ADJUST_CFA_OFFSET 8
954 955 956 957 958 959 960
	subq $15*8,%rsp		/* room for the 15 registers error_entry stores */
	CFI_ADJUST_CFA_OFFSET 15*8
	call error_entry
	movq %rsp,%rdi		/* pt_regs pointer */
	xorl %esi,%esi		/* no error code */
	call \sym
	jmp error_exit		/* %ebx: no swapgs flag */
961
	CFI_ENDPROC
962
	.endm
L
Linus Torvalds 已提交
963 964

	/*
	 * errorentry sym - entry stub for exceptions whose error code is
	 * already on the stack (it sits in the ORIG_RAX slot).
	 *
	 * Reserves room for 15 registers, lets error_entry fill that room,
	 * then calls \sym with
	 *   %rdi = pointer to the saved registers (pt_regs)
	 *   %rsi = error code read from ORIG_RAX
	 * after overwriting ORIG_RAX with -1, and leaves through error_exit.
	 */
	.macro errorentry sym
965
	XCPT_FRAME
966
	PARAVIRT_ADJUST_EXCEPTION_FRAME
967 968 969 970 971 972 973 974
	subq $15*8,%rsp			/* room for the 15 registers error_entry stores */
	CFI_ADJUST_CFA_OFFSET 15*8
	call error_entry
	movq %rsp,%rdi			/* pt_regs pointer */
	movq ORIG_RAX(%rsp),%rsi	/* get error code */
	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
	call \sym
	jmp error_exit			/* %ebx: no swapgs flag */
975
	CFI_ENDPROC
L
Linus Torvalds 已提交
976 977 978 979
	.endm

	/* error code is on the stack already */
	/* handle NMI like exceptions that can happen everywhere */
980
	.macro paranoidentry sym, ist=0, irqtrace=1
L
Linus Torvalds 已提交
981 982 983 984 985 986 987
	SAVE_ALL
	cld
	movl $1,%ebx
	movl  $MSR_GS_BASE,%ecx
	rdmsr
	testl %edx,%edx
	js    1f
988
	SWAPGS
L
Linus Torvalds 已提交
989
	xorl  %ebx,%ebx
990 991 992 993
1:
	.if \ist
	movq	%gs:pda_data_offset, %rbp
	.endif
994 995 996
	.if \irqtrace
	TRACE_IRQS_OFF
	.endif
997
	movq %rsp,%rdi
L
Linus Torvalds 已提交
998 999
	movq ORIG_RAX(%rsp),%rsi
	movq $-1,ORIG_RAX(%rsp)
1000
	.if \ist
1001
	subq	$EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1002
	.endif
L
Linus Torvalds 已提交
1003
	call \sym
1004
	.if \ist
1005
	addq	$EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1006
	.endif
1007
	DISABLE_INTERRUPTS(CLBR_NONE)
1008 1009 1010
	.if \irqtrace
	TRACE_IRQS_OFF
	.endif
L
Linus Torvalds 已提交
1011
	.endm
1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032

	/*
 	 * "Paranoid" exit path from exception stack.
  	 * Paranoid because this is used by NMIs and cannot take
	 * any kernel state for granted.
	 * We don't do kernel preemption checks here, because only
	 * NMI should be common and it does not enable IRQs and
	 * cannot get reschedule ticks.
	 *
	 * "trace" is 0 for the NMI handler only, because irq-tracing
	 * is fundamentally NMI-unsafe. (we cannot change the soft and
	 * hard flags at once, atomically)
	 */
	.macro paranoidexit trace=1
	/* ebx:	no swapgs flag */
paranoid_exit\trace:
	testl %ebx,%ebx				/* swapgs needed? */
	jnz paranoid_restore\trace
	testl $3,CS(%rsp)
	jnz   paranoid_userspace\trace
paranoid_swapgs\trace:
1033
	.if \trace
1034
	TRACE_IRQS_IRETQ 0
1035
	.endif
1036
	SWAPGS_UNSAFE_STACK
1037 1038
paranoid_restore\trace:
	RESTORE_ALL 8
I
Ingo Molnar 已提交
1039
	jmp irq_return
1040 1041
paranoid_userspace\trace:
	GET_THREAD_INFO(%rcx)
G
Glauber Costa 已提交
1042
	movl TI_flags(%rcx),%ebx
1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053
	andl $_TIF_WORK_MASK,%ebx
	jz paranoid_swapgs\trace
	movq %rsp,%rdi			/* &pt_regs */
	call sync_regs
	movq %rax,%rsp			/* switch stack for scheduling */
	testl $_TIF_NEED_RESCHED,%ebx
	jnz paranoid_schedule\trace
	movl %ebx,%edx			/* arg3: thread flags */
	.if \trace
	TRACE_IRQS_ON
	.endif
1054
	ENABLE_INTERRUPTS(CLBR_NONE)
1055 1056 1057
	xorl %esi,%esi 			/* arg2: oldset */
	movq %rsp,%rdi 			/* arg1: &pt_regs */
	call do_notify_resume
1058
	DISABLE_INTERRUPTS(CLBR_NONE)
1059 1060 1061 1062 1063 1064 1065 1066
	.if \trace
	TRACE_IRQS_OFF
	.endif
	jmp paranoid_userspace\trace
paranoid_schedule\trace:
	.if \trace
	TRACE_IRQS_ON
	.endif
1067
	ENABLE_INTERRUPTS(CLBR_ANY)
1068
	call schedule
1069
	DISABLE_INTERRUPTS(CLBR_ANY)
1070 1071 1072 1073 1074 1075 1076
	.if \trace
	TRACE_IRQS_OFF
	.endif
	jmp paranoid_userspace\trace
	CFI_ENDPROC
	.endm

L
Linus Torvalds 已提交
1077
/*
1078 1079
 * Exception entry point. This expects an error code/orig_rax on the stack.
 * Returns the "no swapgs flag" in %ebx.
1080
 */
1081
KPROBE_ENTRY(error_entry)
1082
	_frame RDI
1083 1084
	CFI_ADJUST_CFA_OFFSET 15*8
	/* oldrax contains error code */
1085
	cld
1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115
	movq %rdi,14*8+8(%rsp)
	CFI_REL_OFFSET rdi,RDI+8
	movq %rsi,13*8+8(%rsp)
	CFI_REL_OFFSET rsi,RSI+8
	movq %rdx,12*8+8(%rsp)
	CFI_REL_OFFSET rdx,RDX+8
	movq %rcx,11*8+8(%rsp)
	CFI_REL_OFFSET rcx,RCX+8
	movq %rax,10*8+8(%rsp)
	CFI_REL_OFFSET rax,RAX+8
	movq %r8, 9*8+8(%rsp)
	CFI_REL_OFFSET r8,R8+8
	movq %r9, 8*8+8(%rsp)
	CFI_REL_OFFSET r9,R9+8
	movq %r10,7*8+8(%rsp)
	CFI_REL_OFFSET r10,R10+8
	movq %r11,6*8+8(%rsp)
	CFI_REL_OFFSET r11,R11+8
	movq %rbx,5*8+8(%rsp)
	CFI_REL_OFFSET rbx,RBX+8
	movq %rbp,4*8+8(%rsp)
	CFI_REL_OFFSET rbp,RBP+8
	movq %r12,3*8+8(%rsp)
	CFI_REL_OFFSET r12,R12+8
	movq %r13,2*8+8(%rsp)
	CFI_REL_OFFSET r13,R13+8
	movq %r14,1*8+8(%rsp)
	CFI_REL_OFFSET r14,R14+8
	movq %r15,0*8+8(%rsp)
	CFI_REL_OFFSET r15,R15+8
1116
	xorl %ebx,%ebx
1117 1118
	testl $3,CS+8(%rsp)
	je error_kernelspace
1119
error_swapgs:
1120
	SWAPGS
1121 1122
error_sti:
	TRACE_IRQS_OFF
1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149
	ret
	CFI_ENDPROC

/*
 * There are two places in the kernel that can potentially fault with
 * usergs. Handle them here. The exception handlers after iret run with
 * kernel gs again, so don't set the user space flag. B stepping K8s
 * sometimes report a truncated RIP for IRET exceptions returning to
 * compat mode. Check for these here too.
 */
error_kernelspace:
	incl %ebx
	leaq irq_return(%rip),%rcx
	cmpq %rcx,RIP+8(%rsp)
	je error_swapgs
	movl %ecx,%ecx	/* zero extend */
	cmpq %rcx,RIP+8(%rsp)
	je error_swapgs
	cmpq $gs_change,RIP+8(%rsp)
        je error_swapgs
	jmp error_sti
KPROBE_END(error_entry)


/*
 * error_exit - common exit path for the zeroentry/errorentry stubs.
 *
 * In: ebx = no swapgs flag (1: don't need swapgs, 0: need it)
 *
 * A non-zero flag leaves through retint_kernel. Otherwise the thread
 * flags are masked with _TIF_WORK_MASK: any bit set goes to
 * retint_careful, none set goes to retint_swapgs.
 */
KPROBE_ENTRY(error_exit)
	_frame R15
1150
	movl %ebx,%eax			/* keep the flag in %eax; presumably RESTORE_REST reloads %rbx -- confirm */
L
Linus Torvalds 已提交
1151
	RESTORE_REST
1152
	DISABLE_INTERRUPTS(CLBR_NONE)
1153
	TRACE_IRQS_OFF
1154
	GET_THREAD_INFO(%rcx)
L
Linus Torvalds 已提交
1155
	testl %eax,%eax			/* flag set: no swapgs needed */
1156
	jne retint_kernel
1157
	LOCKDEP_SYS_EXIT_IRQ
1158 1159 1160 1161
	movl TI_flags(%rcx),%edx
	movl $_TIF_WORK_MASK,%edi
	andl %edi,%edx			/* edx = thread flags & _TIF_WORK_MASK */
	jnz retint_careful
1162
	jmp retint_swapgs
L
Linus Torvalds 已提交
1163
	CFI_ENDPROC
1164
KPROBE_END(error_exit)
1165

L
Linus Torvalds 已提交
1166
       /* Reload gs selector with exception handling */
1167
       /* edi:  new selector */
1168
/*
 * native_load_gs_index - load a new selector into %gs.
 *
 * In:  edi = new selector
 *
 * The flags are saved with pushf and restored with popf, so the
 * caller's interrupt state is preserved. The selector load sits
 * between two SWAPGS instructions. The load itself is labelled
 * gs_change: that address has an __ex_table entry pointing at bad_gs,
 * and error_entry compares the faulting RIP against it.
 *
 * The clobber mask passed to DISABLE_INTERRUPTS must exclude RDI,
 * because %edi still holds the selector and is read at gs_change.
 */
ENTRY(native_load_gs_index)
1169
	CFI_STARTPROC
L
Linus Torvalds 已提交
1170
	pushf
1171
	CFI_ADJUST_CFA_OFFSET 8
1172 1173
	DISABLE_INTERRUPTS(CLBR_ANY & ~(CLBR_RDI))	/* anything but RDI may be clobbered */
        SWAPGS
1174 1175
gs_change:
        movl %edi,%gs
L
Linus Torvalds 已提交
1176
2:	mfence		/* workaround */
1177
	SWAPGS
L
Linus Torvalds 已提交
1178
        popf
1179
	CFI_ADJUST_CFA_OFFSET -8
L
Linus Torvalds 已提交
1180
        ret
1181
	CFI_ENDPROC
1182
ENDPROC(native_load_gs_index)
1183

L
Linus Torvalds 已提交
1184 1185 1186 1187 1188 1189
        .section __ex_table,"a"
        .align 8
        .quad gs_change,bad_gs
        .previous
        .section .fixup,"ax"
	/* running with kernelgs */
1190
bad_gs:
1191
	SWAPGS			/* switch back to user gs */
L
Linus Torvalds 已提交
1192 1193 1194
	xorl %eax,%eax
        movl %eax,%gs
        jmp  2b
1195 1196
        .previous

L
Linus Torvalds 已提交
1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218
/*
 * Create a kernel thread.
 *
 * C extern interface:
 *	extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 *
 * asm input arguments:
 *	rdi: fn, rsi: arg, rdx: flags
 */
ENTRY(kernel_thread)
	CFI_STARTPROC
	FAKE_STACK_FRAME $child_rip
	SAVE_ALL

	# rdi: flags, rsi: usp, rdx: will be &pt_regs
	movq %rdx,%rdi
	orq  kernel_thread_flags(%rip),%rdi
	movq $-1, %rsi
	movq %rsp, %rdx

	xorl %r8d,%r8d
	xorl %r9d,%r9d
1219

L
Linus Torvalds 已提交
1220 1221 1222 1223 1224 1225 1226 1227 1228 1229
	# clone now
	call do_fork
	movq %rax,RAX(%rsp)
	xorl %edi,%edi

	/*
	 * It isn't worth checking for a reschedule here,
	 * so internally to the x86_64 port you can rely on kernel_thread()
	 * not to reschedule the child before returning, this avoids the need
	 * of hacks for example to fork off the per-CPU idle tasks.
1230
         * [Hopefully no generic code relies on the reschedule -AK]
L
Linus Torvalds 已提交
1231 1232 1233 1234 1235
	 */
	RESTORE_ALL
	UNFAKE_STACK_FRAME
	ret
	CFI_ENDPROC
1236
ENDPROC(kernel_thread)
1237

L
Linus Torvalds 已提交
1238
child_rip:
1239 1240
	pushq $0		# fake return address
	CFI_STARTPROC
L
Linus Torvalds 已提交
1241 1242 1243 1244 1245 1246 1247 1248
	/*
	 * Here we are in the child and the registers are set as they were
	 * at kernel_thread() invocation in the parent.
	 */
	movq %rdi, %rax
	movq %rsi, %rdi
	call *%rax
	# exit
1249
	mov %eax, %edi
L
Linus Torvalds 已提交
1250
	call do_exit
1251
	CFI_ENDPROC
1252
ENDPROC(child_rip)
L
Linus Torvalds 已提交
1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263

/*
 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
 *
 * C extern interface:
 *	 extern long execve(char *name, char **argv, char **envp)
 *
 * asm input arguments:
 *	rdi: name, rsi: argv, rdx: envp
 *
 * We want to fall back into:
1264
 *	extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
L
Linus Torvalds 已提交
1265 1266
 *
 * do_sys_execve asm fallback arguments:
1267
 *	rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
L
Linus Torvalds 已提交
1268
 */
1269
ENTRY(kernel_execve)
L
Linus Torvalds 已提交
1270 1271
	CFI_STARTPROC
	FAKE_STACK_FRAME $0
1272
	SAVE_ALL
1273
	movq %rsp,%rcx
L
Linus Torvalds 已提交
1274
	call sys_execve
1275
	movq %rax, RAX(%rsp)
L
Linus Torvalds 已提交
1276 1277 1278 1279 1280 1281 1282
	RESTORE_REST
	testq %rax,%rax
	je int_ret_from_sys_call
	RESTORE_ARGS
	UNFAKE_STACK_FRAME
	ret
	CFI_ENDPROC
1283
ENDPROC(kernel_execve)
L
Linus Torvalds 已提交
1284

1285
KPROBE_ENTRY(page_fault)
L
Linus Torvalds 已提交
1286
	errorentry do_page_fault
1287
KPROBE_END(page_fault)
L
Linus Torvalds 已提交
1288 1289 1290

ENTRY(coprocessor_error)
	zeroentry do_coprocessor_error
1291
END(coprocessor_error)
L
Linus Torvalds 已提交
1292 1293

ENTRY(simd_coprocessor_error)
1294
	zeroentry do_simd_coprocessor_error
1295
END(simd_coprocessor_error)
L
Linus Torvalds 已提交
1296 1297

ENTRY(device_not_available)
1298
	zeroentry do_device_not_available
1299
END(device_not_available)
L
Linus Torvalds 已提交
1300 1301

	/* runs on exception stack */
1302
KPROBE_ENTRY(debug)
1303
 	INTR_FRAME
1304
	PARAVIRT_ADJUST_EXCEPTION_FRAME
L
Linus Torvalds 已提交
1305
	pushq $0
1306
	CFI_ADJUST_CFA_OFFSET 8
1307
	paranoidentry do_debug, DEBUG_STACK
1308
	paranoidexit
1309
KPROBE_END(debug)
L
Linus Torvalds 已提交
1310

1311
	/* runs on exception stack */
1312
KPROBE_ENTRY(nmi)
1313
	INTR_FRAME
1314
	PARAVIRT_ADJUST_EXCEPTION_FRAME
L
Linus Torvalds 已提交
1315
	pushq $-1
1316
	CFI_ADJUST_CFA_OFFSET 8
1317 1318 1319 1320 1321 1322 1323
	paranoidentry do_nmi, 0, 0
#ifdef CONFIG_TRACE_IRQFLAGS
	paranoidexit 0
#else
	jmp paranoid_exit1
 	CFI_ENDPROC
#endif
1324
KPROBE_END(nmi)
1325

1326
KPROBE_ENTRY(int3)
1327
 	INTR_FRAME
1328
	PARAVIRT_ADJUST_EXCEPTION_FRAME
1329 1330
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8
1331
 	paranoidentry do_int3, DEBUG_STACK
1332
 	jmp paranoid_exit1
1333
 	CFI_ENDPROC
1334
KPROBE_END(int3)
L
Linus Torvalds 已提交
1335 1336 1337

ENTRY(overflow)
	zeroentry do_overflow
1338
END(overflow)
L
Linus Torvalds 已提交
1339 1340 1341

ENTRY(bounds)
	zeroentry do_bounds
1342
END(bounds)
L
Linus Torvalds 已提交
1343 1344

ENTRY(invalid_op)
1345
	zeroentry do_invalid_op
1346
END(invalid_op)
L
Linus Torvalds 已提交
1347 1348 1349

ENTRY(coprocessor_segment_overrun)
	zeroentry do_coprocessor_segment_overrun
1350
END(coprocessor_segment_overrun)
L
Linus Torvalds 已提交
1351 1352 1353

	/* runs on exception stack */
ENTRY(double_fault)
1354
	XCPT_FRAME
1355
	PARAVIRT_ADJUST_EXCEPTION_FRAME
L
Linus Torvalds 已提交
1356
	paranoidentry do_double_fault
1357
	jmp paranoid_exit1
L
Linus Torvalds 已提交
1358
	CFI_ENDPROC
1359
END(double_fault)
L
Linus Torvalds 已提交
1360 1361 1362

ENTRY(invalid_TSS)
	errorentry do_invalid_TSS
1363
END(invalid_TSS)
L
Linus Torvalds 已提交
1364 1365 1366

ENTRY(segment_not_present)
	errorentry do_segment_not_present
1367
END(segment_not_present)
L
Linus Torvalds 已提交
1368 1369 1370

	/* runs on exception stack */
ENTRY(stack_segment)
1371
	XCPT_FRAME
1372
	PARAVIRT_ADJUST_EXCEPTION_FRAME
L
Linus Torvalds 已提交
1373
	paranoidentry do_stack_segment
1374
	jmp paranoid_exit1
L
Linus Torvalds 已提交
1375
	CFI_ENDPROC
1376
END(stack_segment)
L
Linus Torvalds 已提交
1377

1378
KPROBE_ENTRY(general_protection)
L
Linus Torvalds 已提交
1379
	errorentry do_general_protection
1380
KPROBE_END(general_protection)
L
Linus Torvalds 已提交
1381 1382 1383

ENTRY(alignment_check)
	errorentry do_alignment_check
1384
END(alignment_check)
L
Linus Torvalds 已提交
1385 1386 1387

ENTRY(divide_error)
	zeroentry do_divide_error
1388
END(divide_error)
L
Linus Torvalds 已提交
1389 1390 1391

ENTRY(spurious_interrupt_bug)
	zeroentry do_spurious_interrupt_bug
1392
END(spurious_interrupt_bug)
L
Linus Torvalds 已提交
1393 1394 1395 1396

#ifdef CONFIG_X86_MCE
	/* runs on exception stack */
ENTRY(machine_check)
1397
	INTR_FRAME
1398
	PARAVIRT_ADJUST_EXCEPTION_FRAME
L
Linus Torvalds 已提交
1399
	pushq $0
1400
	CFI_ADJUST_CFA_OFFSET 8
L
Linus Torvalds 已提交
1401
	paranoidentry do_machine_check
1402
	jmp paranoid_exit1
L
Linus Torvalds 已提交
1403
	CFI_ENDPROC
1404
END(machine_check)
L
Linus Torvalds 已提交
1405 1406
#endif

1407
/* Call softirq on interrupt stack. Interrupts are off. */
1408
/*
 * call_softirq - run __do_softirq with pda_irqcount raised.
 *
 * %rbp is set up as a frame pointer to the caller's stack. The incl
 * of pda_irqcount sets ZF when the count becomes zero, and only then
 * does cmove load %rsp from pda_irqstackptr. (Presumably a zero count
 * means this is the outermost entry, i.e. we are not already on the
 * interrupt stack -- confirm against the pda_irqcount initial value.)
 * leaveq returns to the saved frame regardless of which stack was used.
 */
ENTRY(call_softirq)
1409
	CFI_STARTPROC
1410 1411 1412 1413 1414
	push %rbp
	CFI_ADJUST_CFA_OFFSET	8
	CFI_REL_OFFSET rbp,0
	mov  %rsp,%rbp
	CFI_DEF_CFA_REGISTER rbp
1415
	incl %gs:pda_irqcount
1416 1417
	cmove %gs:pda_irqstackptr,%rsp	/* uses ZF from the incl above */
	push  %rbp			# backlink for old unwinder
1418
	call __do_softirq
1419
	leaveq				/* rsp = rbp, then pop rbp */
1420
	CFI_DEF_CFA_REGISTER	rsp
1421
	CFI_ADJUST_CFA_OFFSET   -8
1422 1423
	decl %gs:pda_irqcount
	ret
1424
	CFI_ENDPROC
1425
ENDPROC(call_softirq)
1426 1427 1428 1429 1430 1431 1432

/*
 * ignore_sysret - load -ENOSYS into %eax and return with sysret.
 *
 * Opened with KPROBE_ENTRY, so it is closed with KPROBE_END like every
 * other KPROBE_ENTRY block in this file; assuming KPROBE_ENTRY pushes
 * a section, this keeps the section stack balanced for the code that
 * follows -- confirm against the macro definitions.
 */
KPROBE_ENTRY(ignore_sysret)
	CFI_STARTPROC
	mov $-ENOSYS,%eax
	sysret
	CFI_ENDPROC
KPROBE_END(ignore_sysret)
1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485

#ifdef CONFIG_XEN
/*
 * Xen event callback entry point. Built with zeroentry, so it saves
 * the registers via error_entry and calls xen_do_hypervisor_callback
 * with %rdi pointing at the saved registers and %esi = 0.
 */
ENTRY(xen_hypervisor_callback)
	zeroentry xen_do_hypervisor_callback
END(xen_hypervisor_callback)

/*
# A note on the "critical region" in our callback handler.
# We want to avoid stacking callback handlers due to events occurring
# during handling of the last event. To do this, we keep events disabled
# until we've done all processing. HOWEVER, we must enable events before
# popping the stack frame (can't be done atomically) and so it would still
# be possible to get enough handler activations to overflow the stack.
# Although unlikely, bugs of that kind are hard to track down, so we'd
# like to avoid the possibility.
# So, on entry to the handler we detect whether we interrupted an
# existing activation in its critical region -- if so, we pop the current
# activation and restart the handler using the previous one.
*/
/*
 * xen_do_hypervisor_callback(struct pt_regs *regs)
 *
 * In:  rdi = pointer to the saved registers (set up by zeroentry)
 *
 * Does not return to its caller: %rsp is reset to the pt_regs pointer,
 * which discards the return address, and the function leaves through
 * error_exit itself. xen_evtchn_do_upcall runs with pda_irqcount
 * raised; %rsp is loaded from pda_irqstackptr only when the incl
 * leaves the count at zero (ZF set).
 */
ENTRY(xen_do_hypervisor_callback)   # xen_do_hypervisor_callback(struct pt_regs *)
	CFI_STARTPROC
/* Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
   see the correct pointer to the pt_regs */
	movq %rdi, %rsp            # we don't return, adjust the stack frame
	CFI_ENDPROC
	CFI_DEFAULT_STACK
11:	incl %gs:pda_irqcount
	movq %rsp,%rbp			# mov leaves ZF from the incl intact
	CFI_DEF_CFA_REGISTER rbp
	cmovzq %gs:pda_irqstackptr,%rsp	# switch stacks only if count is now 0
	pushq %rbp			# backlink for old unwinder
	call xen_evtchn_do_upcall
	popq %rsp			# back to the stack saved in the backlink
	CFI_DEF_CFA_REGISTER rsp
	decl %gs:pda_irqcount
	jmp  error_exit
	CFI_ENDPROC
END(xen_do_hypervisor_callback)

/*
# Hypervisor uses this for application faults while it executes.
# We get here for two reasons:
#  1. Fault while reloading DS, ES, FS or GS
#  2. Fault while executing IRET
# Category 1 we do not need to fix up as Xen has already reloaded all segment
# registers that could be reloaded and zeroed the others.
# Category 2 we fix up by killing the current process. We cannot use the
# normal Linux return path in this case because if we use the IRET hypercall
# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
# We distinguish between categories by comparing each saved segment register
# with its current contents: any discrepancy means we are in category 1.
*/
ENTRY(xen_failsafe_callback)
1486 1487
	framesz = (RIP-0x30)	/* workaround buggy gas */
	_frame framesz
1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509
	CFI_REL_OFFSET rcx, 0
	CFI_REL_OFFSET r11, 8
	movw %ds,%cx
	cmpw %cx,0x10(%rsp)
	CFI_REMEMBER_STATE
	jne 1f
	movw %es,%cx
	cmpw %cx,0x18(%rsp)
	jne 1f
	movw %fs,%cx
	cmpw %cx,0x20(%rsp)
	jne 1f
	movw %gs,%cx
	cmpw %cx,0x28(%rsp)
	jne 1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
	movq (%rsp),%rcx
	CFI_RESTORE rcx
	movq 8(%rsp),%r11
	CFI_RESTORE r11
	addq $0x30,%rsp
	CFI_ADJUST_CFA_OFFSET -0x30
1510 1511 1512 1513 1514 1515 1516
	pushq $0
	CFI_ADJUST_CFA_OFFSET 8
	pushq %r11
	CFI_ADJUST_CFA_OFFSET 8
	pushq %rcx
	CFI_ADJUST_CFA_OFFSET 8
	jmp general_protection
1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532
	CFI_RESTORE_STATE
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq (%rsp),%rcx
	CFI_RESTORE rcx
	movq 8(%rsp),%r11
	CFI_RESTORE r11
	addq $0x30,%rsp
	CFI_ADJUST_CFA_OFFSET -0x30
	pushq $0
	CFI_ADJUST_CFA_OFFSET 8
	SAVE_ALL
	jmp error_exit
	CFI_ENDPROC
END(xen_failsafe_callback)

#endif /* CONFIG_XEN */