/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/jump_label.h>
#include <asm/unwind_hints.h>
#include <asm/cpufeatures.h>
#include <asm/page_types.h>
#include <asm/percpu.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>

/*

 x86 function call convention, 64-bit:
 -------------------------------------
  arguments           |  callee-saved      | extra caller-saved | return
 [callee-clobbered]   |                    | [callee-clobbered] |
 ---------------------------------------------------------------------------
 rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11             | rax, rdx [**]

 ( rsp is obviously invariant across normal function calls. (gcc can 'merge'
   functions when it sees tail-call optimization possibilities) rflags is
   clobbered. Leftover arguments are passed over the stack frame.)

 [*]  In the frame-pointers case rbp is fixed to the stack frame.

 [**] for struct return values wider than 64 bits the return convention is a
      bit more complex: up to 128 bits width we return small structures
      straight in rax, rdx. For structures larger than that (3 words or
      larger) the caller puts a pointer to an on-stack return struct
      [allocated in the caller's stack frame] into the first argument - i.e.
      into rdi. All other arguments shift up by one in this case.
      Fortunately this case is rare in the kernel.

For 32-bit we have the following conventions - kernel is built with
-mregparm=3 and -freg-struct-return:

 x86 function calling convention, 32-bit:
 ----------------------------------------
  arguments         | callee-saved        | extra caller-saved | return
 [callee-clobbered] |                     | [callee-clobbered] |
 -------------------------------------------------------------------------
 eax edx ecx        | ebx edi esi ebp [*] | <none>             | eax, edx [**]

 ( here too esp is obviously invariant across normal function calls. eflags
   is clobbered. Leftover arguments are passed over the stack frame. )

 [*]  In the frame-pointers case ebp is fixed to the stack frame.

 [**] We build with -freg-struct-return, which on 32-bit means similar
      semantics as on 64-bit: edx can be used for a second return value
      (i.e. covering integer and structure sizes up to 64 bits) - after that
      it gets more complex and more expensive: 3-word or larger struct returns
      get done in the caller's frame and the pointer to the return struct goes
      into regparm0, i.e. eax - the other arguments shift up and the
      function's register parameters degenerate to regparm=2 in essence.

*/
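
/*
 * Illustrative sketch (not part of the original header): calling a
 * hypothetical 64-bit C function, long f(long a, long b, long c), from
 * assembly follows the table above:
 *
 *	movq	$1, %rdi		# 1st argument
 *	movq	$2, %rsi		# 2nd argument
 *	movq	$3, %rdx		# 3rd argument
 *	call	f			# result in %rax; the argument regs,
 *					# %r10, %r11 and rflags may be clobbered
 */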

#ifdef CONFIG_X86_64

/*
 * 64-bit system call stack frame layout defines and helpers,
 * for assembly code:
 */

/* The layout forms the "struct pt_regs" on the stack: */
/*
 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
 * unless syscall needs a complete, fully filled "struct pt_regs".
 */
#define R15		0*8
#define R14		1*8
#define R13		2*8
#define R12		3*8
#define RBP		4*8
#define RBX		5*8
/* These regs are callee-clobbered. Always saved on kernel entry. */
#define R11		6*8
#define R10		7*8
#define R9		8*8
#define R8		9*8
#define RAX		10*8
#define RCX		11*8
#define RDX		12*8
#define RSI		13*8
#define RDI		14*8
/*
 * On syscall entry, this is syscall#. On CPU exception, this is error code.
 * On hw interrupt, it's IRQ number:
 */
#define ORIG_RAX	15*8
/* Return frame for iretq */
#define RIP		16*8
#define CS		17*8
#define EFLAGS		18*8
#define RSP		19*8
#define SS		20*8

#define SIZEOF_PTREGS	21*8

	.macro ALLOC_PT_GPREGS_ON_STACK
	addq	$-(15*8), %rsp
	.endm

	.macro SAVE_AND_CLEAR_REGS offset=0
	/*
	 * Save registers and sanitize registers of values that a
	 * speculation attack might otherwise want to exploit. The
	 * lower registers are likely clobbered well before they
	 * could be put to use in a speculative execution gadget.
	 * Interleave XOR with PUSH for better uop scheduling:
	 */
	movq %rdi, 14*8+\offset(%rsp)
	movq %rsi, 13*8+\offset(%rsp)
	movq %rdx, 12*8+\offset(%rsp)
	movq %rcx, 11*8+\offset(%rsp)
	movq %rax, 10*8+\offset(%rsp)
	movq %r8,  9*8+\offset(%rsp)
	xorq %r8, %r8				/* nospec r8 */
	movq %r9,  8*8+\offset(%rsp)
	xorq %r9, %r9				/* nospec r9 */
	movq %r10, 7*8+\offset(%rsp)
	xorq %r10, %r10				/* nospec r10 */
	movq %r11, 6*8+\offset(%rsp)
	xorq %r11, %r11				/* nospec r11 */
	movq %rbx, 5*8+\offset(%rsp)
	xorl %ebx, %ebx				/* nospec rbx */
	movq %rbp, 4*8+\offset(%rsp)
	xorl %ebp, %ebp				/* nospec rbp */
	movq %r12, 3*8+\offset(%rsp)
	xorq %r12, %r12				/* nospec r12 */
	movq %r13, 2*8+\offset(%rsp)
	xorq %r13, %r13				/* nospec r13 */
	movq %r14, 1*8+\offset(%rsp)
	xorq %r14, %r14				/* nospec r14 */
	movq %r15, 0*8+\offset(%rsp)
	xorq %r15, %r15				/* nospec r15 */
	UNWIND_HINT_REGS offset=\offset
	.endm
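
/*
 * Usage sketch (illustrative only): SAVE_AND_CLEAR_REGS stores into an
 * already-reserved pt_regs area, so a caller first makes room for the 15
 * GP-register slots and then fills them in:
 *
 *	ALLOC_PT_GPREGS_ON_STACK	# rsp -= 15*8
 *	SAVE_AND_CLEAR_REGS		# save GP regs, zero %rbx, %rbp, %r8-%r15
 */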

	.macro PUSH_AND_CLEAR_REGS
	/*
	 * Push registers and sanitize registers of values that a
	 * speculation attack might otherwise want to exploit. The
	 * lower registers are likely clobbered well before they
	 * could be put to use in a speculative execution gadget.
	 * Interleave XOR with PUSH for better uop scheduling:
	 */
	pushq   %rdi		/* pt_regs->di */
	pushq   %rsi		/* pt_regs->si */
	pushq   %rdx		/* pt_regs->dx */
	pushq   %rcx		/* pt_regs->cx */
	pushq   %rax		/* pt_regs->ax */
	pushq   %r8		/* pt_regs->r8 */
	xorq    %r8, %r8	/* nospec   r8 */
	pushq   %r9		/* pt_regs->r9 */
	xorq    %r9, %r9	/* nospec   r9 */
	pushq   %r10		/* pt_regs->r10 */
	xorq    %r10, %r10	/* nospec   r10 */
	pushq   %r11		/* pt_regs->r11 */
	xorq    %r11, %r11	/* nospec   r11*/
	pushq	%rbx		/* pt_regs->rbx */
	xorl    %ebx, %ebx	/* nospec   rbx*/
	pushq	%rbp		/* pt_regs->rbp */
	xorl    %ebp, %ebp	/* nospec   rbp*/
	pushq	%r12		/* pt_regs->r12 */
	xorq    %r12, %r12	/* nospec   r12*/
	pushq	%r13		/* pt_regs->r13 */
	xorq    %r13, %r13	/* nospec   r13*/
	pushq	%r14		/* pt_regs->r14 */
	xorq    %r14, %r14	/* nospec   r14*/
	pushq	%r15		/* pt_regs->r15 */
	xorq    %r15, %r15	/* nospec   r15*/
	UNWIND_HINT_REGS
	.endm
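
/*
 * Usage sketch (illustrative only): once the hardware iret frame and the
 * orig_ax slot are on the stack, an entry path can complete pt_regs and
 * make it findable by the unwinder:
 *
 *	PUSH_AND_CLEAR_REGS		# pushes di..r15, zeroing %rbx, %rbp, %r8-%r15
 *	ENCODE_FRAME_POINTER		# see below; only acts with CONFIG_FRAME_POINTER
 */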

	.macro POP_REGS pop_rdi=1 skip_r11rcx=0
	popq %r15
	popq %r14
	popq %r13
	popq %r12
	popq %rbp
	popq %rbx
	.if \skip_r11rcx
	/* Skip restoring %r11; pop into %rsi just to keep the stack balanced. */
	popq %rsi
	.else
	popq %r11
	.endif
	popq %r10
	popq %r9
	popq %r8
	popq %rax
	.if \skip_r11rcx
	/* Likewise skip %rcx; %rsi gets its real value from the popq below. */
	popq %rsi
	.else
	popq %rcx
	.endif
	popq %rdx
	popq %rsi
	.if \pop_rdi
	popq %rdi
	.endif
	.endm
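
/*
 * Usage sketch (illustrative only): a return path that exits via sysret
 * reloads %rcx (user RIP) and %r11 (user RFLAGS) on its own, so it can
 * skip restoring them and, if it still needs a scratch register, defer
 * %rdi as well:
 *
 *	POP_REGS pop_rdi=0 skip_r11rcx=1
 */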

	/* 0xf1 is the ICEBP (INT1) opcode. */
	.macro icebp
	.byte 0xf1
	.endm

/*
 * This is a sneaky trick to help the unwinder find pt_regs on the stack.  The
 * frame pointer is replaced with an encoded pointer to pt_regs.  The encoding
 * is just setting the LSB, which makes it an invalid stack address and is also
 * a signal to the unwinder that it's a pt_regs pointer in disguise.
 *
 * NOTE: This macro must be used *after* SAVE_AND_CLEAR_REGS because it corrupts
 * the original rbp.
 */
.macro ENCODE_FRAME_POINTER ptregs_offset=0
#ifdef CONFIG_FRAME_POINTER
	.if \ptregs_offset
		leaq \ptregs_offset(%rsp), %rbp
	.else
		mov %rsp, %rbp
	.endif
	orq	$0x1, %rbp
#endif
.endm
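
/*
 * Decoding sketch (illustrative only): an unwinder that finds an odd "frame
 * pointer" can recover the pt_regs address by clearing the LSB again:
 *
 *	mov	%rbp, %rax
 *	and	$~1, %rax		# %rax now points at pt_regs
 */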

#ifdef CONFIG_PAGE_TABLE_ISOLATION

/*
 * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
 * halves:
 */
#define PTI_USER_PGTABLE_BIT		PAGE_SHIFT
#define PTI_USER_PGTABLE_MASK		(1 << PTI_USER_PGTABLE_BIT)
#define PTI_USER_PCID_BIT		X86_CR3_PTI_PCID_USER_BIT
#define PTI_USER_PCID_MASK		(1 << PTI_USER_PCID_BIT)
#define PTI_USER_PGTABLE_AND_PCID_MASK  (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK)

.macro SET_NOFLUSH_BIT	reg:req
	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
.endm

.macro ADJUST_KERNEL_CR3 reg:req
	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
	andq    $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
.endm

.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
	mov	%cr3, \scratch_reg
	ADJUST_KERNEL_CR3 \scratch_reg
	mov	\scratch_reg, %cr3
.Lend_\@:
.endm

#define THIS_CPU_user_pcid_flush_mask   \
	PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask

.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
	mov	%cr3, \scratch_reg

	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID

	/*
	 * Test if the ASID needs a flush.
	 */
	movq	\scratch_reg, \scratch_reg2
	andq	$(0x7FF), \scratch_reg		/* mask ASID */
	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	jnc	.Lnoflush_\@

	/* Flush needed, clear the bit */
	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	movq	\scratch_reg2, \scratch_reg
	jmp	.Lwrcr3_pcid_\@

.Lnoflush_\@:
	movq	\scratch_reg2, \scratch_reg
	SET_NOFLUSH_BIT \scratch_reg

.Lwrcr3_pcid_\@:
	/* Flip the ASID to the user version */
	orq	$(PTI_USER_PCID_MASK), \scratch_reg

.Lwrcr3_\@:
	/* Flip the PGD to the user version */
	orq     $(PTI_USER_PGTABLE_MASK), \scratch_reg
	mov	\scratch_reg, %cr3
.Lend_\@:
.endm

.macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req
	pushq	%rax
	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
	popq	%rax
.endm
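
/*
 * Usage sketch (illustrative only; the register choice is an assumption,
 * not the kernel's exact entry code): the user CR3 value is the kernel one
 * with the PGD-select bit (bit 12) and, when PCID is in use, the user ASID
 * bit set.  Entry paths switch to the kernel view, exit paths switch back:
 *
 *	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi		# %rdi saved beforehand
 *	...
 *	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi	# before returning to user
 */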

.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
	ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
	movq	%cr3, \scratch_reg
	movq	\scratch_reg, \save_reg
	/*
	 * Test the user pagetable bit. If set, then the user page tables
	 * are active. If clear, CR3 already has the kernel page table
	 * active.
	 */
	bt	$PTI_USER_PGTABLE_BIT, \scratch_reg
	jnc	.Ldone_\@

	ADJUST_KERNEL_CR3 \scratch_reg
	movq	\scratch_reg, %cr3

.Ldone_\@:
.endm

.macro RESTORE_CR3 scratch_reg:req save_reg:req
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI

	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID

	/*
	 * KERNEL pages can always resume with NOFLUSH as we do
	 * explicit flushes.
	 */
	bt	$PTI_USER_PGTABLE_BIT, \save_reg
	jnc	.Lnoflush_\@

	/*
	 * Check if there's a pending flush for the user ASID we're
	 * about to set.
	 */
	movq	\save_reg, \scratch_reg
	andq	$(0x7FF), \scratch_reg
	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	jnc	.Lnoflush_\@

	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	jmp	.Lwrcr3_\@

.Lnoflush_\@:
	SET_NOFLUSH_BIT \save_reg

.Lwrcr3_\@:
	/*
	 * The CR3 write could be avoided when not changing its value,
	 * but would require a CR3 read *and* a scratch register.
	 */
	movq	\save_reg, %cr3
.Lend_\@:
.endm
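
/*
 * Usage sketch (illustrative only; the register choices are an assumption):
 * paranoid entry paths, which can interrupt the kernel anywhere, save the
 * live CR3 and put it back on exit, so nested entries restore whatever page
 * tables were active before:
 *
 *	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
 *	...
 *	RESTORE_CR3 scratch_reg=%rax save_reg=%r14
 */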

#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */

.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
.endm
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
.endm
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
.endm
.macro RESTORE_CR3 scratch_reg:req save_reg:req
.endm

#endif

#endif /* CONFIG_X86_64 */

/*
 * This does 'call enter_from_user_mode' unless we can avoid it based on
 * kernel config or using the static jump infrastructure.
 */
.macro CALL_enter_from_user_mode
#ifdef CONFIG_CONTEXT_TRACKING
#ifdef HAVE_JUMP_LABEL
	STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
#endif
	call enter_from_user_mode
.Lafter_call_\@:
#endif
.endm
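
/*
 * Usage sketch (illustrative only): entry code invokes this once pt_regs is
 * built and only when the event came from user mode, e.g.:
 *
 *	testb	$3, CS(%rsp)		# low CS bits != 0 => came from user mode
 *	jz	1f
 *	CALL_enter_from_user_mode
 * 1:
 */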