// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/nospec.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

#ifdef CONFIG_CONTEXT_TRACKING
/**
 * enter_from_user_mode - Establish state when coming from user mode
 *
 * Syscall entry disables interrupts, but user mode is traced as having
 * interrupts enabled. Also, with NO_HZ_FULL, RCU might be idle.
 *
 * 1) Tell lockdep that interrupts are disabled
 * 2) Invoke context tracking if enabled to reactivate RCU
 * 3) Trace interrupts off state
 */
__visible noinstr void enter_from_user_mode(void)
{
	enum ctx_state state = ct_state();

	lockdep_hardirqs_off(CALLER_ADDR0);
	user_exit_irqoff();

	instrumentation_begin();
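	/*
	 * ct_state() was sampled before user_exit_irqoff() switched
	 * context tracking to kernel mode, so it must read CONTEXT_USER.
	 */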
	CT_WARN_ON(state != CONTEXT_USER);
	trace_hardirqs_off_prepare();
	instrumentation_end();
}
#else
static __always_inline void enter_from_user_mode(void)
{
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	trace_hardirqs_off_prepare();
	instrumentation_end();
}
#endif

/**
 * exit_to_user_mode - Fixup state when exiting to user mode
 *
 * Syscall exit enables interrupts, but interrupts are still disabled in
 * the kernel when this is invoked. Also tell RCU about it.
 *
 * 1) Trace interrupts on state
 * 2) Invoke context tracking if enabled to adjust RCU state
 * 3) Clear CPU buffers if the CPU is affected by MDS and the mitigation is on.
 * 4) Tell lockdep that interrupts are enabled
 */
static __always_inline void exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	user_enter_irqoff();
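	/*
	 * Clear CPU buffers as late as possible so they cannot be
	 * refilled with kernel data before the return to user mode
	 * (MDS mitigation).
	 */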
	mds_user_clear_cpu_buffers();
	lockdep_hardirqs_on(CALLER_ADDR0);
}

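/*
 * Hand the first four syscall arguments to the audit subsystem.  The
 * x86-64 syscall ABI passes them in di/si/dx/r10; ia32 syscalls use
 * bx/cx/dx/si.
 */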
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
	if (arch == AUDIT_ARCH_X86_64) {
		audit_syscall_entry(regs->orig_ax, regs->di,
				    regs->si, regs->dx, regs->r10);
	} else
#endif
	{
		audit_syscall_entry(regs->orig_ax, regs->bx,
				    regs->cx, regs->dx, regs->si);
	}
}

/*
 * Returns the syscall nr to run (which should match regs->orig_ax) or -1
 * to skip the syscall.
 */
static long syscall_trace_enter(struct pt_regs *regs)
{
	u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;

	struct thread_info *ti = current_thread_info();
	unsigned long ret = 0;
	u32 work;

	if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
		BUG_ON(regs != task_pt_regs(current));

	work = READ_ONCE(ti->flags);

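	/*
	 * Let a ptrace tracer see (and possibly change) the syscall first.
	 * A non-zero tracehook return or PTRACE_SYSEMU (_TIF_SYSCALL_EMU)
	 * skips the syscall entirely.
	 */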
	if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
		ret = tracehook_report_syscall_entry(regs);
		if (ret || (work & _TIF_SYSCALL_EMU))
			return -1L;
	}

#ifdef CONFIG_SECCOMP
	/*
	 * Do seccomp after ptrace, to catch any tracer changes.
	 */
	if (work & _TIF_SECCOMP) {
		struct seccomp_data sd;

		sd.arch = arch;
		sd.nr = regs->orig_ax;
		sd.instruction_pointer = regs->ip;
#ifdef CONFIG_X86_64
		if (arch == AUDIT_ARCH_X86_64) {
			sd.args[0] = regs->di;
			sd.args[1] = regs->si;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->r10;
			sd.args[4] = regs->r8;
			sd.args[5] = regs->r9;
		} else
#endif
		{
			sd.args[0] = regs->bx;
			sd.args[1] = regs->cx;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->si;
			sd.args[4] = regs->di;
			sd.args[5] = regs->bp;
		}

		ret = __secure_computing(&sd);
		if (ret == -1)
			return ret;
	}
#endif

	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
		trace_sys_enter(regs, regs->orig_ax);

	do_audit_syscall_entry(regs, arch);

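	/*
	 * ret is normally 0 here, so the GNU "?:" extension yields
	 * orig_ax, i.e. the syscall number to run.
	 */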
	return ret ?: regs->orig_ax;
}

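/*
 * Work flags that must be handled, and rechecked with IRQs disabled,
 * before returning to user mode.
 */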
#define EXIT_TO_USERMODE_LOOP_FLAGS				\
	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |	\
	 _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)

static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
{
	/*
	 * In order to return to user mode, we need to have IRQs off with
	 * none of EXIT_TO_USERMODE_LOOP_FLAGS set.  Several of these flags
	 * can be set at any time on preemptible kernels if we have IRQs on,
	 * so we need to loop.  Disabling preemption wouldn't help: doing the
	 * work to clear some of the flags can sleep.
	 */
	while (true) {
		/* We have work to do. */
		local_irq_enable();

		if (cached_flags & _TIF_NEED_RESCHED)
			schedule();

		if (cached_flags & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (cached_flags & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		/* deal with pending signal delivery */
		if (cached_flags & _TIF_SIGPENDING)
			do_signal(regs);

		if (cached_flags & _TIF_NOTIFY_RESUME) {
			clear_thread_flag(TIF_NOTIFY_RESUME);
			tracehook_notify_resume(regs);
			rseq_handle_notify_resume(NULL, regs);
		}

		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
			fire_user_return_notifiers();

		/* Disable IRQs and retry */
		local_irq_disable();

		cached_flags = READ_ONCE(current_thread_info()->flags);

		if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
			break;
	}
}

static void __prepare_exit_to_usermode(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	u32 cached_flags;

	addr_limit_user_check();

	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();

	cached_flags = READ_ONCE(ti->flags);

	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
		exit_to_usermode_loop(regs, cached_flags);

	/* Reload ti->flags; we may have rescheduled above. */
	cached_flags = READ_ONCE(ti->flags);

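	/* Sync the task's I/O bitmap state into the TSS before returning. */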
	if (unlikely(cached_flags & _TIF_IO_BITMAP))
		tss_update_io_bitmap();

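	/*
	 * If the FPU registers were not eagerly restored (_TIF_NEED_FPU_LOAD),
	 * load them now so user mode runs with the task's own FPU state.
	 */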
	fpregs_assert_state_consistent();
	if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
		switch_fpu_return();

#ifdef CONFIG_COMPAT
	/*
	 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
	 * returning to user mode.  We need to clear it *after* signal
	 * handling, because syscall restart has a fixup for compat
	 * syscalls.  The fixup is exercised by the ptrace_syscall_32
	 * selftest.
	 *
	 * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer
	 * special case only applies after poking regs and before the
	 * very next return to user mode.
	 */
	ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif
}

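/*
 * noinstr wrapper: the instrumentable work is bracketed by
 * instrumentation_begin()/end(); exit_to_user_mode() runs last and must
 * stay free of instrumentation.
 */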
__visible noinstr void prepare_exit_to_usermode(struct pt_regs *regs)
{
	instrumentation_begin();
	__prepare_exit_to_usermode(regs);
	instrumentation_end();
	exit_to_user_mode();
}

#define SYSCALL_EXIT_WORK_FLAGS				\
	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |	\
	 _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)

static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
{
	bool step;

	audit_syscall_exit(regs);

	if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, regs->ax);

	/*
	 * If TIF_SYSCALL_EMU is set, we only get here because of
	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
	 * We already reported this syscall instruction in
	 * syscall_trace_enter().
	 */
	step = unlikely(
		(cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
		== _TIF_SINGLESTEP);
	if (step || cached_flags & _TIF_SYSCALL_TRACE)
		tracehook_report_syscall_exit(regs, step);
}

static void __syscall_return_slowpath(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	u32 cached_flags = READ_ONCE(ti->flags);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
		local_irq_enable();

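	/*
	 * With CONFIG_DEBUG_RSEQ this verifies that the syscall was not
	 * issued from inside an rseq critical section; otherwise it is a
	 * no-op.
	 */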
	rseq_syscall(regs);

	/*
	 * First do one-time work.  If these work items are enabled, we
	 * want to run them exactly once per syscall exit with IRQs on.
	 */
	if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
		syscall_slow_exit_work(regs, cached_flags);

	local_irq_disable();
	__prepare_exit_to_usermode(regs);
}

/*
 * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
 * state such that we can immediately switch to user mode.
 */
__visible noinstr void syscall_return_slowpath(struct pt_regs *regs)
{
	instrumentation_begin();
	__syscall_return_slowpath(regs);
	instrumentation_end();
	exit_to_user_mode();
}

#ifdef CONFIG_X86_64
__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
	struct thread_info *ti;

	enter_from_user_mode();
	instrumentation_begin();

	local_irq_enable();
	ti = current_thread_info();
	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
		nr = syscall_trace_enter(regs);

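	/*
	 * array_index_nospec() clamps nr even under speculative execution,
	 * so a mispredicted bounds check cannot index past the table
	 * (Spectre v1 hardening).
	 */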
	if (likely(nr < NR_syscalls)) {
		nr = array_index_nospec(nr, NR_syscalls);
		regs->ax = sys_call_table[nr](regs);
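	/*
	 * x32 syscalls are tagged with __X32_SYSCALL_BIT and dispatched
	 * through their own table once the bit is masked off.
	 */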
#ifdef CONFIG_X86_X32_ABI
	} else if (likely((nr & __X32_SYSCALL_BIT) &&
			  (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
		nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
					X32_NR_syscalls);
		regs->ax = x32_sys_call_table[nr](regs);
#endif
	}
	__syscall_return_slowpath(regs);

	instrumentation_end();
	exit_to_user_mode();
}
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
 * Does a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.  Does
 * all entry and exit work and returns with IRQs off.  This function is
 * extremely hot in workloads that use it, and it's usually called from
 * do_fast_syscall_32, so forcibly inline it to improve performance.
 */
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	unsigned int nr = (unsigned int)regs->orig_ax;

#ifdef CONFIG_IA32_EMULATION
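	/*
	 * Mark this as a compat syscall so that e.g. ptrace and syscall
	 * restart treat the register layout as 32-bit.
	 */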
	ti->status |= TS_COMPAT;
#endif

	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
		/*
		 * Subtlety here: if ptrace pokes something larger than
		 * 2^32-1 into orig_ax, this truncates it.  This may or
		 * may not be necessary, but it matches the old asm
		 * behavior.
		 */
		nr = syscall_trace_enter(regs);
	}

	if (likely(nr < IA32_NR_syscalls)) {
		nr = array_index_nospec(nr, IA32_NR_syscalls);
		regs->ax = ia32_sys_call_table[nr](regs);
	}

	__syscall_return_slowpath(regs);
}

/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
	enter_from_user_mode();
	instrumentation_begin();

	local_irq_enable();
	do_syscall_32_irqs_on(regs);

	instrumentation_end();
	exit_to_user_mode();
}

static bool __do_fast_syscall_32(struct pt_regs *regs)
{
	int res;

	/* Fetch EBP from where the vDSO stashed it. */
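	/*
	 * The sixth syscall argument lives in EBP, but the vDSO's
	 * __kernel_vsyscall clobbers EBP with the user stack pointer and
	 * saves the original value on the user stack, where regs->sp now
	 * points.
	 */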
	if (IS_ENABLED(CONFIG_X86_64)) {
		/*
		 * Micro-optimization: the pointer we're following is
		 * explicitly 32 bits, so it can't be out of range.
		 */
		res = __get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp);
	} else {
		res = get_user(*(u32 *)&regs->bp,
		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
	}

	if (res) {
		/* User code screwed up. */
		regs->ax = -EFAULT;
		local_irq_disable();
		__prepare_exit_to_usermode(regs);
		return false;
	}

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs);
	return true;
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention.  Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
					vdso_image_32.sym_int80_landing_pad;
	bool success;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	enter_from_user_mode();
	instrumentation_begin();

	local_irq_enable();
	success = __do_fast_syscall_32(regs);

	instrumentation_end();
	exit_to_user_mode();

	/* If it failed, keep it simple: use IRET. */
	if (!success)
		return 0;

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 */
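	/*
	 * CS/SS must still hold the standard user selectors, IP must be
	 * the vDSO landing pad, and RF/TF must be clear; anything unusual
	 * (e.g. ptrace poking regs or single-stepping) falls back to IRET.
	 */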
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
		regs->cs == __USER_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}
#endif

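/*
 * Common stub wired into the syscall tables for unimplemented or
 * reserved syscall numbers.
 */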
SYSCALL_DEFINE0(ni_syscall)
{
	return -ENOSYS;
}