// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/nospec.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/nospec-branch.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

#ifdef CONFIG_CONTEXT_TRACKING
/* Called on entry from user mode with IRQs off. */
__visible inline void enter_from_user_mode(void)
{
	CT_WARN_ON(ct_state() != CONTEXT_USER);
	user_exit_irqoff();
}
#else
static inline void enter_from_user_mode(void) {}
#endif

static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
	if (arch == AUDIT_ARCH_X86_64) {
		audit_syscall_entry(regs->orig_ax, regs->di,
				    regs->si, regs->dx, regs->r10);
	} else
#endif
	{
		audit_syscall_entry(regs->orig_ax, regs->bx,
				    regs->cx, regs->dx, regs->si);
	}
}

/*
 * Returns the syscall nr to run (which should match regs->orig_ax) or -1
 * to skip the syscall.
 */
static long syscall_trace_enter(struct pt_regs *regs)
{
	u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;

	struct thread_info *ti = current_thread_info();
	unsigned long ret = 0;
	u32 work;

	if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
		BUG_ON(regs != task_pt_regs(current));

	work = READ_ONCE(ti->flags);

	if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
		ret = tracehook_report_syscall_entry(regs);
		if (ret || (work & _TIF_SYSCALL_EMU))
			return -1L;
	}

#ifdef CONFIG_SECCOMP
	/*
	 * Do seccomp after ptrace, to catch any tracer changes.
	 */
	if (work & _TIF_SECCOMP) {
		struct seccomp_data sd;

		sd.arch = arch;
		sd.nr = regs->orig_ax;
		sd.instruction_pointer = regs->ip;
#ifdef CONFIG_X86_64
		if (arch == AUDIT_ARCH_X86_64) {
			sd.args[0] = regs->di;
			sd.args[1] = regs->si;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->r10;
			sd.args[4] = regs->r8;
			sd.args[5] = regs->r9;
		} else
#endif
		{
			sd.args[0] = regs->bx;
			sd.args[1] = regs->cx;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->si;
			sd.args[4] = regs->di;
			sd.args[5] = regs->bp;
		}

		ret = __secure_computing(&sd);
		if (ret == -1)
			return ret;
	}
#endif

	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
		trace_sys_enter(regs, regs->orig_ax);

	do_audit_syscall_entry(regs, arch);

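	/*
	 * If any of the hooks above left a non-zero value in ret, return
	 * it; otherwise run the (possibly ptrace-modified) syscall number
	 * from orig_ax.
	 */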
	return ret ?: regs->orig_ax;
}

#define EXIT_TO_USERMODE_LOOP_FLAGS				\
	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |	\
	 _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)

static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
{
	/*
	 * In order to return to user mode, we need to have IRQs off with
	 * none of EXIT_TO_USERMODE_LOOP_FLAGS set.  Several of these flags
	 * can be set at any time on preemptible kernels if we have IRQs on,
	 * so we need to loop.  Disabling preemption wouldn't help: doing the
	 * work to clear some of the flags can sleep.
	 */
	while (true) {
		/* We have work to do. */
		local_irq_enable();

		if (cached_flags & _TIF_NEED_RESCHED)
			schedule();

		if (cached_flags & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (cached_flags & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		/* deal with pending signal delivery */
		if (cached_flags & _TIF_SIGPENDING)
			do_signal(regs);

		if (cached_flags & _TIF_NOTIFY_RESUME) {
			clear_thread_flag(TIF_NOTIFY_RESUME);
			tracehook_notify_resume(regs);
			rseq_handle_notify_resume(NULL, regs);
		}

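		/* Run user-return notifiers (used by KVM to restore MSRs). */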
		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
			fire_user_return_notifiers();

		/* Disable IRQs and retry */
		local_irq_disable();

		cached_flags = READ_ONCE(current_thread_info()->flags);

		if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
			break;
	}
}

/* Called with IRQs disabled. */
__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	u32 cached_flags;

	addr_limit_user_check();

	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();

	cached_flags = READ_ONCE(ti->flags);

	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
		exit_to_usermode_loop(regs, cached_flags);

	/* Reload ti->flags; we may have rescheduled above. */
	cached_flags = READ_ONCE(ti->flags);

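	/*
	 * If the FPU registers do not currently hold this task's state
	 * (_TIF_NEED_FPU_LOAD), restore them before returning to user mode.
	 */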
	fpregs_assert_state_consistent();
	if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
		switch_fpu_return();

#ifdef CONFIG_COMPAT
	/*
	 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
	 * returning to user mode.  We need to clear it *after* signal
	 * handling, because syscall restart has a fixup for compat
	 * syscalls.  The fixup is exercised by the ptrace_syscall_32
	 * selftest.
	 *
	 * We also need to clear TS_I386_REGS_POKED: the 32-bit tracer
	 * special case only applies after poking regs and before the
	 * very next return to user mode.
	 */
	ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif

	user_enter_irqoff();

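	/* If the MDS mitigation is enabled, flush CPU buffers on return to user. */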
	mds_user_clear_cpu_buffers();
}

#define SYSCALL_EXIT_WORK_FLAGS				\
	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |	\
	 _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)

static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
{
	bool step;

	audit_syscall_exit(regs);

	if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, regs->ax);

	/*
	 * If TIF_SYSCALL_EMU is set, we only get here because of
	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
	 * We already reported this syscall instruction in
	 * syscall_trace_enter().
	 */
	step = unlikely(
		(cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
		== _TIF_SINGLESTEP);
	if (step || cached_flags & _TIF_SYSCALL_TRACE)
		tracehook_report_syscall_exit(regs, step);
}

/*
 * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
 * state such that we can immediately switch to user mode.
 */
__visible inline void syscall_return_slowpath(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	u32 cached_flags = READ_ONCE(ti->flags);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
		local_irq_enable();

	rseq_syscall(regs);

	/*
	 * First do one-time work.  If these work items are enabled, we
	 * want to run them exactly once per syscall exit with IRQs on.
	 */
	if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
		syscall_slow_exit_work(regs, cached_flags);

	local_irq_disable();
	prepare_exit_to_usermode(regs);
}

#ifdef CONFIG_X86_64
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
	struct thread_info *ti;

	enter_from_user_mode();
	local_irq_enable();
	ti = current_thread_info();
	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
		nr = syscall_trace_enter(regs);

	/*
	 * NB: Native and x32 syscalls are dispatched from the same
	 * table.  The only functional difference is the x32 bit in
	 * regs->orig_ax, which changes the behavior of some syscalls.
	 */
	nr &= __SYSCALL_MASK;
	if (likely(nr < NR_syscalls)) {
		nr = array_index_nospec(nr, NR_syscalls);
		regs->ax = sys_call_table[nr](regs);
	}

	syscall_return_slowpath(regs);
}
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
 * Does a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.  Does
 * all entry and exit work and returns with IRQs off.  This function is
 * extremely hot in workloads that use it, and it's usually called from
 * do_fast_syscall_32, so forcibly inline it to improve performance.
 */
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	unsigned int nr = (unsigned int)regs->orig_ax;

#ifdef CONFIG_IA32_EMULATION
	ti->status |= TS_COMPAT;
#endif

	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
		/*
		 * Subtlety here: if ptrace pokes something larger than
		 * 2^32-1 into orig_ax, this truncates it.  This may or
		 * may not be necessary, but it matches the old asm
		 * behavior.
		 */
		nr = syscall_trace_enter(regs);
	}

	if (likely(nr < IA32_NR_syscalls)) {
		nr = array_index_nospec(nr, IA32_NR_syscalls);
#ifdef CONFIG_IA32_EMULATION
		regs->ax = ia32_sys_call_table[nr](regs);
#else
		/*
		 * It's possible that a 32-bit syscall implementation
		 * takes a 64-bit parameter but nonetheless assumes that
		 * the high bits are zero.  Make sure we zero-extend all
		 * of the args.
		 */
		regs->ax = ia32_sys_call_table[nr](
			(unsigned int)regs->bx, (unsigned int)regs->cx,
			(unsigned int)regs->dx, (unsigned int)regs->si,
			(unsigned int)regs->di, (unsigned int)regs->bp);
#endif /* CONFIG_IA32_EMULATION */
	}

	syscall_return_slowpath(regs);
}

/* Handles int $0x80 */
__visible void do_int80_syscall_32(struct pt_regs *regs)
{
	enter_from_user_mode();
	local_irq_enable();
	do_syscall_32_irqs_on(regs);
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention.  Adjust regs so it looks like we entered using int80.
	 */

	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
		vdso_image_32.sym_int80_landing_pad;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

376 377
	enter_from_user_mode();

378
	local_irq_enable();
379 380

	/* Fetch EBP from where the vDSO stashed it. */
381 382 383 384 385 386
	if (
#ifdef CONFIG_X86_64
		/*
		 * Micro-optimization: the pointer we're following is explicitly
		 * 32 bits, so it can't be out of range.
		 */
		__get_user(*(u32 *)&regs->bp,
			    (u32 __user __force *)(unsigned long)(u32)regs->sp)
#else
		get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp)
#endif
		) {

		/* User code screwed up. */
		local_irq_disable();
		regs->ax = -EFAULT;
		prepare_exit_to_usermode(regs);
		return 0;	/* Keep it simple: use IRET. */
	}

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs);

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 */
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
		regs->cs == __USER_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}
#endif