// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/nospec.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
#include <xen/events.h>
#endif

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
#include <asm/irq_stack.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/* Check that the stack and regs on entry from user mode are sane. */
static noinstr void check_user_regs(struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) {
		/*
		 * Make sure that the entry code gave us a sensible EFLAGS
		 * register.  Native because we want to check the actual CPU
		 * state, not the interrupt state as imagined by Xen.
		 */
		unsigned long flags = native_save_fl();
		WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF |
				      X86_EFLAGS_NT));

		/* We think we came from user mode. Make sure pt_regs agrees. */
		WARN_ON_ONCE(!user_mode(regs));

		/*
		 * All entries from user mode (except #DF) should be on the
		 * normal thread stack and should have user pt_regs in the
		 * correct location.
		 */
		WARN_ON_ONCE(!on_thread_stack());
		WARN_ON_ONCE(regs != task_pt_regs(current));
	}
}

#ifdef CONFIG_CONTEXT_TRACKING
/**
 * enter_from_user_mode - Establish state when coming from user mode
 *
 * Syscall entry disables interrupts, but user mode is traced as interrupts
 * enabled. Also with NO_HZ_FULL RCU might be idle.
 *
 * 1) Tell lockdep that interrupts are disabled
 * 2) Invoke context tracking if enabled to reactivate RCU
 * 3) Trace interrupts off state
 */
static noinstr void enter_from_user_mode(void)
{
	enum ctx_state state = ct_state();

	lockdep_hardirqs_off(CALLER_ADDR0);
	user_exit_irqoff();

	instrumentation_begin();
	CT_WARN_ON(state != CONTEXT_USER);
	trace_hardirqs_off_finish();
	instrumentation_end();
}
#else
static __always_inline void enter_from_user_mode(void)
{
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
}
#endif

/**
 * exit_to_user_mode - Fixup state when exiting to user mode
 *
 * Syscall exit enables interrupts, but the kernel state is interrupts
 * disabled when this is invoked. Also tell RCU about it.
 *
 * 1) Trace interrupts on state
 * 2) Invoke context tracking if enabled to adjust RCU state
 * 3) Clear CPU buffers if CPU is affected by MDS and the mitigation is on.
 * 4) Tell lockdep that interrupts are enabled
 */
static __always_inline void exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	user_enter_irqoff();
	mds_user_clear_cpu_buffers();
	lockdep_hardirqs_on(CALLER_ADDR0);
}

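/*
 * Audit records the syscall number and its first four arguments.  The
 * argument registers differ by ABI: rdi/rsi/rdx/r10 for 64-bit syscalls,
 * ebx/ecx/edx/esi for 32-bit ones, matching the loads below.
 */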
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
	if (arch == AUDIT_ARCH_X86_64) {
		audit_syscall_entry(regs->orig_ax, regs->di,
				    regs->si, regs->dx, regs->r10);
	} else
#endif
	{
		audit_syscall_entry(regs->orig_ax, regs->bx,
				    regs->cx, regs->dx, regs->si);
	}
}

/*
 * Returns the syscall nr to run (which should match regs->orig_ax) or -1
 * to skip the syscall.
 */
static long syscall_trace_enter(struct pt_regs *regs)
{
	u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;

	struct thread_info *ti = current_thread_info();
	unsigned long ret = 0;
	u32 work;

	work = READ_ONCE(ti->flags);

	if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
		ret = tracehook_report_syscall_entry(regs);
		if (ret || (work & _TIF_SYSCALL_EMU))
			return -1L;
	}

#ifdef CONFIG_SECCOMP
	/*
	 * Do seccomp after ptrace, to catch any tracer changes.
	 */
	if (work & _TIF_SECCOMP) {
		struct seccomp_data sd;

		sd.arch = arch;
		sd.nr = regs->orig_ax;
		sd.instruction_pointer = regs->ip;
#ifdef CONFIG_X86_64
		if (arch == AUDIT_ARCH_X86_64) {
			sd.args[0] = regs->di;
			sd.args[1] = regs->si;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->r10;
			sd.args[4] = regs->r8;
			sd.args[5] = regs->r9;
		} else
#endif
		{
			sd.args[0] = regs->bx;
			sd.args[1] = regs->cx;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->si;
			sd.args[4] = regs->di;
			sd.args[5] = regs->bp;
		}

		ret = __secure_computing(&sd);
		if (ret == -1)
			return ret;
	}
#endif

	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
		trace_sys_enter(regs, regs->orig_ax);

	do_audit_syscall_entry(regs, arch);

	return ret ?: regs->orig_ax;
}

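/*
 * Work flags that must be handled, with IRQs enabled, before returning to
 * user mode.  They are processed in exit_to_usermode_loop() below.
 */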
#define EXIT_TO_USERMODE_LOOP_FLAGS				\
	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |	\
	 _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)

static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
{
	/*
	 * In order to return to user mode, we need to have IRQs off with
	 * none of EXIT_TO_USERMODE_LOOP_FLAGS set.  Several of these flags
	 * can be set at any time on preemptible kernels if we have IRQs on,
	 * so we need to loop.  Disabling preemption wouldn't help: doing the
	 * work to clear some of the flags can sleep.
	 */
	while (true) {
		/* We have work to do. */
		local_irq_enable();

		if (cached_flags & _TIF_NEED_RESCHED)
			schedule();

		if (cached_flags & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (cached_flags & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		/* deal with pending signal delivery */
		if (cached_flags & _TIF_SIGPENDING)
			do_signal(regs);

		if (cached_flags & _TIF_NOTIFY_RESUME) {
			clear_thread_flag(TIF_NOTIFY_RESUME);
			tracehook_notify_resume(regs);
			rseq_handle_notify_resume(NULL, regs);
		}

		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
			fire_user_return_notifiers();

		/* Disable IRQs and retry */
		local_irq_disable();

		cached_flags = READ_ONCE(current_thread_info()->flags);

		if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
			break;
	}
}

static void __prepare_exit_to_usermode(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	u32 cached_flags;

	addr_limit_user_check();

	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();

	cached_flags = READ_ONCE(ti->flags);

	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
		exit_to_usermode_loop(regs, cached_flags);

	/* Reload ti->flags; we may have rescheduled above. */
	cached_flags = READ_ONCE(ti->flags);

	if (unlikely(cached_flags & _TIF_IO_BITMAP))
		tss_update_io_bitmap();

	fpregs_assert_state_consistent();
	if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
		switch_fpu_return();

#ifdef CONFIG_COMPAT
	/*
	 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
	 * returning to user mode.  We need to clear it *after* signal
	 * handling, because syscall restart has a fixup for compat
	 * syscalls.  The fixup is exercised by the ptrace_syscall_32
	 * selftest.
	 *
	 * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer
	 * special case only applies after poking regs and before the
	 * very next return to user mode.
	 */
	ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif
}

static noinstr void prepare_exit_to_usermode(struct pt_regs *regs)
{
	instrumentation_begin();
	__prepare_exit_to_usermode(regs);
	instrumentation_end();
	exit_to_user_mode();
}

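/*
 * One-time syscall exit work (audit, tracepoints, ptrace/single-step
 * reporting), run with IRQs on from syscall_slow_exit_work().
 */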
#define SYSCALL_EXIT_WORK_FLAGS				\
	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |	\
	 _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)

static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
{
	bool step;

	audit_syscall_exit(regs);

	if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, regs->ax);

	/*
	 * If TIF_SYSCALL_EMU is set, we only get here because of
	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
	 * We already reported this syscall instruction in
	 * syscall_trace_enter().
	 */
	step = unlikely(
		(cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
		== _TIF_SINGLESTEP);
	if (step || cached_flags & _TIF_SYSCALL_TRACE)
		tracehook_report_syscall_exit(regs, step);
}

static void __syscall_return_slowpath(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	u32 cached_flags = READ_ONCE(ti->flags);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
		local_irq_enable();

	rseq_syscall(regs);

	/*
	 * First do one-time work.  If these work items are enabled, we
	 * want to run them exactly once per syscall exit with IRQs on.
	 */
	if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
		syscall_slow_exit_work(regs, cached_flags);

	local_irq_disable();
	__prepare_exit_to_usermode(regs);
}

/*
 * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
 * state such that we can immediately switch to user mode.
 */
__visible noinstr void syscall_return_slowpath(struct pt_regs *regs)
{
	instrumentation_begin();
	__syscall_return_slowpath(regs);
	instrumentation_end();
	exit_to_user_mode();
}

#ifdef CONFIG_X86_64
__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
	struct thread_info *ti;

	check_user_regs(regs);

	enter_from_user_mode();
	instrumentation_begin();

	local_irq_enable();
	ti = current_thread_info();
	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
		nr = syscall_trace_enter(regs);

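	/*
	 * Dispatch: native 64-bit syscalls go through sys_call_table; x32
	 * syscalls carry __X32_SYSCALL_BIT and use x32_sys_call_table.
	 */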
	if (likely(nr < NR_syscalls)) {
		nr = array_index_nospec(nr, NR_syscalls);
		regs->ax = sys_call_table[nr](regs);
#ifdef CONFIG_X86_X32_ABI
	} else if (likely((nr & __X32_SYSCALL_BIT) &&
			  (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
		nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
					X32_NR_syscalls);
		regs->ax = x32_sys_call_table[nr](regs);
#endif
	}
	__syscall_return_slowpath(regs);

	instrumentation_end();
	exit_to_user_mode();
}
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
 * Does a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.  Does
 * all entry and exit work and returns with IRQs off.  This function is
 * extremely hot in workloads that use it, and it's usually called from
 * do_fast_syscall_32, so forcibly inline it to improve performance.
 */
static void do_syscall_32_irqs_on(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	unsigned int nr = (unsigned int)regs->orig_ax;

#ifdef CONFIG_IA32_EMULATION
	ti->status |= TS_COMPAT;
#endif

	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
		/*
		 * Subtlety here: if ptrace pokes something larger than
		 * 2^32-1 into orig_ax, this truncates it.  This may or
		 * may not be necessary, but it matches the old asm
		 * behavior.
		 */
		nr = syscall_trace_enter(regs);
	}

	if (likely(nr < IA32_NR_syscalls)) {
		nr = array_index_nospec(nr, IA32_NR_syscalls);
		regs->ax = ia32_sys_call_table[nr](regs);
	}

	__syscall_return_slowpath(regs);
}

/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
	check_user_regs(regs);

	enter_from_user_mode();
	instrumentation_begin();

	local_irq_enable();
	do_syscall_32_irqs_on(regs);

	instrumentation_end();
	exit_to_user_mode();
}

static bool __do_fast_syscall_32(struct pt_regs *regs)
{
	int res;

	/* Fetch EBP from where the vDSO stashed it. */
	if (IS_ENABLED(CONFIG_X86_64)) {
		/*
		 * Micro-optimization: the pointer we're following is
		 * explicitly 32 bits, so it can't be out of range.
		 */
		res = __get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp);
	} else {
		res = get_user(*(u32 *)&regs->bp,
		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
	}

	if (res) {
		/* User code screwed up. */
		regs->ax = -EFAULT;
		local_irq_disable();
		__prepare_exit_to_usermode(regs);
		return false;
	}

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs);
	return true;
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention.  Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
					vdso_image_32.sym_int80_landing_pad;
	bool success;

	check_user_regs(regs);

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	enter_from_user_mode();
	instrumentation_begin();

	local_irq_enable();
	success = __do_fast_syscall_32(regs);

	instrumentation_end();
	exit_to_user_mode();

	/* If it failed, keep it simple: use IRET. */
	if (!success)
		return 0;

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 */
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
		regs->cs == __USER_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
{
	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
	regs->sp = regs->bp;

	/* SYSENTER clobbers EFLAGS.IF.  Assume it was set in usermode. */
	regs->flags |= X86_EFLAGS_IF;

	return do_fast_syscall_32(regs);
}
#endif

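/*
 * Catch-all for holes in the syscall tables: unimplemented or removed
 * syscalls land here and simply return -ENOSYS.
 */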
SYSCALL_DEFINE0(ni_syscall)
{
	return -ENOSYS;
}

/**
 * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional
 *			     RCU handling
 * @regs:	Pointer to pt_regs of interrupted context
 *
 * Invokes:
 *  - lockdep irqflag state tracking as low level ASM entry disabled
 *    interrupts.
 *
 *  - Context tracking if the exception hit user mode.
 *
 *  - The hardirq tracer to keep the state consistent as low level ASM
 *    entry disabled interrupts.
 *
 * For kernel mode entries RCU handling is done conditionally. If RCU is
 * watching then the only RCU requirement is to check whether the tick has
 * to be restarted. If RCU is not watching then rcu_irq_enter() has to be
 * invoked on entry and rcu_irq_exit() on exit.
 *
 * Avoiding the rcu_irq_enter/exit() calls is an optimization but also
 * solves the problem of kernel mode pagefaults which can schedule, which
 * is not possible after invoking rcu_irq_enter() without undoing it.
 *
 * For user mode entries enter_from_user_mode() must be invoked to
 * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
 * would not be possible.
 *
 * Returns: True if RCU has been adjusted on a kernel entry
 *	    False otherwise
 *
 * The return value must be fed into the rcu_exit argument of
 * idtentry_exit_cond_rcu().
 */
bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
{
{
	if (user_mode(regs)) {
		check_user_regs(regs);
		enter_from_user_mode();
		return false;
	}

	/*
	 * If this entry hit the idle task invoke rcu_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for __rcu_is_watching() here would prevent the nesting
	 * interrupt from invoking rcu_irq_enter(). If that nested interrupt is
	 * the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke rcu_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		rcu_irq_enter();
		instrumentation_begin();
		trace_hardirqs_off_finish();
		instrumentation_end();

		return true;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	instrumentation_begin();
	rcu_irq_enter_check_tick();
	/* Use the combo lockdep/tracing function */
	trace_hardirqs_off();
	instrumentation_end();

	return false;
}

static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched)
{
	if (may_sched && !preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
	/* Covers both tracing and lockdep */
	trace_hardirqs_on();
}

/**
 * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU
 *			    handling
 * @regs:	Pointer to pt_regs (exception entry regs)
 * @rcu_exit:	Invoke rcu_irq_exit() if true
 *
 * Depending on the return target (kernel/user) this runs the necessary
 * preemption and work checks if possible and required and returns to
 * the caller with interrupts disabled and no further work pending.
 *
 * This is the last action before returning to the low level ASM code which
 * just needs to return to the appropriate context.
 *
 * Counterpart to idtentry_enter_cond_rcu(). The return value of the entry
 * function must be fed into the @rcu_exit argument.
 */
void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit)
{
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		prepare_exit_to_usermode(regs);
	} else if (regs->flags & X86_EFLAGS_IF) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (rcu_exit) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare(CALLER_ADDR0);
			instrumentation_end();
			rcu_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		idtentry_exit_cond_resched(regs, IS_ENABLED(CONFIG_PREEMPTION));
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (rcu_exit)
			rcu_irq_exit();
	}
}
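/*
 * Minimal usage sketch for the conditional-RCU idtentry pair, mirroring
 * xen_pv_evtchn_do_upcall() below:
 *
 *	rcu_exit = idtentry_enter_cond_rcu(regs);
 *
 *	instrumentation_begin();
 *	... handle the exception ...
 *	instrumentation_end();
 *
 *	idtentry_exit_cond_rcu(regs, rcu_exit);
 */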

/**
 * idtentry_enter_user - Handle state tracking on idtentry from user mode
 * @regs:	Pointer to pt_regs of interrupted context
 *
 * Invokes enter_from_user_mode() to establish the proper context for
 * NOHZ_FULL. Otherwise scheduling on exit would not be possible.
 */
void noinstr idtentry_enter_user(struct pt_regs *regs)
{
	check_user_regs(regs);
	enter_from_user_mode();
}

/**
 * idtentry_exit_user - Handle return from exception to user mode
 * @regs:	Pointer to pt_regs (exception entry regs)
 *
 * Runs the necessary preemption and work checks and returns to the caller
 * with interrupts disabled and no further work pending.
 *
 * This is the last action before returning to the low level ASM code which
 * just needs to return to the appropriate context.
 *
 * Counterpart to idtentry_enter_user().
 */
void noinstr idtentry_exit_user(struct pt_regs *regs)
{
	lockdep_assert_irqs_disabled();

	prepare_exit_to_usermode(regs);
}

#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
 * Some hypercalls issued by the toolstack can take many 10s of
 * seconds. Allow tasks running hypercalls via the privcmd driver to
 * be voluntarily preempted even if full kernel preemption is
 * disabled.
 *
 * Such preemptible hypercalls are bracketed by
 * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
 * calls.
 */
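/*
 * Rough sketch of the expected bracketing (the privcmd driver is the
 * assumed caller):
 *
 *	xen_preemptible_hcall_begin();
 *	ret = privcmd_call(hypercall, ...);
 *	xen_preemptible_hcall_end();
 */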
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);

/*
 * In case of scheduling the flag must be cleared and restored after
 * returning from schedule as the task might move to a different CPU.
 */
static __always_inline bool get_and_clear_inhcall(void)
{
	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);

	__this_cpu_write(xen_in_preemptible_hcall, false);
	return inhcall;
}

static __always_inline void restore_inhcall(bool inhcall)
{
	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
}
#else
static __always_inline bool get_and_clear_inhcall(void) { return false; }
static __always_inline void restore_inhcall(bool inhcall) { }
#endif

static void __xen_pv_evtchn_do_upcall(void)
{
	irq_enter_rcu();
	inc_irq_stat(irq_hv_callback_count);

	xen_hvm_evtchn_do_upcall();

	irq_exit_rcu();
}

__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	struct pt_regs *old_regs;
	bool inhcall, rcu_exit;

	rcu_exit = idtentry_enter_cond_rcu(regs);
	old_regs = set_irq_regs(regs);

	instrumentation_begin();
	run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, NULL, regs);
	instrumentation_end();

	set_irq_regs(old_regs);

	inhcall = get_and_clear_inhcall();
	if (inhcall && !WARN_ON_ONCE(rcu_exit)) {
		instrumentation_begin();
		idtentry_exit_cond_resched(regs, true);
		instrumentation_end();
		restore_inhcall(inhcall);
	} else {
		idtentry_exit_cond_rcu(regs, rcu_exit);
	}
}
#endif /* CONFIG_XEN_PV */