diff --git a/MAINTAINERS b/MAINTAINERS index aa635837a6af900fd074f7d94366fd883d8f5cb1..a384243d911b7a60c109edd60593a1365faf8d18 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11976,6 +11976,17 @@ F: include/dt-bindings/reset/ F: include/linux/reset.h F: include/linux/reset-controller.h +RESTARTABLE SEQUENCES SUPPORT +M: Mathieu Desnoyers +M: Peter Zijlstra +M: "Paul E. McKenney" +M: Boqun Feng +L: linux-kernel@vger.kernel.org +S: Supported +F: kernel/rseq.c +F: include/uapi/linux/rseq.h +F: include/trace/events/rseq.h + RFKILL M: Johannes Berg L: linux-wireless@vger.kernel.org diff --git a/arch/Kconfig b/arch/Kconfig index b695a3e3e92216fae9b7165093240622cee10819..095ba99968c1d135ee79d30141ae07a626381c0b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -272,6 +272,13 @@ config HAVE_REGS_AND_STACK_ACCESS_API declared in asm/ptrace.h For example the kprobes-based event tracer needs this API. +config HAVE_RSEQ + bool + depends on HAVE_REGS_AND_STACK_ACCESS_API + help + This symbol should be selected by an architecture if it + supports an implementation of restartable sequences. + config HAVE_CLK bool help diff --git a/fs/exec.c b/fs/exec.c index 183059c427b9c5552fc29d5eb01f90891930240f..2c3911612b229f614fd5a96124d8bb1f27ccf108 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1822,6 +1822,7 @@ static int do_execveat_common(int fd, struct filename *filename, current->fs->in_exec = 0; current->in_execve = 0; membarrier_execve(current); + rseq_execve(current); acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); diff --git a/include/linux/sched.h b/include/linux/sched.h index 14e4f9c123371e70eb75ce1b3ba1dbfe5ff2c6f8..3aa4fcb74e761dfda361f17d09593ecd9c361646 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -27,6 +27,7 @@ #include #include #include +#include /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -1047,6 +1048,17 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_RSEQ + struct rseq __user *rseq; + u32 rseq_len; + u32 rseq_sig; + /* + * RmW on rseq_event_mask must be performed atomically + * with respect to preemption. + */ + unsigned long rseq_event_mask; +#endif + struct tlbflush_unmap_batch tlb_ubc; struct rcu_head rcu; @@ -1757,4 +1769,126 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +#ifdef CONFIG_RSEQ + +/* + * Map the event mask on the user-space ABI enum rseq_cs_flags + * for direct mask checks. + */ +enum rseq_event_mask_bits { + RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, + RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, + RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, +}; + +enum rseq_event_mask { + RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), + RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), + RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), +}; + +static inline void rseq_set_notify_resume(struct task_struct *t) +{ + if (t->rseq) + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); +} + +void __rseq_handle_notify_resume(struct pt_regs *regs); + +static inline void rseq_handle_notify_resume(struct pt_regs *regs) +{ + if (current->rseq) + __rseq_handle_notify_resume(regs); +} + +static inline void rseq_signal_deliver(struct pt_regs *regs) +{ + preempt_disable(); + __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); + preempt_enable(); + rseq_handle_notify_resume(regs); +} + +/* rseq_preempt() requires preemption to be disabled. */ +static inline void rseq_preempt(struct task_struct *t) +{ + __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); + rseq_set_notify_resume(t); +} + +/* rseq_migrate() requires preemption to be disabled. */ +static inline void rseq_migrate(struct task_struct *t) +{ + __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); + rseq_set_notify_resume(t); +} + +/* + * If parent process has a registered restartable sequences area, the + * child inherits. Only applies when forking a process, not a thread. In + * case a parent fork() in the middle of a restartable sequence, set the + * resume notifier to force the child to retry. + */ +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ + if (clone_flags & CLONE_THREAD) { + t->rseq = NULL; + t->rseq_len = 0; + t->rseq_sig = 0; + t->rseq_event_mask = 0; + } else { + t->rseq = current->rseq; + t->rseq_len = current->rseq_len; + t->rseq_sig = current->rseq_sig; + t->rseq_event_mask = current->rseq_event_mask; + rseq_preempt(t); + } +} + +static inline void rseq_execve(struct task_struct *t) +{ + t->rseq = NULL; + t->rseq_len = 0; + t->rseq_sig = 0; + t->rseq_event_mask = 0; +} + +#else + +static inline void rseq_set_notify_resume(struct task_struct *t) +{ +} +static inline void rseq_handle_notify_resume(struct pt_regs *regs) +{ +} +static inline void rseq_signal_deliver(struct pt_regs *regs) +{ +} +static inline void rseq_preempt(struct task_struct *t) +{ +} +static inline void rseq_migrate(struct task_struct *t) +{ +} +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ +} +static inline void rseq_execve(struct task_struct *t) +{ +} + +#endif + +#ifdef CONFIG_DEBUG_RSEQ + +void rseq_syscall(struct pt_regs *regs); + +#else + +static inline void rseq_syscall(struct pt_regs *regs) +{ +} + +#endif + #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 390e814fdc8d9acb032eee625b99e9902d5351c2..73810808cdf266e5cdcfc1e0c6b3af126a0bf4b1 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -66,6 +66,7 @@ struct old_linux_dirent; struct perf_event_attr; struct file_handle; struct sigaltstack; +struct rseq; union bpf_attr; #include @@ -897,7 +898,8 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val); asmlinkage long sys_pkey_free(int pkey); asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx __user *buffer); - +asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, + int flags, uint32_t sig); /* * Architecture-specific system calls diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h new file mode 100644 index 0000000000000000000000000000000000000000..a04a64bc1a000422f6480521fd74a7b65756893c --- /dev/null +++ b/include/trace/events/rseq.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rseq + +#if !defined(_TRACE_RSEQ_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RSEQ_H + +#include +#include + +TRACE_EVENT(rseq_update, + + TP_PROTO(struct task_struct *t), + + TP_ARGS(t), + + TP_STRUCT__entry( + __field(s32, cpu_id) + ), + + TP_fast_assign( + __entry->cpu_id = raw_smp_processor_id(); + ), + + TP_printk("cpu_id=%d", __entry->cpu_id) +); + +TRACE_EVENT(rseq_ip_fixup, + + TP_PROTO(unsigned long regs_ip, unsigned long start_ip, + unsigned long post_commit_offset, unsigned long abort_ip), + + TP_ARGS(regs_ip, start_ip, post_commit_offset, abort_ip), + + TP_STRUCT__entry( + __field(unsigned long, regs_ip) + __field(unsigned long, start_ip) + __field(unsigned long, post_commit_offset) + __field(unsigned long, abort_ip) + ), + + TP_fast_assign( + __entry->regs_ip = regs_ip; + __entry->start_ip = start_ip; + __entry->post_commit_offset = post_commit_offset; + __entry->abort_ip = abort_ip; + ), + + TP_printk("regs_ip=0x%lx start_ip=0x%lx post_commit_offset=%lu abort_ip=0x%lx", + __entry->regs_ip, __entry->start_ip, + __entry->post_commit_offset, __entry->abort_ip) +); + +#endif /* _TRACE_SOCK_H */ + +/* This part must be outside protection */ +#include diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h new file mode 100644 index 0000000000000000000000000000000000000000..d620fa43756cab2685428861f31c27d9a59b2a39 --- /dev/null +++ b/include/uapi/linux/rseq.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_RSEQ_H +#define _UAPI_LINUX_RSEQ_H + +/* + * linux/rseq.h + * + * Restartable sequences system call API + * + * Copyright (c) 2015-2018 Mathieu Desnoyers + */ + +#ifdef __KERNEL__ +# include +#else +# include +#endif + +#include + +enum rseq_cpu_id_state { + RSEQ_CPU_ID_UNINITIALIZED = -1, + RSEQ_CPU_ID_REGISTRATION_FAILED = -2, +}; + +enum rseq_flags { + RSEQ_FLAG_UNREGISTER = (1 << 0), +}; + +enum rseq_cs_flags_bit { + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, + RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, + RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, +}; + +enum rseq_cs_flags { + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = + (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT), + RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = + (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), + RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = + (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), +}; + +/* + * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always + * contained within a single cache-line. It is usually declared as + * link-time constant data. + */ +struct rseq_cs { + /* Version of this structure. */ + __u32 version; + /* enum rseq_cs_flags */ + __u32 flags; + LINUX_FIELD_u32_u64(start_ip); + /* Offset from start_ip. */ + LINUX_FIELD_u32_u64(post_commit_offset); + LINUX_FIELD_u32_u64(abort_ip); +} __attribute__((aligned(4 * sizeof(__u64)))); + +/* + * struct rseq is aligned on 4 * 8 bytes to ensure it is always + * contained within a single cache-line. + * + * A single struct rseq per thread is allowed. + */ +struct rseq { + /* + * Restartable sequences cpu_id_start field. Updated by the + * kernel, and read by user-space with single-copy atomicity + * semantics. Aligned on 32-bit. Always contains a value in the + * range of possible CPUs, although the value may not be the + * actual current CPU (e.g. if rseq is not initialized). This + * CPU number value should always be compared against the value + * of the cpu_id field before performing a rseq commit or + * returning a value read from a data structure indexed using + * the cpu_id_start value. + */ + __u32 cpu_id_start; + /* + * Restartable sequences cpu_id field. Updated by the kernel, + * and read by user-space with single-copy atomicity semantics. + * Aligned on 32-bit. Values RSEQ_CPU_ID_UNINITIALIZED and + * RSEQ_CPU_ID_REGISTRATION_FAILED have a special semantic: the + * former means "rseq uninitialized", and latter means "rseq + * initialization failed". This value is meant to be read within + * rseq critical sections and compared with the cpu_id_start + * value previously read, before performing the commit instruction, + * or read and compared with the cpu_id_start value before returning + * a value loaded from a data structure indexed using the + * cpu_id_start value. + */ + __u32 cpu_id; + /* + * Restartable sequences rseq_cs field. + * + * Contains NULL when no critical section is active for the current + * thread, or holds a pointer to the currently active struct rseq_cs. + * + * Updated by user-space, which sets the address of the currently + * active rseq_cs at the beginning of assembly instruction sequence + * block, and set to NULL by the kernel when it restarts an assembly + * instruction sequence block, as well as when the kernel detects that + * it is preempting or delivering a signal outside of the range + * targeted by the rseq_cs. Also needs to be set to NULL by user-space + * before reclaiming memory that contains the targeted struct rseq_cs. + * + * Read and set by the kernel with single-copy atomicity semantics. + * Set by user-space with single-copy atomicity semantics. Aligned + * on 64-bit. + */ + LINUX_FIELD_u32_u64(rseq_cs); + /* + * - RSEQ_DISABLE flag: + * + * Fallback fast-track flag for single-stepping. + * Set by user-space if lack of progress is detected. + * Cleared by user-space after rseq finish. + * Read by the kernel. + * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT + * Inhibit instruction sequence block restart and event + * counter increment on preemption for this thread. + * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL + * Inhibit instruction sequence block restart and event + * counter increment on signal delivery for this thread. + * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE + * Inhibit instruction sequence block restart and event + * counter increment on migration for this thread. + */ + __u32 flags; +} __attribute__((aligned(4 * sizeof(__u64)))); + +#endif /* _UAPI_LINUX_RSEQ_H */ diff --git a/init/Kconfig b/init/Kconfig index 18b151f0ddc1fba93777e1ca9f40b9f0fd072711..33ec06fddaaa21ebef9cdfade6e8cc455b9d90a9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1417,6 +1417,29 @@ config ARCH_HAS_MEMBARRIER_CALLBACKS config ARCH_HAS_MEMBARRIER_SYNC_CORE bool +config RSEQ + bool "Enable rseq() system call" if EXPERT + default y + depends on HAVE_RSEQ + select MEMBARRIER + help + Enable the restartable sequences system call. It provides a + user-space cache for the current CPU number value, which + speeds up getting the current CPU number from user-space, + as well as an ABI to speed up user-space operations on + per-CPU data. + + If unsure, say Y. + +config DEBUG_RSEQ + default n + bool "Enabled debugging of rseq() system call" if EXPERT + depends on RSEQ && DEBUG_KERNEL + help + Enable extra debugging checks for the rseq system call. + + If unsure, say N. + config EMBEDDED bool "Embedded system" option allnoconfig_y diff --git a/kernel/Makefile b/kernel/Makefile index f85ae5dfa474008536c4ec3e088ffb5591c41a21..7085c841c413f1150ab69dc5a3453b2ba5c7b3f9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -113,6 +113,7 @@ obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += memremap.o +obj-$(CONFIG_RSEQ) += rseq.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/fork.c b/kernel/fork.c index a5d21c42acfc8ca2d0e14a0969c9cdc9222fb42d..70992bfeba812cb97fb1b68c2a84b5ecfe1e9126 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1899,6 +1899,8 @@ static __latent_entropy struct task_struct *copy_process( */ copy_seccomp(p); + rseq_fork(p, clone_flags); + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the diff --git a/kernel/rseq.c b/kernel/rseq.c new file mode 100644 index 0000000000000000000000000000000000000000..ae306f90c51484fae6bb583733ca5e8f8b3e76be --- /dev/null +++ b/kernel/rseq.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Restartable sequences system call + * + * Copyright (C) 2015, Google, Inc., + * Paul Turner and Andrew Hunter + * Copyright (C) 2015-2018, EfficiOS Inc., + * Mathieu Desnoyers + */ + +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \ + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) + +/* + * + * Restartable sequences are a lightweight interface that allows + * user-level code to be executed atomically relative to scheduler + * preemption and signal delivery. Typically used for implementing + * per-cpu operations. + * + * It allows user-space to perform update operations on per-cpu data + * without requiring heavy-weight atomic operations. + * + * Detailed algorithm of rseq user-space assembly sequences: + * + * init(rseq_cs) + * cpu = TLS->rseq::cpu_id_start + * [1] TLS->rseq::rseq_cs = rseq_cs + * [start_ip] ---------------------------- + * [2] if (cpu != TLS->rseq::cpu_id) + * goto abort_ip; + * [3] + * [post_commit_ip] ---------------------------- + * + * The address of jump target abort_ip must be outside the critical + * region, i.e.: + * + * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip] + * + * Steps [2]-[3] (inclusive) need to be a sequence of instructions in + * userspace that can handle being interrupted between any of those + * instructions, and then resumed to the abort_ip. + * + * 1. Userspace stores the address of the struct rseq_cs assembly + * block descriptor into the rseq_cs field of the registered + * struct rseq TLS area. This update is performed through a single + * store within the inline assembly instruction sequence. + * [start_ip] + * + * 2. Userspace tests to check whether the current cpu_id field match + * the cpu number loaded before start_ip, branching to abort_ip + * in case of a mismatch. + * + * If the sequence is preempted or interrupted by a signal + * at or after start_ip and before post_commit_ip, then the kernel + * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return + * ip to abort_ip before returning to user-space, so the preempted + * execution resumes at abort_ip. + * + * 3. Userspace critical section final instruction before + * post_commit_ip is the commit. The critical section is + * self-terminating. + * [post_commit_ip] + * + * 4. + * + * On failure at [2], or if interrupted by preempt or signal delivery + * between [1] and [3]: + * + * [abort_ip] + * F1. + */ + +static int rseq_update_cpu_id(struct task_struct *t) +{ + u32 cpu_id = raw_smp_processor_id(); + + if (__put_user(cpu_id, &t->rseq->cpu_id_start)) + return -EFAULT; + if (__put_user(cpu_id, &t->rseq->cpu_id)) + return -EFAULT; + trace_rseq_update(t); + return 0; +} + +static int rseq_reset_rseq_cpu_id(struct task_struct *t) +{ + u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED; + + /* + * Reset cpu_id_start to its initial state (0). + */ + if (__put_user(cpu_id_start, &t->rseq->cpu_id_start)) + return -EFAULT; + /* + * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming + * in after unregistration can figure out that rseq needs to be + * registered again. + */ + if (__put_user(cpu_id, &t->rseq->cpu_id)) + return -EFAULT; + return 0; +} + +static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) +{ + struct rseq_cs __user *urseq_cs; + unsigned long ptr; + u32 __user *usig; + u32 sig; + int ret; + + ret = __get_user(ptr, &t->rseq->rseq_cs); + if (ret) + return ret; + if (!ptr) { + memset(rseq_cs, 0, sizeof(*rseq_cs)); + return 0; + } + urseq_cs = (struct rseq_cs __user *)ptr; + if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) + return -EFAULT; + if (rseq_cs->version > 0) + return -EINVAL; + + /* Ensure that abort_ip is not in the critical section. */ + if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) + return -EINVAL; + + usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32)); + ret = get_user(sig, usig); + if (ret) + return ret; + + if (current->rseq_sig != sig) { + printk_ratelimited(KERN_WARNING + "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", + sig, current->rseq_sig, current->pid, usig); + return -EPERM; + } + return 0; +} + +static int rseq_need_restart(struct task_struct *t, u32 cs_flags) +{ + u32 flags, event_mask; + int ret; + + /* Get thread flags. */ + ret = __get_user(flags, &t->rseq->flags); + if (ret) + return ret; + + /* Take critical section flags into account. */ + flags |= cs_flags; + + /* + * Restart on signal can only be inhibited when restart on + * preempt and restart on migrate are inhibited too. Otherwise, + * a preempted signal handler could fail to restart the prior + * execution context on sigreturn. + */ + if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) && + (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) != + RSEQ_CS_PREEMPT_MIGRATE_FLAGS)) + return -EINVAL; + + /* + * Load and clear event mask atomically with respect to + * scheduler preemption. + */ + preempt_disable(); + event_mask = t->rseq_event_mask; + t->rseq_event_mask = 0; + preempt_enable(); + + return !!(event_mask & ~flags); +} + +static int clear_rseq_cs(struct task_struct *t) +{ + /* + * The rseq_cs field is set to NULL on preemption or signal + * delivery on top of rseq assembly block, as well as on top + * of code outside of the rseq assembly block. This performs + * a lazy clear of the rseq_cs field. + * + * Set rseq_cs to NULL with single-copy atomicity. + */ + return __put_user(0UL, &t->rseq->rseq_cs); +} + +/* + * Unsigned comparison will be true when ip >= start_ip, and when + * ip < start_ip + post_commit_offset. + */ +static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) +{ + return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; +} + +static int rseq_ip_fixup(struct pt_regs *regs) +{ + unsigned long ip = instruction_pointer(regs); + struct task_struct *t = current; + struct rseq_cs rseq_cs; + int ret; + + ret = rseq_get_rseq_cs(t, &rseq_cs); + if (ret) + return ret; + + /* + * Handle potentially not being within a critical section. + * If not nested over a rseq critical section, restart is useless. + * Clear the rseq_cs pointer and return. + */ + if (!in_rseq_cs(ip, &rseq_cs)) + return clear_rseq_cs(t); + ret = rseq_need_restart(t, rseq_cs.flags); + if (ret <= 0) + return ret; + ret = clear_rseq_cs(t); + if (ret) + return ret; + trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, + rseq_cs.abort_ip); + instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); + return 0; +} + +/* + * This resume handler must always be executed between any of: + * - preemption, + * - signal delivery, + * and return to user-space. + * + * This is how we can ensure that the entire rseq critical section, + * consisting of both the C part and the assembly instruction sequence, + * will issue the commit instruction only if executed atomically with + * respect to other threads scheduled on the same CPU, and with respect + * to signal handlers. + */ +void __rseq_handle_notify_resume(struct pt_regs *regs) +{ + struct task_struct *t = current; + int ret; + + if (unlikely(t->flags & PF_EXITING)) + return; + if (unlikely(!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq)))) + goto error; + ret = rseq_ip_fixup(regs); + if (unlikely(ret < 0)) + goto error; + if (unlikely(rseq_update_cpu_id(t))) + goto error; + return; + +error: + force_sig(SIGSEGV, t); +} + +#ifdef CONFIG_DEBUG_RSEQ + +/* + * Terminate the process if a syscall is issued within a restartable + * sequence. + */ +void rseq_syscall(struct pt_regs *regs) +{ + unsigned long ip = instruction_pointer(regs); + struct task_struct *t = current; + struct rseq_cs rseq_cs; + + if (!t->rseq) + return; + if (!access_ok(VERIFY_READ, t->rseq, sizeof(*t->rseq)) || + rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) + force_sig(SIGSEGV, t); +} + +#endif + +/* + * sys_rseq - setup restartable sequences for caller thread. + */ +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, + int, flags, u32, sig) +{ + int ret; + + if (flags & RSEQ_FLAG_UNREGISTER) { + /* Unregister rseq for current thread. */ + if (current->rseq != rseq || !current->rseq) + return -EINVAL; + if (current->rseq_len != rseq_len) + return -EINVAL; + if (current->rseq_sig != sig) + return -EPERM; + ret = rseq_reset_rseq_cpu_id(current); + if (ret) + return ret; + current->rseq = NULL; + current->rseq_len = 0; + current->rseq_sig = 0; + return 0; + } + + if (unlikely(flags)) + return -EINVAL; + + if (current->rseq) { + /* + * If rseq is already registered, check whether + * the provided address differs from the prior + * one. + */ + if (current->rseq != rseq || current->rseq_len != rseq_len) + return -EINVAL; + if (current->rseq_sig != sig) + return -EPERM; + /* Already registered. */ + return -EBUSY; + } + + /* + * If there was no rseq previously registered, + * ensure the provided rseq is properly aligned and valid. + */ + if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || + rseq_len != sizeof(*rseq)) + return -EINVAL; + if (!access_ok(VERIFY_WRITE, rseq, rseq_len)) + return -EFAULT; + current->rseq = rseq; + current->rseq_len = rseq_len; + current->rseq_sig = sig; + /* + * If rseq was previously inactive, and has just been + * registered, ensure the cpu_id_start and cpu_id fields + * are updated before returning to user-space. + */ + rseq_set_notify_resume(current); + + return 0; +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e9866f86f3048652babd97009d36b07ba6bab531..a98d54cd553502cb3f983416c0aae0fdac3d6a17 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1191,6 +1191,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; + rseq_migrate(p); perf_event_task_migrate(p); } @@ -2634,6 +2635,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, { sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); + rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); prepare_task(next); prepare_arch_switch(next); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 183169c2a75b6a522b6d4aa83128df223b8cb751..86f832d6ff6f5a8b9520a9f0dc2af8d9f81a17d0 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -432,3 +432,6 @@ COND_SYSCALL(setresgid16); COND_SYSCALL(setresuid16); COND_SYSCALL(setreuid16); COND_SYSCALL(setuid16); + +/* restartable sequence */ +COND_SYSCALL(rseq);