未验证 提交 1f74be50 编写于 作者: O openeuler-ci-bot 提交者: Gitee

!419 Backport CVEs and bugfixes

Merge Pull Request from: @zhangjialin11 
 
Pull new CVEs:
CVE-2023-26545
CVE-2023-0045
CVE-2023-20938
CVE-2023-0240

rcu bugfix from Zheng Yejian
net bugfixes from Zhengchao Shao
block bugfix from Zhong Jinghua
md/raid10 bugfixes from Li Nan
arm/kasan bugfix from Longlong Xia 
 
Link:https://gitee.com/openeuler/kernel/pulls/419 

Reviewed-by: Zheng Zengkai <zhengzengkai@huawei.com> 
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com> 
......@@ -1129,7 +1129,7 @@ export MODORDER := $(extmod-prefix)modules.order
export MODULES_NSDEPS := $(extmod-prefix)modules.nsdeps
ifeq ($(KBUILD_EXTMOD),)
core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ io_uring/
vmlinux-dirs := $(patsubst %/,%,$(filter %/, \
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \
......
......@@ -137,6 +137,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
* thread information flags:
* TIF_USEDFPU - FPU was used by this task this quantum (SMP)
* TIF_POLLING_NRFLAG - true if poll_idle() is polling TIF_NEED_RESCHED
*
* Any bit in the range of 0..15 will cause do_work_pending() to be invoked.
*/
#define TIF_SIGPENDING 0 /* signal pending */
#define TIF_NEED_RESCHED 1 /* rescheduling necessary */
......@@ -147,6 +149,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
#define TIF_PATCH_PENDING 8 /* pending live patching update */
#define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */
#define TIF_USING_IWMMXT 17
#define TIF_MEMDIE 18 /* is terminating due to OOM killer */
......@@ -162,6 +165,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT)
#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
/* Checks for any syscall work in entry-common.S */
#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
......@@ -171,7 +175,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
* Change these and you break ASM code in entry-common.S
*/
#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_UPROBE)
_TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NOTIFY_SIGNAL)
#endif /* __KERNEL__ */
#endif /* __ASM_ARM_THREAD_INFO_H */
......@@ -54,7 +54,7 @@ __ret_fast_syscall:
cmp r2, r1
blne addr_limit_check_failed
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
movs r1, r1, lsl #16
bne fast_work_pending
......@@ -92,7 +92,7 @@ __ret_fast_syscall:
cmp r2, r1
blne addr_limit_check_failed
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
movs r1, r1, lsl #16
beq no_work_pending
UNWIND(.fnend )
ENDPROC(ret_fast_syscall)
......@@ -134,7 +134,7 @@ ENTRY(ret_to_user_from_irq)
cmp r2, r1
blne addr_limit_check_failed
ldr r1, [tsk, #TI_FLAGS]
tst r1, #_TIF_WORK_MASK
movs r1, r1, lsl #16
bne slow_work_pending
no_work_pending:
asm_trace_hardirqs_on save = 0
......
......@@ -59,7 +59,7 @@ __irq_entry:
get_thread_info tsk
ldr r2, [tsk, #TI_FLAGS]
tst r2, #_TIF_WORK_MASK
movs r2, r2, lsl #16
beq 2f @ no work pending
mov r0, #V7M_SCB_ICSR_PENDSVSET
str r0, [r1, V7M_SCB_ICSR] @ raise PendSV
......
......@@ -655,7 +655,7 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
if (unlikely(!user_mode(regs)))
return 0;
local_irq_enable();
if (thread_flags & _TIF_SIGPENDING) {
if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
int restart = do_signal(regs, syscall);
if (unlikely(restart)) {
/*
......
......@@ -264,12 +264,17 @@ void __init kasan_init(void)
/*
* 1. The module global variables are in MODULES_VADDR ~ MODULES_END,
* so we need to map this area.
* so we need to map this area if CONFIG_KASAN_VMALLOC=n. With
* VMALLOC support KASAN will manage this region dynamically,
* refer to kasan_populate_vmalloc() and ARM's implementation of
* module_alloc().
* 2. PKMAP_BASE ~ PKMAP_BASE+PMD_SIZE's shadow and MODULES_VADDR
* ~ MODULES_END's shadow is in the same PMD_SIZE, so we can't
* use kasan_populate_zero_shadow.
*/
create_mapping((void *)MODULES_VADDR, (void *)(PKMAP_BASE + PMD_SIZE));
if (!IS_ENABLED(CONFIG_KASAN_VMALLOC) && IS_ENABLED(CONFIG_MODULES))
create_mapping((void *)MODULES_VADDR, (void *)(MODULES_END));
create_mapping((void *)PKMAP_BASE, (void *)(PKMAP_BASE + PMD_SIZE));
/*
* KAsan may reuse the contents of kasan_early_shadow_pte directly, so
......
......@@ -69,6 +69,7 @@ void arch_release_task_struct(struct task_struct *tsk);
#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
#define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */
#define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */
#define TIF_SYSCALL_TRACE 8 /* syscall trace active */
#define TIF_SYSCALL_AUDIT 9 /* syscall auditing */
#define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */
......@@ -101,13 +102,15 @@ void arch_release_task_struct(struct task_struct *tsk);
#define _TIF_32BIT (1 << TIF_32BIT)
#define _TIF_SVE (1 << TIF_SVE)
#define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
#define _TIF_32BIT_AARCH64 (1 << TIF_32BIT_AARCH64)
#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
_TIF_UPROBE | _TIF_MTE_ASYNC_FAULT)
_TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \
_TIF_NOTIFY_SIGNAL)
#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
_TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
......
......@@ -711,7 +711,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
(void __user *)NULL, current);
}
if (thread_flags & _TIF_SIGPENDING)
if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
do_signal(regs);
if (thread_flags & _TIF_NOTIFY_RESUME) {
......
......@@ -99,6 +99,7 @@ void arch_setup_new_exec(void);
#define TIF_SYSCALL_TRACE 0 /* syscall trace active */
#define TIF_SIGPENDING 1 /* signal pending */
#define TIF_NEED_RESCHED 2 /* rescheduling necessary */
#define TIF_NOTIFY_SIGNAL 3 /* signal notifications exist */
#define TIF_SYSCALL_EMU 4 /* syscall emulation active */
#define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */
#define TIF_PATCH_PENDING 6 /* pending live patching update */
......@@ -124,6 +125,7 @@ void arch_setup_new_exec(void);
#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
#define _TIF_NOTIFY_SIGNAL (1<<TIF_NOTIFY_SIGNAL)
#define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
#define _TIF_32BIT (1<<TIF_32BIT)
#define _TIF_RESTORE_TM (1<<TIF_RESTORE_TM)
......@@ -145,7 +147,8 @@ void arch_setup_new_exec(void);
#define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
_TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_RESTORE_TM | _TIF_PATCH_PENDING)
_TIF_RESTORE_TM | _TIF_PATCH_PENDING | \
_TIF_NOTIFY_SIGNAL)
#define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
/* Bits in local_flags */
......
......@@ -318,7 +318,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
if (thread_info_flags & _TIF_PATCH_PENDING)
klp_update_patch_state(current);
if (thread_info_flags & _TIF_SIGPENDING) {
if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
BUG_ON(regs != current->thread.regs);
do_signal(current);
}
......
......@@ -80,6 +80,7 @@ struct thread_info {
#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing */
#define TIF_SECCOMP 8 /* syscall secure computing */
#define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
......@@ -88,9 +89,11 @@ struct thread_info {
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
#define _TIF_WORK_MASK \
(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED | \
_TIF_NOTIFY_SIGNAL)
#define _TIF_SYSCALL_WORK \
(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \
......
......@@ -310,7 +310,7 @@ asmlinkage __visible void do_notify_resume(struct pt_regs *regs,
unsigned long thread_info_flags)
{
/* Handle pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
do_signal(regs);
if (thread_info_flags & _TIF_NOTIFY_RESUME)
......
......@@ -99,6 +99,7 @@ struct thread_info {
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* IA32 compatibility process */
#define TIF_SLD 18 /* Restore split lock detection on context switch */
#define TIF_NOTIFY_SIGNAL 19 /* signal notifications exist */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
......@@ -127,6 +128,7 @@ struct thread_info {
#define _TIF_NOCPUID (1 << TIF_NOCPUID)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
#define _TIF_SLD (1 << TIF_SLD)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
......
......@@ -1889,6 +1889,8 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (ctrl == PR_SPEC_FORCE_DISABLE)
task_set_spec_ib_force_disable(task);
task_update_spec_tif(task);
if (task == current)
indirect_branch_prediction_barrier();
break;
default:
return -ERANGE;
......
......@@ -798,11 +798,11 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
void arch_do_signal(struct pt_regs *regs)
void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
{
struct ksignal ksig;
if (get_signal(&ksig)) {
if (has_signal && get_signal(&ksig)) {
/* Whee! Actually deliver the signal. */
handle_signal(&ksig, regs);
return;
......
......@@ -2008,15 +2008,21 @@ static void binder_cleanup_transaction(struct binder_transaction *t,
/**
* binder_get_object() - gets object and checks for valid metadata
* @proc: binder_proc owning the buffer
* @u: sender's user pointer to base of buffer
* @buffer: binder_buffer that we're parsing.
* @offset: offset in the @buffer at which to validate an object.
* @object: struct binder_object to read into
*
* Return: If there's a valid metadata object at @offset in @buffer, the
* Copy the binder object at the given offset into @object. If @u is
* provided then the copy is from the sender's buffer. If not, then
* it is copied from the target's @buffer.
*
* Return: If there's a valid metadata object at @offset, the
* size of that object. Otherwise, it returns zero. The object
* is read into the struct binder_object pointed to by @object.
*/
static size_t binder_get_object(struct binder_proc *proc,
const void __user *u,
struct binder_buffer *buffer,
unsigned long offset,
struct binder_object *object)
......@@ -2026,10 +2032,16 @@ static size_t binder_get_object(struct binder_proc *proc,
size_t object_size = 0;
read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset);
if (offset > buffer->data_size || read_size < sizeof(*hdr) ||
binder_alloc_copy_from_buffer(&proc->alloc, object, buffer,
offset, read_size))
if (offset > buffer->data_size || read_size < sizeof(*hdr))
return 0;
if (u) {
if (copy_from_user(object, u + offset, read_size))
return 0;
} else {
if (binder_alloc_copy_from_buffer(&proc->alloc, object, buffer,
offset, read_size))
return 0;
}
/* Ok, now see if we read a complete object. */
hdr = &object->hdr;
......@@ -2102,7 +2114,7 @@ static struct binder_buffer_object *binder_validate_ptr(
b, buffer_offset,
sizeof(object_offset)))
return NULL;
object_size = binder_get_object(proc, b, object_offset, object);
object_size = binder_get_object(proc, NULL, b, object_offset, object);
if (!object_size || object->hdr.type != BINDER_TYPE_PTR)
return NULL;
if (object_offsetp)
......@@ -2167,7 +2179,8 @@ static bool binder_validate_fixup(struct binder_proc *proc,
unsigned long buffer_offset;
struct binder_object last_object;
struct binder_buffer_object *last_bbo;
size_t object_size = binder_get_object(proc, b, last_obj_offset,
size_t object_size = binder_get_object(proc, NULL, b,
last_obj_offset,
&last_object);
if (object_size != sizeof(*last_bbo))
return false;
......@@ -2242,7 +2255,7 @@ static void binder_deferred_fd_close(int fd)
if (!twcb)
return;
init_task_work(&twcb->twork, binder_do_fd_close);
__close_fd_get_file(fd, &twcb->file);
close_fd_get_file(fd, &twcb->file);
if (twcb->file) {
filp_close(twcb->file, current->files);
task_work_add(current, &twcb->twork, TWA_RESUME);
......@@ -2282,7 +2295,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
if (!binder_alloc_copy_from_buffer(&proc->alloc, &object_offset,
buffer, buffer_offset,
sizeof(object_offset)))
object_size = binder_get_object(proc, buffer,
object_size = binder_get_object(proc, NULL, buffer,
object_offset, &object);
if (object_size == 0) {
pr_err("transaction release %d bad object at offset %lld, size %zd\n",
......@@ -2620,16 +2633,266 @@ static int binder_translate_fd(u32 fd, binder_size_t fd_offset,
return ret;
}
static int binder_translate_fd_array(struct binder_fd_array_object *fda,
/**
* struct binder_ptr_fixup - data to be fixed-up in target buffer
* @offset offset in target buffer to fixup
* @skip_size bytes to skip in copy (fixup will be written later)
* @fixup_data data to write at fixup offset
* @node list node
*
* This is used for the pointer fixup list (pf) which is created and consumed
* during binder_transaction() and is only accessed locally. No
* locking is necessary.
*
* The list is ordered by @offset.
*/
struct binder_ptr_fixup {
binder_size_t offset;
size_t skip_size;
binder_uintptr_t fixup_data;
struct list_head node;
};
/**
* struct binder_sg_copy - scatter-gather data to be copied
* @offset offset in target buffer
* @sender_uaddr user address in source buffer
* @length bytes to copy
* @node list node
*
* This is used for the sg copy list (sgc) which is created and consumed
* during binder_transaction() and is only accessed locally. No
* locking is necessary.
*
* The list is ordered by @offset.
*/
struct binder_sg_copy {
binder_size_t offset;
const void __user *sender_uaddr;
size_t length;
struct list_head node;
};
/**
* binder_do_deferred_txn_copies() - copy and fixup scatter-gather data
* @alloc: binder_alloc associated with @buffer
* @buffer: binder buffer in target process
* @sgc_head: list_head of scatter-gather copy list
* @pf_head: list_head of pointer fixup list
*
* Processes all elements of @sgc_head, applying fixups from @pf_head
* and copying the scatter-gather data from the source process' user
* buffer to the target's buffer. It is expected that the list creation
* and processing all occurs during binder_transaction() so these lists
* are only accessed in local context.
*
* Return: 0=success, else -errno
*/
static int binder_do_deferred_txn_copies(struct binder_alloc *alloc,
struct binder_buffer *buffer,
struct list_head *sgc_head,
struct list_head *pf_head)
{
int ret = 0;
struct binder_sg_copy *sgc, *tmpsgc;
struct binder_ptr_fixup *tmppf;
struct binder_ptr_fixup *pf =
list_first_entry_or_null(pf_head, struct binder_ptr_fixup,
node);
list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) {
size_t bytes_copied = 0;
while (bytes_copied < sgc->length) {
size_t copy_size;
size_t bytes_left = sgc->length - bytes_copied;
size_t offset = sgc->offset + bytes_copied;
/*
* We copy up to the fixup (pointed to by pf)
*/
copy_size = pf ? min(bytes_left, (size_t)pf->offset - offset)
: bytes_left;
if (!ret && copy_size)
ret = binder_alloc_copy_user_to_buffer(
alloc, buffer,
offset,
sgc->sender_uaddr + bytes_copied,
copy_size);
bytes_copied += copy_size;
if (copy_size != bytes_left) {
BUG_ON(!pf);
/* we stopped at a fixup offset */
if (pf->skip_size) {
/*
* we are just skipping. This is for
* BINDER_TYPE_FDA where the translated
* fds will be fixed up when we get
* to target context.
*/
bytes_copied += pf->skip_size;
} else {
/* apply the fixup indicated by pf */
if (!ret)
ret = binder_alloc_copy_to_buffer(
alloc, buffer,
pf->offset,
&pf->fixup_data,
sizeof(pf->fixup_data));
bytes_copied += sizeof(pf->fixup_data);
}
list_del(&pf->node);
kfree(pf);
pf = list_first_entry_or_null(pf_head,
struct binder_ptr_fixup, node);
}
}
list_del(&sgc->node);
kfree(sgc);
}
list_for_each_entry_safe(pf, tmppf, pf_head, node) {
BUG_ON(pf->skip_size == 0);
list_del(&pf->node);
kfree(pf);
}
BUG_ON(!list_empty(sgc_head));
return ret > 0 ? -EINVAL : ret;
}
/**
* binder_cleanup_deferred_txn_lists() - free specified lists
* @sgc_head: list_head of scatter-gather copy list
* @pf_head: list_head of pointer fixup list
*
* Called to clean up @sgc_head and @pf_head if there is an
* error.
*/
static void binder_cleanup_deferred_txn_lists(struct list_head *sgc_head,
struct list_head *pf_head)
{
struct binder_sg_copy *sgc, *tmpsgc;
struct binder_ptr_fixup *pf, *tmppf;
list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) {
list_del(&sgc->node);
kfree(sgc);
}
list_for_each_entry_safe(pf, tmppf, pf_head, node) {
list_del(&pf->node);
kfree(pf);
}
}
/**
* binder_defer_copy() - queue a scatter-gather buffer for copy
* @sgc_head: list_head of scatter-gather copy list
* @offset: binder buffer offset in target process
* @sender_uaddr: user address in source process
* @length: bytes to copy
*
* Specify a scatter-gather block to be copied. The actual copy must
* be deferred until all the needed fixups are identified and queued.
* Then the copy and fixups are done together so un-translated values
* from the source are never visible in the target buffer.
*
* We are guaranteed that repeated calls to this function will have
* monotonically increasing @offset values so the list will naturally
* be ordered.
*
* Return: 0=success, else -errno
*/
static int binder_defer_copy(struct list_head *sgc_head, binder_size_t offset,
const void __user *sender_uaddr, size_t length)
{
struct binder_sg_copy *bc = kzalloc(sizeof(*bc), GFP_KERNEL);
if (!bc)
return -ENOMEM;
bc->offset = offset;
bc->sender_uaddr = sender_uaddr;
bc->length = length;
INIT_LIST_HEAD(&bc->node);
/*
* We are guaranteed that the deferred copies are in-order
* so just add to the tail.
*/
list_add_tail(&bc->node, sgc_head);
return 0;
}
/**
* binder_add_fixup() - queue a fixup to be applied to sg copy
* @pf_head: list_head of binder ptr fixup list
* @offset: binder buffer offset in target process
* @fixup: bytes to be copied for fixup
* @skip_size: bytes to skip when copying (fixup will be applied later)
*
* Add the specified fixup to a list ordered by @offset. When copying
* the scatter-gather buffers, the fixup will be copied instead of
* data from the source buffer. For BINDER_TYPE_FDA fixups, the fixup
* will be applied later (in target process context), so we just skip
* the bytes specified by @skip_size. If @skip_size is 0, we copy the
* value in @fixup.
*
* This function is called *mostly* in @offset order, but there are
* exceptions. Since out-of-order inserts are relatively uncommon,
* we insert the new element by searching backward from the tail of
* the list.
*
* Return: 0=success, else -errno
*/
static int binder_add_fixup(struct list_head *pf_head, binder_size_t offset,
binder_uintptr_t fixup, size_t skip_size)
{
struct binder_ptr_fixup *pf = kzalloc(sizeof(*pf), GFP_KERNEL);
struct binder_ptr_fixup *tmppf;
if (!pf)
return -ENOMEM;
pf->offset = offset;
pf->fixup_data = fixup;
pf->skip_size = skip_size;
INIT_LIST_HEAD(&pf->node);
/* Fixups are *mostly* added in-order, but there are some
* exceptions. Look backwards through list for insertion point.
*/
list_for_each_entry_reverse(tmppf, pf_head, node) {
if (tmppf->offset < pf->offset) {
list_add(&pf->node, &tmppf->node);
return 0;
}
}
/*
* if we get here, then the new offset is the lowest so
* insert at the head
*/
list_add(&pf->node, pf_head);
return 0;
}
static int binder_translate_fd_array(struct list_head *pf_head,
struct binder_fd_array_object *fda,
const void __user *sender_ubuffer,
struct binder_buffer_object *parent,
struct binder_buffer_object *sender_uparent,
struct binder_transaction *t,
struct binder_thread *thread,
struct binder_transaction *in_reply_to)
{
binder_size_t fdi, fd_buf_size;
binder_size_t fda_offset;
const void __user *sender_ufda_base;
struct binder_proc *proc = thread->proc;
struct binder_proc *target_proc = t->to_proc;
int ret;
if (fda->num_fds == 0)
return 0;
fd_buf_size = sizeof(u32) * fda->num_fds;
if (fda->num_fds >= SIZE_MAX / sizeof(u32)) {
......@@ -2653,19 +2916,25 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda,
*/
fda_offset = (parent->buffer - (uintptr_t)t->buffer->user_data) +
fda->parent_offset;
if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32))) {
sender_ufda_base = (void __user *)(uintptr_t)sender_uparent->buffer +
fda->parent_offset;
if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32)) ||
!IS_ALIGNED((unsigned long)sender_ufda_base, sizeof(u32))) {
binder_user_error("%d:%d parent offset not aligned correctly.\n",
proc->pid, thread->pid);
return -EINVAL;
}
ret = binder_add_fixup(pf_head, fda_offset, 0, fda->num_fds * sizeof(u32));
if (ret)
return ret;
for (fdi = 0; fdi < fda->num_fds; fdi++) {
u32 fd;
int ret;
binder_size_t offset = fda_offset + fdi * sizeof(fd);
binder_size_t sender_uoffset = fdi * sizeof(fd);
ret = binder_alloc_copy_from_buffer(&target_proc->alloc,
&fd, t->buffer,
offset, sizeof(fd));
ret = copy_from_user(&fd, sender_ufda_base + sender_uoffset, sizeof(fd));
if (!ret)
ret = binder_translate_fd(fd, offset, t, thread,
in_reply_to);
......@@ -2675,7 +2944,8 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda,
return 0;
}
static int binder_fixup_parent(struct binder_transaction *t,
static int binder_fixup_parent(struct list_head *pf_head,
struct binder_transaction *t,
struct binder_thread *thread,
struct binder_buffer_object *bp,
binder_size_t off_start_offset,
......@@ -2721,14 +2991,7 @@ static int binder_fixup_parent(struct binder_transaction *t,
}
buffer_offset = bp->parent_offset +
(uintptr_t)parent->buffer - (uintptr_t)b->user_data;
if (binder_alloc_copy_to_buffer(&target_proc->alloc, b, buffer_offset,
&bp->buffer, sizeof(bp->buffer))) {
binder_user_error("%d:%d got transaction with invalid parent offset\n",
proc->pid, thread->pid);
return -EINVAL;
}
return 0;
return binder_add_fixup(pf_head, buffer_offset, bp->buffer, 0);
}
/**
......@@ -2848,6 +3111,7 @@ static void binder_transaction(struct binder_proc *proc,
binder_size_t off_start_offset, off_end_offset;
binder_size_t off_min;
binder_size_t sg_buf_offset, sg_buf_end_offset;
binder_size_t user_offset = 0;
struct binder_proc *target_proc = NULL;
struct binder_thread *target_thread = NULL;
struct binder_node *target_node = NULL;
......@@ -2862,6 +3126,12 @@ static void binder_transaction(struct binder_proc *proc,
int t_debug_id = atomic_inc_return(&binder_last_id);
char *secctx = NULL;
u32 secctx_sz = 0;
struct list_head sgc_head;
struct list_head pf_head;
const void __user *user_buffer = (const void __user *)
(uintptr_t)tr->data.ptr.buffer;
INIT_LIST_HEAD(&sgc_head);
INIT_LIST_HEAD(&pf_head);
e = binder_transaction_log_add(&binder_transaction_log);
e->debug_id = t_debug_id;
......@@ -3173,19 +3443,6 @@ static void binder_transaction(struct binder_proc *proc,
t->buffer->clear_on_free = !!(t->flags & TF_CLEAR_BUF);
trace_binder_transaction_alloc_buf(t->buffer);
if (binder_alloc_copy_user_to_buffer(
&target_proc->alloc,
t->buffer, 0,
(const void __user *)
(uintptr_t)tr->data.ptr.buffer,
tr->data_size)) {
binder_user_error("%d:%d got transaction with invalid data ptr\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
return_error_param = -EFAULT;
return_error_line = __LINE__;
goto err_copy_data_failed;
}
if (binder_alloc_copy_user_to_buffer(
&target_proc->alloc,
t->buffer,
......@@ -3230,6 +3487,7 @@ static void binder_transaction(struct binder_proc *proc,
size_t object_size;
struct binder_object object;
binder_size_t object_offset;
binder_size_t copy_size;
if (binder_alloc_copy_from_buffer(&target_proc->alloc,
&object_offset,
......@@ -3241,8 +3499,27 @@ static void binder_transaction(struct binder_proc *proc,
return_error_line = __LINE__;
goto err_bad_offset;
}
object_size = binder_get_object(target_proc, t->buffer,
object_offset, &object);
/*
* Copy the source user buffer up to the next object
* that will be processed.
*/
copy_size = object_offset - user_offset;
if (copy_size && (user_offset > object_offset ||
binder_alloc_copy_user_to_buffer(
&target_proc->alloc,
t->buffer, user_offset,
user_buffer + user_offset,
copy_size))) {
binder_user_error("%d:%d got transaction with invalid data ptr\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
return_error_param = -EFAULT;
return_error_line = __LINE__;
goto err_copy_data_failed;
}
object_size = binder_get_object(target_proc, user_buffer,
t->buffer, object_offset, &object);
if (object_size == 0 || object_offset < off_min) {
binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n",
proc->pid, thread->pid,
......@@ -3254,6 +3531,11 @@ static void binder_transaction(struct binder_proc *proc,
return_error_line = __LINE__;
goto err_bad_offset;
}
/*
* Set offset to the next buffer fragment to be
* copied
*/
user_offset = object_offset + object_size;
hdr = &object.hdr;
off_min = object_offset + object_size;
......@@ -3316,6 +3598,8 @@ static void binder_transaction(struct binder_proc *proc,
case BINDER_TYPE_FDA: {
struct binder_object ptr_object;
binder_size_t parent_offset;
struct binder_object user_object;
size_t user_parent_size;
struct binder_fd_array_object *fda =
to_binder_fd_array_object(hdr);
size_t num_valid = (buffer_offset - off_start_offset) /
......@@ -3347,11 +3631,35 @@ static void binder_transaction(struct binder_proc *proc,
return_error_line = __LINE__;
goto err_bad_parent;
}
ret = binder_translate_fd_array(fda, parent, t, thread,
in_reply_to);
if (ret < 0) {
/*
* We need to read the user version of the parent
* object to get the original user offset
*/
user_parent_size =
binder_get_object(proc, user_buffer, t->buffer,
parent_offset, &user_object);
if (user_parent_size != sizeof(user_object.bbo)) {
binder_user_error("%d:%d invalid ptr object size: %zd vs %zd\n",
proc->pid, thread->pid,
user_parent_size,
sizeof(user_object.bbo));
return_error = BR_FAILED_REPLY;
return_error_param = -EINVAL;
return_error_line = __LINE__;
goto err_bad_parent;
}
ret = binder_translate_fd_array(&pf_head, fda,
user_buffer, parent,
&user_object.bbo, t,
thread, in_reply_to);
if (!ret)
ret = binder_alloc_copy_to_buffer(&target_proc->alloc,
t->buffer,
object_offset,
fda, sizeof(*fda));
if (ret) {
return_error = BR_FAILED_REPLY;
return_error_param = ret;
return_error_param = ret > 0 ? -EINVAL : ret;
return_error_line = __LINE__;
goto err_translate_failed;
}
......@@ -3373,19 +3681,14 @@ static void binder_transaction(struct binder_proc *proc,
return_error_line = __LINE__;
goto err_bad_offset;
}
if (binder_alloc_copy_user_to_buffer(
&target_proc->alloc,
t->buffer,
sg_buf_offset,
(const void __user *)
(uintptr_t)bp->buffer,
bp->length)) {
binder_user_error("%d:%d got transaction with invalid offsets ptr\n",
proc->pid, thread->pid);
return_error_param = -EFAULT;
ret = binder_defer_copy(&sgc_head, sg_buf_offset,
(const void __user *)(uintptr_t)bp->buffer,
bp->length);
if (ret) {
return_error = BR_FAILED_REPLY;
return_error_param = ret;
return_error_line = __LINE__;
goto err_copy_data_failed;
goto err_translate_failed;
}
/* Fixup buffer pointer to target proc address space */
bp->buffer = (uintptr_t)
......@@ -3394,7 +3697,8 @@ static void binder_transaction(struct binder_proc *proc,
num_valid = (buffer_offset - off_start_offset) /
sizeof(binder_size_t);
ret = binder_fixup_parent(t, thread, bp,
ret = binder_fixup_parent(&pf_head, t,
thread, bp,
off_start_offset,
num_valid,
last_fixup_obj_off,
......@@ -3421,6 +3725,30 @@ static void binder_transaction(struct binder_proc *proc,
goto err_bad_object_type;
}
}
/* Done processing objects, copy the rest of the buffer */
if (binder_alloc_copy_user_to_buffer(
&target_proc->alloc,
t->buffer, user_offset,
user_buffer + user_offset,
tr->data_size - user_offset)) {
binder_user_error("%d:%d got transaction with invalid data ptr\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
return_error_param = -EFAULT;
return_error_line = __LINE__;
goto err_copy_data_failed;
}
ret = binder_do_deferred_txn_copies(&target_proc->alloc, t->buffer,
&sgc_head, &pf_head);
if (ret) {
binder_user_error("%d:%d got transaction with invalid offsets ptr\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
return_error_param = ret;
return_error_line = __LINE__;
goto err_copy_data_failed;
}
tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE;
t->work.type = BINDER_WORK_TRANSACTION;
......@@ -3487,6 +3815,7 @@ static void binder_transaction(struct binder_proc *proc,
err_bad_offset:
err_bad_parent:
err_copy_data_failed:
binder_cleanup_deferred_txn_lists(&sgc_head, &pf_head);
binder_free_txn_fixups(t);
trace_binder_transaction_failed_buffer_release(t->buffer);
binder_transaction_buffer_release(target_proc, NULL, t->buffer,
......
......@@ -1771,7 +1771,6 @@ static int nbd_dev_add(int index)
struct gendisk *disk;
struct request_queue *q;
int err = -ENOMEM;
int first_minor = index << part_shift;
nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
if (!nbd)
......@@ -1835,7 +1834,7 @@ static int nbd_dev_add(int index)
refcount_set(&nbd->refs, 1);
INIT_LIST_HEAD(&nbd->list);
disk->major = NBD_MAJOR;
disk->first_minor = first_minor;
disk->first_minor = index << part_shift;
disk->fops = &nbd_fops;
disk->private_data = nbd;
sprintf(disk->disk_name, "nbd%d", index);
......
......@@ -1365,6 +1365,9 @@ __acquires(bitmap->lock)
sector_t csize;
int err;
if (page >= bitmap->pages)
return NULL;
err = md_bitmap_checkpage(bitmap, page, create, 0);
if (bitmap->bp[page].hijacked ||
......
......@@ -3846,35 +3846,51 @@ static int analyze_sbs(struct mddev *mddev)
*/
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
unsigned long result = 0;
long decimals = -1;
while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
if (*cp == '.')
decimals = 0;
else if (decimals < scale) {
unsigned int value;
value = *cp - '0';
result = result * 10 + value;
if (decimals >= 0)
decimals++;
}
cp++;
}
if (*cp == '\n')
cp++;
if (*cp)
unsigned long result = 0, decimals = 0;
char *pos, *str;
int rv;
str = kmemdup_nul(cp, strlen(cp), GFP_KERNEL);
if (!str)
return -ENOMEM;
pos = strchr(str, '.');
if (pos) {
int cnt = scale;
*pos = '\0';
while (isdigit(*(++pos))) {
if (cnt) {
decimals = decimals * 10 + *pos - '0';
cnt--;
}
}
if (*pos == '\n')
pos++;
if (*pos) {
kfree(str);
return -EINVAL;
}
decimals *= int_pow(10, cnt);
}
rv = kstrtoul(str, 10, &result);
kfree(str);
if (rv)
return rv;
if (result > (ULONG_MAX - decimals) / (unsigned int)int_pow(10, scale))
return -EINVAL;
if (decimals < 0)
decimals = 0;
*res = result * int_pow(10, scale - decimals);
return 0;
*res = result * int_pow(10, scale) + decimals;
return rv;
}
static ssize_t
safe_delay_show(struct mddev *mddev, char *page)
{
int msec = (mddev->safemode_delay*1000)/HZ;
return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
}
static ssize_t
safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
......@@ -3888,10 +3904,14 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
return -EINVAL;
if (msec > UINT_MAX)
return -EINVAL;
if (msec == 0)
mddev->safemode_delay = 0;
else {
unsigned long old_delay = mddev->safemode_delay;
/* HZ <= 1000, so new_delay < UINT_MAX, too */
unsigned long new_delay = (msec*HZ)/1000;
if (new_delay == 0)
......@@ -4543,7 +4563,7 @@ __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_stor
static ssize_t
max_corrected_read_errors_show(struct mddev *mddev, char *page) {
return sprintf(page, "%d\n",
return sprintf(page, "%u\n",
atomic_read(&mddev->max_corr_read_errors));
}
......
......@@ -2325,7 +2325,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
int sect = 0; /* Offset from r10_bio->sector */
int sectors = r10_bio->sectors;
struct md_rdev *rdev;
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
unsigned int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[r10_bio->read_slot].devnum;
/* still own a reference to this rdev, so it cannot
......@@ -2344,7 +2344,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
char b[BDEVNAME_SIZE];
bdevname(rdev->bdev, b);
pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %u:max %u]\n",
mdname(mddev), b,
atomic_read(&rdev->read_errors), max_read_errors);
pr_notice("md/raid10:%s: %s: Failing raid device\n",
......
......@@ -32,8 +32,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
......
......@@ -520,7 +520,7 @@ static bool dump_interrupted(void)
* but then we need to teach dump_write() to restart and clear
* TIF_SIGPENDING.
*/
return signal_pending(current);
return fatal_signal_pending(current) || freezing(current);
}
static void wait_for_dump_helpers(struct file *file)
......
......@@ -23,6 +23,8 @@
#include <linux/close_range.h>
#include <net/sock.h>
#include "internal.h"
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
......@@ -829,9 +831,8 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
}
/*
* variant of __close_fd that gets a ref on the file for later fput.
* The caller must ensure that filp_close() called on the file, and then
* an fput().
* See close_fd_get_file() below, this variant assumes current->files->file_lock
* is held.
*/
int __close_fd_get_file(unsigned int fd, struct file **res)
{
......@@ -839,26 +840,39 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
struct file *file;
struct fdtable *fdt;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
if (fd >= fdt->max_fds)
goto out_unlock;
goto out_err;
file = fdt->fd[fd];
if (!file)
goto out_unlock;
goto out_err;
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
get_file(file);
*res = file;
return 0;
out_unlock:
spin_unlock(&files->file_lock);
out_err:
*res = NULL;
return -ENOENT;
}
/*
* variant of close_fd that gets a ref on the file for later fput.
* The caller must ensure that filp_close() called on the file, and then
* an fput().
*/
int close_fd_get_file(unsigned int fd, struct file **res)
{
struct files_struct *files = current->files;
int ret;
spin_lock(&files->file_lock);
ret = __close_fd_get_file(fd, res);
spin_unlock(&files->file_lock);
return ret;
}
void do_close_on_exec(struct files_struct *files)
{
unsigned i;
......
......@@ -77,6 +77,8 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
long do_rmdir(int dfd, struct filename *name);
long do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct path *link);
int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
struct filename *newname, unsigned int flags);
/*
* namespace.c
......@@ -132,6 +134,7 @@ extern struct file *do_file_open_root(const struct path *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
extern int __close_fd_get_file(unsigned int fd, struct file **res);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
......
......@@ -533,6 +533,8 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
p->stack = p->internal;
p->dfd = dfd;
p->name = name;
p->path.mnt = NULL;
p->path.dentry = NULL;
p->total_link_count = old ? old->total_link_count : 0;
p->saved = old;
p->state = 0;
......@@ -607,6 +609,8 @@ static void terminate_walk(struct nameidata *nd)
rcu_read_unlock();
}
nd->depth = 0;
nd->path.mnt = NULL;
nd->path.dentry = NULL;
}
/* path_put is needed afterwards regardless of success or failure */
......@@ -635,6 +639,11 @@ static inline bool legitimize_path(struct nameidata *nd,
static bool legitimize_links(struct nameidata *nd)
{
int i;
if (unlikely(nd->flags & LOOKUP_CACHED)) {
drop_links(nd);
nd->depth = 0;
return false;
}
for (i = 0; i < nd->depth; i++) {
struct saved *last = nd->stack + i;
if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
......@@ -798,6 +807,7 @@ static int complete_walk(struct nameidata *nd)
if (!(nd->state & ND_ROOT_PRESET))
if (!(nd->flags & LOOKUP_IS_SCOPED))
nd->root.mnt = NULL;
nd->flags &= ~LOOKUP_CACHED;
if (!try_to_unlazy(nd))
return -ECHILD;
}
......@@ -2210,6 +2220,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
int error;
const char *s = nd->name->name;
/* LOOKUP_CACHED requires RCU, ask caller to retry */
if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
return ERR_PTR(-EAGAIN);
if (!*s)
flags &= ~LOOKUP_RCU;
if (flags & LOOKUP_RCU)
......@@ -2240,8 +2254,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
}
nd->root.mnt = NULL;
nd->path.mnt = NULL;
nd->path.dentry = NULL;
/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
......@@ -4353,8 +4365,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
EXPORT_SYMBOL(vfs_rename);
static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
const char __user *newname, unsigned int flags)
int do_renameat2(int olddfd, struct filename *from, int newdfd,
struct filename *to, unsigned int flags)
{
struct dentry *old_dentry, *new_dentry;
struct dentry *trap;
......@@ -4362,32 +4374,30 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
struct qstr old_last, new_last;
int old_type, new_type;
struct inode *delegated_inode = NULL;
struct filename *from;
struct filename *to;
unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
bool should_retry = false;
int error;
int error = -EINVAL;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
goto put_both;
if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
(flags & RENAME_EXCHANGE))
return -EINVAL;
goto put_both;
if (flags & RENAME_EXCHANGE)
target_flags = 0;
retry:
from = filename_parentat(olddfd, getname(oldname), lookup_flags,
&old_path, &old_last, &old_type);
from = filename_parentat(olddfd, from, lookup_flags, &old_path,
&old_last, &old_type);
if (IS_ERR(from)) {
error = PTR_ERR(from);
goto exit;
goto put_new;
}
to = filename_parentat(newdfd, getname(newname), lookup_flags,
&new_path, &new_last, &new_type);
to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
&new_type);
if (IS_ERR(to)) {
error = PTR_ERR(to);
goto exit1;
......@@ -4480,34 +4490,40 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
if (retry_estale(error, lookup_flags))
should_retry = true;
path_put(&new_path);
putname(to);
exit1:
path_put(&old_path);
putname(from);
if (should_retry) {
should_retry = false;
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
exit:
put_both:
if (!IS_ERR(from))
putname(from);
put_new:
if (!IS_ERR(to))
putname(to);
return error;
}
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname, unsigned int, flags)
{
return do_renameat2(olddfd, oldname, newdfd, newname, flags);
return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
flags);
}
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname)
{
return do_renameat2(olddfd, oldname, newdfd, newname, 0);
return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
0);
}
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
getname(newname), 0);
}
int readlink_copy(char __user *buffer, int buflen, const char *link)
......
......@@ -1099,6 +1099,12 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
lookup_flags |= LOOKUP_BENEATH;
if (how->resolve & RESOLVE_IN_ROOT)
lookup_flags |= LOOKUP_IN_ROOT;
if (how->resolve & RESOLVE_CACHED) {
/* Don't bother even trying for create/truncate/tmpfile open */
if (flags & (O_TRUNC | O_CREAT | O_TMPFILE))
return -EAGAIN;
lookup_flags |= LOOKUP_CACHED;
}
op->lookup_flags = lookup_flags;
return 0;
......
......@@ -70,7 +70,7 @@
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
_TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
ARCH_EXIT_TO_USER_MODE_WORK)
/**
......@@ -260,12 +260,13 @@ static __always_inline void arch_exit_to_user_mode(void) { }
#endif
/**
* arch_do_signal - Architecture specific signal delivery function
* arch_do_signal_or_restart - Architecture specific signal delivery function
* @regs: Pointer to currents pt_regs
* @has_signal: actual signal to handle
*
* Invoked from exit_to_user_mode_loop().
*/
void arch_do_signal(struct pt_regs *regs);
void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal);
/**
* arch_syscall_exit_tracehook - Wrapper around tracehook_report_syscall_exit()
......
......@@ -15,8 +15,8 @@
# define ARCH_XFER_TO_GUEST_MODE_WORK (0)
#endif
#define XFER_TO_GUEST_MODE_WORK \
(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
#define XFER_TO_GUEST_MODE_WORK \
(_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \
_TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK)
struct kvm_vcpu;
......
......@@ -19,7 +19,7 @@
/* List of all valid flags for the how->resolve argument: */
#define VALID_RESOLVE_FLAGS \
(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
RESOLVE_BENEATH | RESOLVE_IN_ROOT)
RESOLVE_BENEATH | RESOLVE_IN_ROOT | RESOLVE_CACHED)
/* List of all open_how "versions". */
#define OPEN_HOW_SIZE_VER0 24 /* sizeof first published struct */
......
......@@ -124,7 +124,7 @@ extern void __fd_install(struct files_struct *files,
extern int __close_fd(struct files_struct *files,
unsigned int fd);
extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
extern int __close_fd_get_file(unsigned int fd, struct file **res);
extern int close_fd_get_file(unsigned int fd, struct file **res);
extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
struct files_struct **new_fdp);
......
......@@ -22,33 +22,34 @@ struct io_identity {
refcount_t count;
};
#ifdef __GENKSYMS__
struct io_uring_task {
/* submission side */
struct xarray xa;
struct wait_queue_head wait;
struct file *last;
struct percpu_counter inflight;
struct io_identity __identity;
struct io_identity *identity;
atomic_t in_idle;
bool sqpoll;
struct xarray xa;
struct wait_queue_head wait;
struct file *last;
struct percpu_counter inflight;
struct io_identity __identity;
struct io_identity *identity;
atomic_t in_idle;
bool sqpoll;
};
#endif
#if defined(CONFIG_IO_URING)
struct sock *io_uring_get_socket(struct file *file);
void __io_uring_task_cancel(void);
void __io_uring_files_cancel(struct files_struct *files);
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
static inline void io_uring_task_cancel(void)
static inline void io_uring_files_cancel(void)
{
if (current->io_uring && !xa_empty(&current->io_uring->xa))
__io_uring_task_cancel();
if (current->io_uring)
__io_uring_cancel(false);
}
static inline void io_uring_files_cancel(struct files_struct *files)
static inline void io_uring_task_cancel(void)
{
if (current->io_uring && !xa_empty(&current->io_uring->xa))
__io_uring_files_cancel(files);
if (current->io_uring)
__io_uring_cancel(true);
}
static inline void io_uring_free(struct task_struct *tsk)
{
......@@ -63,7 +64,7 @@ static inline struct sock *io_uring_get_socket(struct file *file)
static inline void io_uring_task_cancel(void)
{
}
static inline void io_uring_files_cancel(struct files_struct *files)
static inline void io_uring_files_cancel(void)
{
}
static inline void io_uring_free(struct task_struct *tsk)
......
......@@ -43,6 +43,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
#define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */
#define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */
#define LOOKUP_IN_ROOT 0x100000 /* Treat dirfd as fs root. */
#define LOOKUP_CACHED 0x200000 /* Only do cached lookup */
/* LOOKUP_* flags which do scope-related checks based on the dirfd. */
#define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT)
......
......@@ -1399,7 +1399,7 @@ struct task_struct {
*/
randomized_struct_fields_end
KABI_RESERVE(1)
KABI_USE(1, void *pf_io_worker)
KABI_RESERVE(2)
KABI_RESERVE(3)
KABI_RESERVE(4)
......
......@@ -360,11 +360,23 @@ static inline int restart_syscall(void)
return -ERESTARTNOINTR;
}
static inline int signal_pending(struct task_struct *p)
static inline int task_sigpending(struct task_struct *p)
{
return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
}
static inline int signal_pending(struct task_struct *p)
{
/*
* TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same
* behavior in terms of ensuring that we break out of wait loops
* so that notify signal callbacks can be processed.
*/
if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL)))
return 1;
return task_sigpending(p);
}
static inline int __fatal_signal_pending(struct task_struct *p)
{
return unlikely(sigismember(&p->pending.signal, SIGKILL));
......@@ -372,7 +384,7 @@ static inline int __fatal_signal_pending(struct task_struct *p)
static inline int fatal_signal_pending(struct task_struct *p)
{
return signal_pending(p) && __fatal_signal_pending(p);
return task_sigpending(p) && __fatal_signal_pending(p);
}
static inline int signal_pending_state(long state, struct task_struct *p)
......@@ -509,7 +521,7 @@ extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);
static inline void restore_saved_sigmask_unless(bool interrupted)
{
if (interrupted)
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
WARN_ON(!signal_pending(current));
else
restore_saved_sigmask();
}
......
......@@ -31,6 +31,7 @@ struct kernel_clone_args {
/* Number of elements in *set_tid */
size_t set_tid_size;
int cgroup;
int io_thread;
struct cgroup *cgrp;
struct css_set *cset;
};
......@@ -85,6 +86,7 @@ extern void exit_files(struct task_struct *);
extern void exit_itimers(struct task_struct *);
extern pid_t kernel_clone(struct kernel_clone_args *kargs);
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
struct mm_struct *copy_init_mm(void);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
......
......@@ -421,6 +421,9 @@ extern int __sys_accept4_file(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags,
unsigned long nofile);
extern struct file *do_accept(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_socket(int family, int type, int protocol);
......@@ -436,5 +439,6 @@ extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
int __user *usockaddr_len);
extern int __sys_socketpair(int family, int type, int protocol,
int __user *usockvec);
extern int __sys_shutdown_sock(struct socket *sock, int how);
extern int __sys_shutdown(int fd, int how);
#endif /* _LINUX_SOCKET_H */
......@@ -22,6 +22,8 @@ enum task_work_notify_mode {
int task_work_add(struct task_struct *task, struct callback_head *twork,
enum task_work_notify_mode mode);
struct callback_head *task_work_cancel_match(struct task_struct *task,
bool (*match)(struct callback_head *, void *data), void *data);
struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
void task_work_run(void);
......
......@@ -202,4 +202,27 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
}
/*
* called by exit_to_user_mode_loop() if ti_work & _TIF_NOTIFY_SIGNAL. This
* is currently used by TWA_SIGNAL based task_work, which requires breaking
* wait loops to ensure that task_work is noticed and run.
*/
static inline void tracehook_notify_signal(void)
{
clear_thread_flag(TIF_NOTIFY_SIGNAL);
smp_mb__after_atomic();
if (current->task_works)
task_work_run();
}
/*
* Called when we have work to process from exit_to_user_mode_loop()
*/
static inline void set_notify_signal(struct task_struct *task)
{
if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
!wake_up_state(task, TASK_INTERRUPTIBLE))
kick_process(task);
}
#endif /* <linux/tracehook.h> */
......@@ -26,6 +26,12 @@ enum iter_type {
ITER_DISCARD = 64,
};
struct iov_iter_state {
size_t iov_offset;
size_t count;
unsigned long nr_segs;
};
struct iov_iter {
/*
* Bit 0 is the read/write bit, set if we're writing.
......@@ -55,6 +61,14 @@ static inline enum iter_type iov_iter_type(const struct iov_iter *i)
return i->type & ~(READ | WRITE);
}
static inline void iov_iter_save_state(struct iov_iter *iter,
struct iov_iter_state *state)
{
state->iov_offset = iter->iov_offset;
state->count = iter->count;
state->nr_segs = iter->nr_segs;
}
static inline bool iter_is_iovec(const struct iov_iter *i)
{
return iov_iter_type(i) == ITER_IOVEC;
......@@ -226,6 +240,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
size_t maxsize, size_t *start);
int iov_iter_npages(const struct iov_iter *i, int maxpages);
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
......
......@@ -1334,4 +1334,11 @@ static inline int skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res)
return res->ingress ? netif_receive_skb(skb) : dev_queue_xmit(skb);
}
/* Make sure qdisc is no longer in SCHED state. */
static inline void qdisc_synchronize(const struct Qdisc *q)
{
while (test_bit(__QDISC_STATE_SCHED, &q->state))
msleep(1);
}
#endif
......@@ -12,11 +12,11 @@ struct io_wq_work;
/**
* io_uring_create - called after a new io_uring context was prepared
*
* @fd: corresponding file descriptor
* @ctx: pointer to a ring context structure
* @fd: corresponding file descriptor
* @ctx: pointer to a ring context structure
* @sq_entries: actual SQ size
* @cq_entries: actual CQ size
* @flags: SQ ring flags, provided to io_uring_setup(2)
* @flags: SQ ring flags, provided to io_uring_setup(2)
*
* Allows to trace io_uring creation and provide pointer to a context, that can
* be used later to find correlated events.
......@@ -49,15 +49,15 @@ TRACE_EVENT(io_uring_create,
);
/**
* io_uring_register - called after a buffer/file/eventfd was succesfully
* io_uring_register - called after a buffer/file/eventfd was successfully
* registered for a ring
*
* @ctx: pointer to a ring context structure
* @opcode: describes which operation to perform
* @ctx: pointer to a ring context structure
* @opcode: describes which operation to perform
* @nr_user_files: number of registered files
* @nr_user_bufs: number of registered buffers
* @cq_ev_fd: whether eventfs registered or not
* @ret: return code
* @ret: return code
*
* Allows to trace fixed files/buffers/eventfds, that could be registered to
* avoid an overhead of getting references to them for every operation. This
......@@ -142,16 +142,16 @@ TRACE_EVENT(io_uring_queue_async_work,
TP_ARGS(ctx, rw, req, work, flags),
TP_STRUCT__entry (
__field( void *, ctx )
__field( int, rw )
__field( void *, req )
__field( void *, ctx )
__field( int, rw )
__field( void *, req )
__field( struct io_wq_work *, work )
__field( unsigned int, flags )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->rw = rw;
__entry->rw = rw;
__entry->req = req;
__entry->work = work;
__entry->flags = flags;
......@@ -196,10 +196,10 @@ TRACE_EVENT(io_uring_defer,
/**
* io_uring_link - called before the io_uring request added into link_list of
* another request
* another request
*
* @ctx: pointer to a ring context structure
* @req: pointer to a linked request
* @ctx: pointer to a ring context structure
* @req: pointer to a linked request
* @target_req: pointer to a previous request, that would contain @req
*
* Allows to track linked requests, to understand dependencies between requests
......@@ -212,8 +212,8 @@ TRACE_EVENT(io_uring_link,
TP_ARGS(ctx, req, target_req),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( void *, ctx )
__field( void *, req )
__field( void *, target_req )
),
......@@ -244,7 +244,7 @@ TRACE_EVENT(io_uring_cqring_wait,
TP_ARGS(ctx, min_events),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, ctx )
__field( int, min_events )
),
......@@ -272,7 +272,7 @@ TRACE_EVENT(io_uring_fail_link,
TP_ARGS(req, link),
TP_STRUCT__entry (
__field( void *, req )
__field( void *, req )
__field( void *, link )
),
......@@ -290,38 +290,42 @@ TRACE_EVENT(io_uring_fail_link,
* @ctx: pointer to a ring context structure
* @user_data: user data associated with the request
* @res: result of the request
* @cflags: completion flags
*
*/
TRACE_EVENT(io_uring_complete,
TP_PROTO(void *ctx, u64 user_data, long res),
TP_PROTO(void *ctx, u64 user_data, int res, unsigned cflags),
TP_ARGS(ctx, user_data, res),
TP_ARGS(ctx, user_data, res, cflags),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u64, user_data )
__field( long, res )
__field( int, res )
__field( unsigned, cflags )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->user_data = user_data;
__entry->res = res;
__entry->cflags = cflags;
),
TP_printk("ring %p, user_data 0x%llx, result %ld",
TP_printk("ring %p, user_data 0x%llx, result %d, cflags %x",
__entry->ctx, (unsigned long long)__entry->user_data,
__entry->res)
__entry->res, __entry->cflags)
);
/**
* io_uring_submit_sqe - called before submitting one SQE
*
* @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
* @opcode: opcode of request
* @user_data: user data associated with the request
* @flags request flags
* @force_nonblock: whether a context blocking or not
* @sq_thread: true if sq_thread has submitted this SQE
*
......@@ -330,41 +334,60 @@ TRACE_EVENT(io_uring_complete,
*/
TRACE_EVENT(io_uring_submit_sqe,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock,
bool sq_thread),
TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data, u32 flags,
bool force_nonblock, bool sq_thread),
TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread),
TP_ARGS(ctx, req, opcode, user_data, flags, force_nonblock, sq_thread),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( u8, opcode )
__field( u64, user_data )
__field( u32, flags )
__field( bool, force_nonblock )
__field( bool, sq_thread )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->flags = flags;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = sq_thread;
),
TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->force_nonblock, __entry->sq_thread)
TP_printk("ring %p, req %p, op %d, data 0x%llx, flags %u, "
"non block %d, sq_thread %d", __entry->ctx, __entry->req,
__entry->opcode, (unsigned long long)__entry->user_data,
__entry->flags, __entry->force_nonblock, __entry->sq_thread)
);
/*
* io_uring_poll_arm - called after arming a poll wait if successful
*
* @ctx: pointer to a ring context structure
* @req: pointer to the armed request
* @opcode: opcode of request
* @user_data: user data associated with the request
* @mask: request poll events mask
* @events: registered events of interest
*
* Allows to track which fds are waiting for and what are the events of
* interest.
*/
TRACE_EVENT(io_uring_poll_arm,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events),
TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data,
int mask, int events),
TP_ARGS(ctx, opcode, user_data, mask, events),
TP_ARGS(ctx, req, opcode, user_data, mask, events),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
......@@ -373,16 +396,17 @@ TRACE_EVENT(io_uring_poll_arm,
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
__entry->events = events;
),
TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask, __entry->events)
TP_printk("ring %p, req %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
__entry->ctx, __entry->req, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask, __entry->events)
);
TRACE_EVENT(io_uring_poll_wake,
......@@ -437,27 +461,40 @@ TRACE_EVENT(io_uring_task_add,
__entry->mask)
);
/*
* io_uring_task_run - called when task_work_run() executes the poll events
* notification callbacks
*
* @ctx: pointer to a ring context structure
* @req: pointer to the armed request
* @opcode: opcode of request
* @user_data: user data associated with the request
*
* Allows to track when notified poll events are processed
*/
TRACE_EVENT(io_uring_task_run,
TP_PROTO(void *ctx, u8 opcode, u64 user_data),
TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data),
TP_ARGS(ctx, opcode, user_data),
TP_ARGS(ctx, req, opcode, user_data),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( u8, opcode )
__field( u64, user_data )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->opcode = opcode;
__entry->user_data = user_data;
),
TP_printk("ring %p, op %d, data 0x%llx",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data)
TP_printk("ring %p, req %p, op %d, data 0x%llx",
__entry->ctx, __entry->req, __entry->opcode,
(unsigned long long) __entry->user_data)
);
#endif /* _TRACE_IO_URING_H */
......
......@@ -42,23 +42,25 @@ struct io_uring_sqe {
__u32 statx_flags;
__u32 fadvise_advice;
__u32 splice_flags;
__u32 rename_flags;
__u32 unlink_flags;
__u32 hardlink_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
union {
struct {
/* pack this to avoid bogus arm OABI complaints */
union {
/* index into fixed buffers, if used */
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
} __attribute__((packed));
/* personality to use, if used */
__u16 personality;
__s32 splice_fd_in;
};
__u64 __pad2[3];
/* index into fixed buffers, if used */
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
} __attribute__((packed));
/* personality to use, if used */
__u16 personality;
union {
__s32 splice_fd_in;
__u32 file_index;
};
__u64 __pad2[2];
};
enum {
......@@ -132,6 +134,9 @@ enum {
IORING_OP_PROVIDE_BUFFERS,
IORING_OP_REMOVE_BUFFERS,
IORING_OP_TEE,
IORING_OP_SHUTDOWN,
IORING_OP_RENAMEAT,
IORING_OP_UNLINKAT,
/* this goes last, obviously */
IORING_OP_LAST,
......@@ -145,14 +150,34 @@ enum {
/*
* sqe->timeout_flags
*/
#define IORING_TIMEOUT_ABS (1U << 0)
#define IORING_TIMEOUT_ABS (1U << 0)
#define IORING_TIMEOUT_UPDATE (1U << 1)
#define IORING_TIMEOUT_BOOTTIME (1U << 2)
#define IORING_TIMEOUT_REALTIME (1U << 3)
#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
* sqe->splice_flags
* extends splice(2) flags
*/
#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */
/*
* POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
* command flags for POLL_ADD are stored in sqe->len.
*
* IORING_POLL_ADD_MULTI Multishot poll. Sets IORING_CQE_F_MORE if
* the poll handler will continue to report
* CQEs on behalf of the same SQE.
*
* IORING_POLL_UPDATE Update existing poll request, matching
* sqe->addr as the old user_data field.
*/
#define IORING_POLL_ADD_MULTI (1U << 0)
#define IORING_POLL_UPDATE_EVENTS (1U << 1)
#define IORING_POLL_UPDATE_USER_DATA (1U << 2)
/*
* IO completion data structure (Completion Queue Entry)
*/
......@@ -166,8 +191,10 @@ struct io_uring_cqe {
* cqe->flags
*
* IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
* IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries
*/
#define IORING_CQE_F_BUFFER (1U << 0)
#define IORING_CQE_F_MORE (1U << 1)
enum {
IORING_CQE_BUFFER_SHIFT = 16,
......@@ -226,6 +253,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_GETEVENTS (1U << 0)
#define IORING_ENTER_SQ_WAKEUP (1U << 1)
#define IORING_ENTER_SQ_WAIT (1U << 2)
#define IORING_ENTER_EXT_ARG (1U << 3)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
......@@ -253,6 +281,10 @@ struct io_uring_params {
#define IORING_FEAT_CUR_PERSONALITY (1U << 4)
#define IORING_FEAT_FAST_POLL (1U << 5)
#define IORING_FEAT_POLL_32BITS (1U << 6)
#define IORING_FEAT_SQPOLL_NONFIXED (1U << 7)
#define IORING_FEAT_EXT_ARG (1U << 8)
#define IORING_FEAT_NATIVE_WORKERS (1U << 9)
#define IORING_FEAT_RSRC_TAGS (1U << 10)
/*
* io_uring_register(2) opcodes and arguments
......@@ -272,16 +304,62 @@ enum {
IORING_REGISTER_RESTRICTIONS = 11,
IORING_REGISTER_ENABLE_RINGS = 12,
/* extended with tagging */
IORING_REGISTER_FILES2 = 13,
IORING_REGISTER_FILES_UPDATE2 = 14,
IORING_REGISTER_BUFFERS2 = 15,
IORING_REGISTER_BUFFERS_UPDATE = 16,
/* set/clear io-wq thread affinities */
IORING_REGISTER_IOWQ_AFF = 17,
IORING_UNREGISTER_IOWQ_AFF = 18,
/* set/get max number of io-wq workers */
IORING_REGISTER_IOWQ_MAX_WORKERS = 19,
/* this goes last */
IORING_REGISTER_LAST
};
/* io-wq worker categories */
enum {
IO_WQ_BOUND,
IO_WQ_UNBOUND,
};
/* deprecated, see struct io_uring_rsrc_update */
struct io_uring_files_update {
__u32 offset;
__u32 resv;
__aligned_u64 /* __s32 * */ fds;
};
struct io_uring_rsrc_register {
__u32 nr;
__u32 resv;
__u64 resv2;
__aligned_u64 data;
__aligned_u64 tags;
};
struct io_uring_rsrc_update {
__u32 offset;
__u32 resv;
__aligned_u64 data;
};
struct io_uring_rsrc_update2 {
__u32 offset;
__u32 resv;
__aligned_u64 data;
__aligned_u64 tags;
__u32 nr;
__u32 resv2;
};
/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP (-2)
#define IO_URING_OP_SUPPORTED (1U << 0)
struct io_uring_probe_op {
......@@ -329,4 +407,11 @@ enum {
IORING_RESTRICTION_LAST
};
struct io_uring_getevents_arg {
__u64 sigmask;
__u32 sigmask_sz;
__u32 pad;
__u64 ts;
};
#endif
......@@ -35,5 +35,9 @@ struct open_how {
#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".."
be scoped inside the dirfd
(similar to chroot(2)). */
#define RESOLVE_CACHED 0x20 /* Only complete if resolution can be
completed through cached lookup. May
return -EAGAIN if that's not
possible. */
#endif /* _UAPI_LINUX_OPENAT2_H */
# SPDX-License-Identifier: GPL-2.0
#
# Makefile for io_uring
obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_IO_WQ) += io-wq.o
......@@ -9,19 +9,13 @@
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/sched/signal.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/rculist_nulls.h>
#include <linux/fs_struct.h>
#include <linux/task_work.h>
#include <linux/blk-cgroup.h>
#include <linux/audit.h>
#include <linux/cpu.h>
#include <linux/tracehook.h>
#include <uapi/linux/io_uring.h>
#include "../kernel/sched/sched.h"
#include "io-wq.h"
#define WORKER_IDLE_TIMEOUT (5 * HZ)
......@@ -30,18 +24,15 @@ enum {
IO_WORKER_F_UP = 1, /* up and active */
IO_WORKER_F_RUNNING = 2, /* account as running */
IO_WORKER_F_FREE = 4, /* worker on free list */
IO_WORKER_F_FIXED = 8, /* static idle worker */
IO_WORKER_F_BOUND = 16, /* is doing bounded work */
IO_WORKER_F_BOUND = 8, /* is doing bounded work */
};
enum {
IO_WQ_BIT_EXIT = 0, /* wq exiting */
IO_WQ_BIT_CANCEL = 1, /* cancel work on list */
IO_WQ_BIT_ERROR = 2, /* error on setup */
};
enum {
IO_WQE_FLAG_STALLED = 1, /* stalled on hash */
IO_ACCT_STALLED_BIT = 0, /* stalled on hash */
};
/*
......@@ -58,16 +49,16 @@ struct io_worker {
struct io_wq_work *cur_work;
spinlock_t lock;
struct rcu_head rcu;
struct mm_struct *mm;
#ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state *blkcg_css;
#endif
const struct cred *cur_creds;
const struct cred *saved_creds;
struct files_struct *restore_files;
struct nsproxy *restore_nsproxy;
struct fs_struct *restore_fs;
struct completion ref_done;
unsigned long create_state;
struct callback_head create_work;
int create_index;
union {
struct rcu_head rcu;
struct work_struct work;
};
};
#if BITS_PER_LONG == 64
......@@ -81,57 +72,77 @@ struct io_worker {
struct io_wqe_acct {
unsigned nr_workers;
unsigned max_workers;
int index;
atomic_t nr_running;
struct io_wq_work_list work_list;
unsigned long flags;
};
enum {
IO_WQ_ACCT_BOUND,
IO_WQ_ACCT_UNBOUND,
IO_WQ_ACCT_NR,
};
/*
* Per-node worker thread pool
*/
struct io_wqe {
struct {
raw_spinlock_t lock;
struct io_wq_work_list work_list;
unsigned long hash_map;
unsigned flags;
} ____cacheline_aligned_in_smp;
raw_spinlock_t lock;
struct io_wqe_acct acct[2];
int node;
struct io_wqe_acct acct[2];
struct hlist_nulls_head free_list;
struct list_head all_list;
struct wait_queue_entry wait;
struct io_wq *wq;
struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
cpumask_var_t cpu_mask;
};
/*
* Per io_wq state
*/
struct io_wq {
struct io_wqe **wqes;
unsigned long state;
free_work_fn *free_work;
io_wq_work_fn *do_work;
struct task_struct *manager;
struct user_struct *user;
refcount_t refs;
struct completion done;
struct io_wq_hash *hash;
atomic_t worker_refs;
struct completion worker_done;
struct hlist_node cpuhp_node;
refcount_t use_refs;
struct task_struct *task;
struct io_wqe *wqes[];
};
static enum cpuhp_state io_wq_online;
struct io_cb_cancel_data {
work_cancel_fn *fn;
void *data;
int nr_running;
int nr_pending;
bool cancel_all;
};
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
static void io_wqe_dec_running(struct io_worker *worker);
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match);
static void create_worker_cb(struct callback_head *cb);
static void io_wq_cancel_tw_create(struct io_wq *wq);
static bool io_worker_get(struct io_worker *worker)
{
return refcount_inc_not_zero(&worker->ref);
......@@ -140,148 +151,126 @@ static bool io_worker_get(struct io_worker *worker)
static void io_worker_release(struct io_worker *worker)
{
if (refcount_dec_and_test(&worker->ref))
wake_up_process(worker->task);
complete(&worker->ref_done);
}
/*
* Note: drops the wqe->lock if returning true! The caller must re-acquire
* the lock in that case. Some callers need to restart handling if this
* happens, so we can't just re-acquire the lock on behalf of the caller.
*/
static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
static inline struct io_wqe_acct *io_get_acct(struct io_wqe *wqe, bool bound)
{
bool dropped_lock = false;
if (worker->saved_creds) {
revert_creds(worker->saved_creds);
worker->cur_creds = worker->saved_creds = NULL;
}
if (current->files != worker->restore_files) {
__acquire(&wqe->lock);
raw_spin_unlock_irq(&wqe->lock);
dropped_lock = true;
task_lock(current);
current->files = worker->restore_files;
current->nsproxy = worker->restore_nsproxy;
task_unlock(current);
}
return &wqe->acct[bound ? IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND];
}
if (current->fs != worker->restore_fs)
current->fs = worker->restore_fs;
static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
struct io_wq_work *work)
{
return io_get_acct(wqe, !(work->flags & IO_WQ_WORK_UNBOUND));
}
/*
* If we have an active mm, we need to drop the wq lock before unusing
* it. If we do, return true and let the caller retry the idle loop.
*/
if (worker->mm) {
if (!dropped_lock) {
__acquire(&wqe->lock);
raw_spin_unlock_irq(&wqe->lock);
dropped_lock = true;
}
__set_current_state(TASK_RUNNING);
kthread_unuse_mm(worker->mm);
mmput(worker->mm);
worker->mm = NULL;
}
static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker)
{
return io_get_acct(worker->wqe, worker->flags & IO_WORKER_F_BOUND);
}
#ifdef CONFIG_BLK_CGROUP
if (worker->blkcg_css) {
kthread_associate_blkcg(NULL);
worker->blkcg_css = NULL;
}
#endif
if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
return dropped_lock;
static void io_worker_ref_put(struct io_wq *wq)
{
if (atomic_dec_and_test(&wq->worker_refs))
complete(&wq->worker_done);
}
static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
struct io_wq_work *work)
static void io_worker_cancel_cb(struct io_worker *worker)
{
if (work->flags & IO_WQ_WORK_UNBOUND)
return &wqe->acct[IO_WQ_ACCT_UNBOUND];
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
return &wqe->acct[IO_WQ_ACCT_BOUND];
atomic_dec(&acct->nr_running);
raw_spin_lock(&worker->wqe->lock);
acct->nr_workers--;
raw_spin_unlock(&worker->wqe->lock);
io_worker_ref_put(wq);
clear_bit_unlock(0, &worker->create_state);
io_worker_release(worker);
}
static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe,
struct io_worker *worker)
static bool io_task_worker_match(struct callback_head *cb, void *data)
{
if (worker->flags & IO_WORKER_F_BOUND)
return &wqe->acct[IO_WQ_ACCT_BOUND];
struct io_worker *worker;
return &wqe->acct[IO_WQ_ACCT_UNBOUND];
if (cb->func != create_worker_cb)
return false;
worker = container_of(cb, struct io_worker, create_work);
return worker == data;
}
static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
struct io_wq *wq = wqe->wq;
/*
* If we're not at zero, someone else is holding a brief reference
* to the worker. Wait for that to go away.
*/
set_current_state(TASK_INTERRUPTIBLE);
if (!refcount_dec_and_test(&worker->ref))
schedule();
__set_current_state(TASK_RUNNING);
while (1) {
struct callback_head *cb = task_work_cancel_match(wq->task,
io_task_worker_match, worker);
if (!cb)
break;
io_worker_cancel_cb(worker);
}
if (refcount_dec_and_test(&worker->ref))
complete(&worker->ref_done);
wait_for_completion(&worker->ref_done);
raw_spin_lock(&wqe->lock);
if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
preempt_disable();
current->flags &= ~PF_IO_WORKER;
if (worker->flags & IO_WORKER_F_RUNNING)
atomic_dec(&acct->nr_running);
if (!(worker->flags & IO_WORKER_F_BOUND))
atomic_dec(&wqe->wq->user->processes);
io_wqe_dec_running(worker);
worker->flags = 0;
current->flags &= ~PF_IO_WORKER;
preempt_enable();
raw_spin_lock_irq(&wqe->lock);
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
if (__io_worker_unuse(wqe, worker)) {
__release(&wqe->lock);
raw_spin_lock_irq(&wqe->lock);
}
acct->nr_workers--;
raw_spin_unlock_irq(&wqe->lock);
raw_spin_unlock(&wqe->lock);
kfree_rcu(worker, rcu);
if (refcount_dec_and_test(&wqe->wq->refs))
complete(&wqe->wq->done);
io_worker_ref_put(wqe->wq);
do_exit(0);
}
static inline bool io_wqe_run_queue(struct io_wqe *wqe)
__must_hold(wqe->lock)
static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
{
if (!wq_list_empty(&wqe->work_list) &&
!(wqe->flags & IO_WQE_FLAG_STALLED))
if (!wq_list_empty(&acct->work_list) &&
!test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
return true;
return false;
}
/*
* Check head of free list for an available worker. If one isn't available,
* caller must wake up the wq manager to create one.
* caller must create one.
*/
static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
static bool io_wqe_activate_free_worker(struct io_wqe *wqe,
struct io_wqe_acct *acct)
__must_hold(RCU)
{
struct hlist_nulls_node *n;
struct io_worker *worker;
n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
if (is_a_nulls(n))
return false;
worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
if (io_worker_get(worker)) {
wake_up_process(worker->task);
/*
* Iterate free_list and see if we can find an idle worker to
* activate. If a given worker is on the free_list but in the process
* of exiting, keep trying.
*/
hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
if (!io_worker_get(worker))
continue;
if (io_wqe_get_acct(worker) != acct) {
io_worker_release(worker);
continue;
}
if (wake_up_process(worker->task)) {
io_worker_release(worker);
return true;
}
io_worker_release(worker);
return true;
}
return false;
......@@ -289,12 +278,10 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
/*
* We need a worker. If we find a free one, we're good. If not, and we're
* below the max number of workers, wake up the manager to create one.
* below the max number of workers, create one.
*/
static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
{
bool ret;
/*
* Most likely an attempt to queue unbounded work on an io_wq that
* wasn't setup with any unbounded workers.
......@@ -302,41 +289,116 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
if (unlikely(!acct->max_workers))
pr_warn_once("io-wq is not configured for unbound workers");
rcu_read_lock();
ret = io_wqe_activate_free_worker(wqe);
rcu_read_unlock();
if (!ret && acct->nr_workers < acct->max_workers)
wake_up_process(wqe->wq->manager);
raw_spin_lock(&wqe->lock);
if (acct->nr_workers >= acct->max_workers) {
raw_spin_unlock(&wqe->lock);
return true;
}
acct->nr_workers++;
raw_spin_unlock(&wqe->lock);
atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs);
return create_io_worker(wqe->wq, wqe, acct->index);
}
static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker)
static void io_wqe_inc_running(struct io_worker *worker)
{
struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
atomic_inc(&acct->nr_running);
}
static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker)
__must_hold(wqe->lock)
static void create_worker_cb(struct callback_head *cb)
{
struct io_worker *worker;
struct io_wq *wq;
struct io_wqe *wqe;
struct io_wqe_acct *acct;
bool do_create = false;
worker = container_of(cb, struct io_worker, create_work);
wqe = worker->wqe;
wq = wqe->wq;
acct = &wqe->acct[worker->create_index];
raw_spin_lock(&wqe->lock);
if (acct->nr_workers < acct->max_workers) {
acct->nr_workers++;
do_create = true;
}
raw_spin_unlock(&wqe->lock);
if (do_create) {
create_io_worker(wq, wqe, worker->create_index);
} else {
atomic_dec(&acct->nr_running);
io_worker_ref_put(wq);
}
clear_bit_unlock(0, &worker->create_state);
io_worker_release(worker);
}
static bool io_queue_worker_create(struct io_worker *worker,
struct io_wqe_acct *acct,
task_work_func_t func)
{
struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe))
io_wqe_wake_worker(wqe, acct);
/* raced with exit, just ignore create call */
if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
goto fail;
if (!io_worker_get(worker))
goto fail;
/*
* create_state manages ownership of create_work/index. We should
* only need one entry per worker, as the worker going to sleep
* will trigger the condition, and waking will clear it once it
* runs the task_work.
*/
if (test_bit(0, &worker->create_state) ||
test_and_set_bit_lock(0, &worker->create_state))
goto fail_release;
atomic_inc(&wq->worker_refs);
init_task_work(&worker->create_work, func);
worker->create_index = acct->index;
if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
/*
* EXIT may have been set after checking it above, check after
* adding the task_work and remove any creation item if it is
* now set. wq exit does that too, but we can have added this
* work item after we canceled in io_wq_exit_workers().
*/
if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
io_wq_cancel_tw_create(wq);
io_worker_ref_put(wq);
return true;
}
io_worker_ref_put(wq);
clear_bit_unlock(0, &worker->create_state);
fail_release:
io_worker_release(worker);
fail:
atomic_dec(&acct->nr_running);
io_worker_ref_put(wq);
return false;
}
static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
static void io_wqe_dec_running(struct io_worker *worker)
__must_hold(wqe->lock)
{
allow_kernel_signal(SIGINT);
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
current->flags |= PF_IO_WORKER;
if (!(worker->flags & IO_WORKER_F_UP))
return;
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
worker->restore_files = current->files;
worker->restore_nsproxy = current->nsproxy;
worker->restore_fs = current->fs;
io_wqe_inc_running(wqe, worker);
if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs);
raw_spin_unlock(&wqe->lock);
io_queue_worker_create(worker, acct, create_worker_cb);
raw_spin_lock(&wqe->lock);
}
}
/*
......@@ -347,34 +409,10 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
struct io_wq_work *work)
__must_hold(wqe->lock)
{
bool worker_bound, work_bound;
if (worker->flags & IO_WORKER_F_FREE) {
worker->flags &= ~IO_WORKER_F_FREE;
hlist_nulls_del_init_rcu(&worker->nulls_node);
}
/*
* If worker is moving from bound to unbound (or vice versa), then
* ensure we update the running accounting.
*/
worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
if (worker_bound != work_bound) {
io_wqe_dec_running(wqe, worker);
if (work_bound) {
worker->flags |= IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
atomic_dec(&wqe->wq->user->processes);
} else {
worker->flags &= ~IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
atomic_inc(&wqe->wq->user->processes);
}
io_wqe_inc_running(wqe, worker);
}
}
/*
......@@ -384,15 +422,13 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
* retry the loop in that case (we changed task state), we don't regrab
* the lock if we return success.
*/
static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
__must_hold(wqe->lock)
{
if (!(worker->flags & IO_WORKER_F_FREE)) {
worker->flags |= IO_WORKER_F_FREE;
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
}
return __io_worker_unuse(wqe, worker);
}
static inline unsigned int io_get_work_hash(struct io_wq_work *work)
......@@ -400,130 +436,102 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work)
return work->flags >> IO_WQ_HASH_SHIFT;
}
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
{
struct io_wq *wq = wqe->wq;
bool ret = false;
spin_lock_irq(&wq->hash->wait.lock);
if (list_empty(&wqe->wait.entry)) {
__add_wait_queue(&wq->hash->wait, &wqe->wait);
if (!test_bit(hash, &wq->hash->map)) {
__set_current_state(TASK_RUNNING);
list_del_init(&wqe->wait.entry);
ret = true;
}
}
spin_unlock_irq(&wq->hash->wait.lock);
return ret;
}
static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
struct io_worker *worker)
__must_hold(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work, *tail;
unsigned int hash;
unsigned int stall_hash = -1U;
struct io_wqe *wqe = worker->wqe;
wq_list_for_each(node, prev, &acct->work_list) {
unsigned int hash;
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
/* not hashed, can run anytime */
if (!io_wq_is_hashed(work)) {
wq_list_del(&wqe->work_list, node, prev);
wq_list_del(&acct->work_list, node, prev);
return work;
}
/* hashed, can run if not already running */
hash = io_get_work_hash(work);
if (!(wqe->hash_map & BIT(hash))) {
wqe->hash_map |= BIT(hash);
/* all items with this hash lie in [work, tail] */
tail = wqe->hash_tail[hash];
/* all items with this hash lie in [work, tail] */
tail = wqe->hash_tail[hash];
/* hashed, can run if not already running */
if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
wqe->hash_tail[hash] = NULL;
wq_list_cut(&wqe->work_list, &tail->list, prev);
wq_list_cut(&acct->work_list, &tail->list, prev);
return work;
}
if (stall_hash == -1U)
stall_hash = hash;
/* fast forward to a next hash, for-each will fix up @prev */
node = &tail->list;
}
return NULL;
}
static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
{
if (worker->mm) {
kthread_unuse_mm(worker->mm);
mmput(worker->mm);
worker->mm = NULL;
}
if (mmget_not_zero(work->identity->mm)) {
kthread_use_mm(work->identity->mm);
worker->mm = work->identity->mm;
return;
}
if (stall_hash != -1U) {
bool unstalled;
/* failed grabbing mm, ensure work gets cancelled */
work->flags |= IO_WQ_WORK_CANCEL;
}
static inline void io_wq_switch_blkcg(struct io_worker *worker,
struct io_wq_work *work)
{
#ifdef CONFIG_BLK_CGROUP
if (!(work->flags & IO_WQ_WORK_BLKCG))
return;
if (work->identity->blkcg_css != worker->blkcg_css) {
kthread_associate_blkcg(work->identity->blkcg_css);
worker->blkcg_css = work->identity->blkcg_css;
/*
* Set this before dropping the lock to avoid racing with new
* work being added and clearing the stalled bit.
*/
set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
raw_spin_unlock(&wqe->lock);
unstalled = io_wait_on_hash(wqe, stall_hash);
raw_spin_lock(&wqe->lock);
if (unstalled) {
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
if (wq_has_sleeper(&wqe->wq->hash->wait))
wake_up(&wqe->wq->hash->wait);
}
}
#endif
}
static void io_wq_switch_creds(struct io_worker *worker,
struct io_wq_work *work)
{
const struct cred *old_creds = override_creds(work->identity->creds);
worker->cur_creds = work->identity->creds;
if (worker->saved_creds)
put_cred(old_creds); /* creds set by previous switch */
else
worker->saved_creds = old_creds;
return NULL;
}
static void io_impersonate_work(struct io_worker *worker,
struct io_wq_work *work)
static bool io_flush_signals(void)
{
if ((work->flags & IO_WQ_WORK_FILES) &&
current->files != work->identity->files) {
task_lock(current);
current->files = work->identity->files;
current->nsproxy = work->identity->nsproxy;
task_unlock(current);
if (!work->identity->files) {
/* failed grabbing files, ensure work gets cancelled */
work->flags |= IO_WQ_WORK_CANCEL;
}
if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) {
__set_current_state(TASK_RUNNING);
tracehook_notify_signal();
return true;
}
if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
current->fs = work->identity->fs;
if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm)
io_wq_switch_mm(worker, work);
if ((work->flags & IO_WQ_WORK_CREDS) &&
worker->cur_creds != work->identity->creds)
io_wq_switch_creds(worker, work);
if (work->flags & IO_WQ_WORK_FSIZE)
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize;
else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
io_wq_switch_blkcg(worker, work);
#ifdef CONFIG_AUDIT
current->loginuid = work->identity->loginuid;
current->sessionid = work->identity->sessionid;
#endif
return false;
}
static void io_assign_current_work(struct io_worker *worker,
struct io_wq_work *work)
{
if (work) {
/* flush pending signals before assigning new work */
if (signal_pending(current))
flush_signals(current);
io_flush_signals();
cond_resched();
}
#ifdef CONFIG_AUDIT
current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET);
current->sessionid = AUDIT_SID_UNSET;
#endif
spin_lock_irq(&worker->lock);
spin_lock(&worker->lock);
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
spin_unlock(&worker->lock);
}
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
......@@ -531,8 +539,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
do {
struct io_wq_work *work;
......@@ -544,101 +554,108 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
work = io_get_next_work(wqe);
work = io_get_next_work(acct, worker);
if (work)
__io_worker_busy(wqe, worker, work);
else if (!wq_list_empty(&wqe->work_list))
wqe->flags |= IO_WQE_FLAG_STALLED;
raw_spin_unlock_irq(&wqe->lock);
raw_spin_unlock(&wqe->lock);
if (!work)
break;
io_assign_current_work(worker, work);
__set_current_state(TASK_RUNNING);
/* handle a whole dependent link */
do {
struct io_wq_work *old_work, *next_hashed, *linked;
struct io_wq_work *next_hashed, *linked;
unsigned int hash = io_get_work_hash(work);
next_hashed = wq_next_work(work);
io_impersonate_work(worker, work);
/*
* OK to set IO_WQ_WORK_CANCEL even for uncancellable
* work, the worker function will do the right thing.
*/
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
old_work = work;
linked = wq->do_work(work);
if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND))
work->flags |= IO_WQ_WORK_CANCEL;
wq->do_work(work);
io_assign_current_work(worker, NULL);
linked = wq->free_work(work);
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
work = linked;
linked = NULL;
}
io_assign_current_work(worker, work);
if (current->fs != worker->restore_fs)
current->fs = worker->restore_fs;
wq->free_work(old_work);
if (linked)
io_wqe_enqueue(wqe, linked);
if (hash != -1U && !next_hashed) {
raw_spin_lock_irq(&wqe->lock);
wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* serialize hash clear with wake_up() */
spin_lock_irq(&wq->hash->wait.lock);
clear_bit(hash, &wq->hash->map);
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
spin_unlock_irq(&wq->hash->wait.lock);
if (wq_has_sleeper(&wq->hash->wait))
wake_up(&wq->hash->wait);
raw_spin_lock(&wqe->lock);
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
goto get_next;
raw_spin_unlock_irq(&wqe->lock);
raw_spin_unlock(&wqe->lock);
}
} while (work);
raw_spin_lock_irq(&wqe->lock);
raw_spin_lock(&wqe->lock);
} while (1);
}
static int io_wqe_worker(void *data)
{
struct io_worker *worker = data;
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
bool last_timeout = false;
char buf[TASK_COMM_LEN];
io_worker_start(wqe, worker);
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
set_task_comm(current, buf);
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
long ret;
set_current_state(TASK_INTERRUPTIBLE);
loop:
raw_spin_lock_irq(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
__set_current_state(TASK_RUNNING);
raw_spin_lock(&wqe->lock);
if (io_acct_run_queue(acct)) {
io_worker_handle_work(worker);
goto loop;
}
/* drops the lock on success, retry */
if (__io_worker_idle(wqe, worker)) {
__release(&wqe->lock);
goto loop;
/* timed out, exit unless we're the last worker */
if (last_timeout && acct->nr_workers > 1) {
acct->nr_workers--;
raw_spin_unlock(&wqe->lock);
__set_current_state(TASK_RUNNING);
break;
}
raw_spin_unlock_irq(&wqe->lock);
if (signal_pending(current))
flush_signals(current);
if (schedule_timeout(WORKER_IDLE_TIMEOUT))
last_timeout = false;
__io_worker_idle(wqe, worker);
raw_spin_unlock(&wqe->lock);
if (io_flush_signals())
continue;
/* timed out, exit unless we're the fixed worker */
if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
!(worker->flags & IO_WORKER_F_FIXED))
ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
if (signal_pending(current)) {
struct ksignal ksig;
if (!get_signal(&ksig))
continue;
break;
}
last_timeout = !ret;
}
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
raw_spin_lock_irq(&wqe->lock);
if (!wq_list_empty(&wqe->work_list))
io_worker_handle_work(worker);
else
raw_spin_unlock_irq(&wqe->lock);
raw_spin_lock(&wqe->lock);
io_worker_handle_work(worker);
}
io_worker_exit(worker);
......@@ -650,27 +667,28 @@ static int io_wqe_worker(void *data)
*/
void io_wq_worker_running(struct task_struct *tsk)
{
struct io_worker *worker = kthread_data(tsk);
struct io_wqe *wqe = worker->wqe;
struct io_worker *worker = tsk->pf_io_worker;
if (!worker)
return;
if (!(worker->flags & IO_WORKER_F_UP))
return;
if (worker->flags & IO_WORKER_F_RUNNING)
return;
worker->flags |= IO_WORKER_F_RUNNING;
io_wqe_inc_running(wqe, worker);
io_wqe_inc_running(worker);
}
/*
* Called when worker is going to sleep. If there are no workers currently
* running and we have work pending, wake up a free one or have the manager
* set one up.
* running and we have work pending, wake up a free one or create a new one.
*/
void io_wq_worker_sleeping(struct task_struct *tsk)
{
struct io_worker *worker = kthread_data(tsk);
struct io_wqe *wqe = worker->wqe;
struct io_worker *worker = tsk->pf_io_worker;
if (!worker)
return;
if (!(worker->flags & IO_WORKER_F_UP))
return;
if (!(worker->flags & IO_WORKER_F_RUNNING))
......@@ -678,67 +696,140 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
worker->flags &= ~IO_WORKER_F_RUNNING;
raw_spin_lock_irq(&wqe->lock);
io_wqe_dec_running(wqe, worker);
raw_spin_unlock_irq(&wqe->lock);
raw_spin_lock(&worker->wqe->lock);
io_wqe_dec_running(worker);
raw_spin_unlock(&worker->wqe->lock);
}
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
struct task_struct *tsk)
{
struct io_wqe_acct *acct = &wqe->acct[index];
struct io_worker *worker;
tsk->pf_io_worker = worker;
worker->task = tsk;
set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
tsk->flags |= PF_NO_SETAFFINITY;
worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
if (!worker)
raw_spin_lock(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
list_add_tail_rcu(&worker->all_list, &wqe->all_list);
worker->flags |= IO_WORKER_F_FREE;
raw_spin_unlock(&wqe->lock);
wake_up_new_task(tsk);
}
static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
{
return true;
}
static inline bool io_should_retry_thread(long err)
{
/*
* Prevent perpetual task_work retry, if the task (or its group) is
* exiting.
*/
if (fatal_signal_pending(current))
return false;
refcount_set(&worker->ref, 1);
worker->nulls_node.pprev = NULL;
worker->wqe = wqe;
spin_lock_init(&worker->lock);
switch (err) {
case -EAGAIN:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
case -ERESTARTNOHAND:
return true;
default:
return false;
}
}
static void create_worker_cont(struct callback_head *cb)
{
struct io_worker *worker;
struct task_struct *tsk;
struct io_wqe *wqe;
worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
"io_wqe_worker-%d/%d", index, wqe->node);
if (IS_ERR(worker->task)) {
worker = container_of(cb, struct io_worker, create_work);
clear_bit_unlock(0, &worker->create_state);
wqe = worker->wqe;
tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
if (!IS_ERR(tsk)) {
io_init_new_worker(wqe, worker, tsk);
io_worker_release(worker);
return;
} else if (!io_should_retry_thread(PTR_ERR(tsk))) {
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
atomic_dec(&acct->nr_running);
raw_spin_lock(&wqe->lock);
acct->nr_workers--;
if (!acct->nr_workers) {
struct io_cb_cancel_data match = {
.fn = io_wq_work_match_all,
.cancel_all = true,
};
while (io_acct_cancel_pending_work(wqe, acct, &match))
raw_spin_lock(&wqe->lock);
}
raw_spin_unlock(&wqe->lock);
io_worker_ref_put(wqe->wq);
kfree(worker);
return false;
return;
}
kthread_bind_mask(worker->task, cpumask_of_node(wqe->node));
raw_spin_lock_irq(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
list_add_tail_rcu(&worker->all_list, &wqe->all_list);
worker->flags |= IO_WORKER_F_FREE;
if (index == IO_WQ_ACCT_BOUND)
worker->flags |= IO_WORKER_F_BOUND;
if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
worker->flags |= IO_WORKER_F_FIXED;
acct->nr_workers++;
raw_spin_unlock_irq(&wqe->lock);
/* re-create attempts grab a new worker ref, drop the existing one */
io_worker_release(worker);
schedule_work(&worker->work);
}
if (index == IO_WQ_ACCT_UNBOUND)
atomic_inc(&wq->user->processes);
static void io_workqueue_create(struct work_struct *work)
{
struct io_worker *worker = container_of(work, struct io_worker, work);
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
refcount_inc(&wq->refs);
wake_up_process(worker->task);
return true;
if (!io_queue_worker_create(worker, acct, create_worker_cont))
kfree(worker);
}
static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
__must_hold(wqe->lock)
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
{
struct io_wqe_acct *acct = &wqe->acct[index];
struct io_worker *worker;
struct task_struct *tsk;
/* if we have available workers or no work, no need */
if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
__set_current_state(TASK_RUNNING);
worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
if (!worker) {
fail:
atomic_dec(&acct->nr_running);
raw_spin_lock(&wqe->lock);
acct->nr_workers--;
raw_spin_unlock(&wqe->lock);
io_worker_ref_put(wq);
return false;
return acct->nr_workers < acct->max_workers;
}
}
static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
{
send_sig(SIGINT, worker->task, 1);
return false;
refcount_set(&worker->ref, 1);
worker->wqe = wqe;
spin_lock_init(&worker->lock);
init_completion(&worker->ref_done);
if (index == IO_WQ_ACCT_BOUND)
worker->flags |= IO_WORKER_F_BOUND;
tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
if (!IS_ERR(tsk)) {
io_init_new_worker(wqe, worker, tsk);
} else if (!io_should_retry_thread(PTR_ERR(tsk))) {
kfree(worker);
goto fail;
} else {
INIT_WORK(&worker->work, io_workqueue_create);
schedule_work(&worker->work);
}
return true;
}
/*
......@@ -768,120 +859,31 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
set_notify_signal(worker->task);
wake_up_process(worker->task);
return false;
}
/*
* Manager thread. Tasked with creating new workers, if we need them.
*/
static int io_wq_manager(void *data)
{
struct io_wq *wq = data;
int node;
/* create fixed workers */
refcount_set(&wq->refs, 1);
for_each_node(node) {
if (!node_online(node))
continue;
if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
continue;
set_bit(IO_WQ_BIT_ERROR, &wq->state);
set_bit(IO_WQ_BIT_EXIT, &wq->state);
goto out;
}
complete(&wq->done);
while (!kthread_should_stop()) {
if (current->task_works)
task_work_run();
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
bool fork_worker[2] = { false, false };
if (!node_online(node))
continue;
raw_spin_lock_irq(&wqe->lock);
if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
fork_worker[IO_WQ_ACCT_BOUND] = true;
if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
fork_worker[IO_WQ_ACCT_UNBOUND] = true;
raw_spin_unlock_irq(&wqe->lock);
if (fork_worker[IO_WQ_ACCT_BOUND])
create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
if (fork_worker[IO_WQ_ACCT_UNBOUND])
create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
}
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ);
}
if (current->task_works)
task_work_run();
out:
if (refcount_dec_and_test(&wq->refs)) {
complete(&wq->done);
return 0;
}
/* if ERROR is set and we get here, we have workers to wake */
if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
rcu_read_lock();
for_each_node(node)
io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
rcu_read_unlock();
}
return 0;
}
static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
struct io_wq_work *work)
{
bool free_worker;
if (!(work->flags & IO_WQ_WORK_UNBOUND))
return true;
if (atomic_read(&acct->nr_running))
return true;
rcu_read_lock();
free_worker = !hlist_nulls_empty(&wqe->free_list);
rcu_read_unlock();
if (free_worker)
return true;
if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers &&
!(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
return false;
return true;
}
static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
{
struct io_wq *wq = wqe->wq;
do {
struct io_wq_work *old_work = work;
work->flags |= IO_WQ_WORK_CANCEL;
work = wq->do_work(work);
wq->free_work(old_work);
wq->do_work(work);
work = wq->free_work(work);
} while (work);
}
static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
unsigned int hash;
struct io_wq_work *tail;
if (!io_wq_is_hashed(work)) {
append:
wq_list_add_tail(&work->list, &wqe->work_list);
wq_list_add_tail(&work->list, &acct->work_list);
return;
}
......@@ -891,35 +893,62 @@ static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
if (!tail)
goto append;
wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
wq_list_add_after(&work->list, &tail->list, &acct->work_list);
}
static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
{
return work == data;
}
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
bool do_wake;
unsigned long flags;
unsigned work_flags = work->flags;
bool do_create;
/*
* Do early check to see if we need a new unbound worker, and if we do,
* if we're allowed to do so. This isn't 100% accurate as there's a
* gap between this check and incrementing the value, but that's OK.
* It's close enough to not be an issue, fork() has the same delay.
* If io-wq is exiting for this task, or if the request has explicitly
* been marked as one that should not get executed, cancel it here.
*/
if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) ||
(work->flags & IO_WQ_WORK_CANCEL)) {
io_run_cancel(work, wqe);
return;
}
raw_spin_lock_irqsave(&wqe->lock, flags);
raw_spin_lock(&wqe->lock);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) ||
!atomic_read(&acct->nr_running);
raw_spin_unlock_irqrestore(&wqe->lock, flags);
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
if (do_wake)
io_wqe_wake_worker(wqe, acct);
rcu_read_lock();
do_create = !io_wqe_activate_free_worker(wqe, acct);
rcu_read_unlock();
raw_spin_unlock(&wqe->lock);
if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
!atomic_read(&acct->nr_running))) {
bool did_create;
did_create = io_wqe_create_worker(wqe, acct);
if (likely(did_create))
return;
raw_spin_lock(&wqe->lock);
/* fatal condition, failed to create the first worker */
if (!acct->nr_workers) {
struct io_cb_cancel_data match = {
.fn = io_wq_work_match_item,
.data = work,
.cancel_all = false,
};
if (io_acct_cancel_pending_work(wqe, acct, &match))
raw_spin_lock(&wqe->lock);
}
raw_spin_unlock(&wqe->lock);
}
}
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
......@@ -941,46 +970,21 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
}
void io_wq_cancel_all(struct io_wq *wq)
{
int node;
set_bit(IO_WQ_BIT_CANCEL, &wq->state);
rcu_read_lock();
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
}
rcu_read_unlock();
}
struct io_cb_cancel_data {
work_cancel_fn *fn;
void *data;
int nr_running;
int nr_pending;
bool cancel_all;
};
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
struct io_cb_cancel_data *match = data;
unsigned long flags;
/*
* Hold the lock to avoid ->cur_work going out of scope, caller
* may dereference the passed in work.
*/
spin_lock_irqsave(&worker->lock, flags);
spin_lock(&worker->lock);
if (worker->cur_work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
match->fn(worker->cur_work, match->data)) {
send_sig(SIGINT, worker->task, 1);
set_notify_signal(worker->task);
match->nr_running++;
}
spin_unlock_irqrestore(&worker->lock, flags);
spin_unlock(&worker->lock);
return match->nr_running && !match->cancel_all;
}
......@@ -989,6 +993,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
struct io_wq_work *work,
struct io_wq_work_node *prev)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
unsigned int hash = io_get_work_hash(work);
struct io_wq_work *prev_work = NULL;
......@@ -1000,33 +1005,48 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
else
wqe->hash_tail[hash] = NULL;
}
wq_list_del(&wqe->work_list, &work->list, prev);
wq_list_del(&acct->work_list, &work->list, prev);
}
static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
struct io_cb_cancel_data *match)
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match)
__releases(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
unsigned long flags;
retry:
raw_spin_lock_irqsave(&wqe->lock, flags);
wq_list_for_each(node, prev, &wqe->work_list) {
wq_list_for_each(node, prev, &acct->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
io_wqe_remove_pending(wqe, work, prev);
raw_spin_unlock_irqrestore(&wqe->lock, flags);
raw_spin_unlock(&wqe->lock);
io_run_cancel(work, wqe);
match->nr_pending++;
if (!match->cancel_all)
return;
/* not safe to continue after unlock */
goto retry;
return true;
}
return false;
}
static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
struct io_cb_cancel_data *match)
{
int i;
retry:
raw_spin_lock(&wqe->lock);
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
if (io_acct_cancel_pending_work(wqe, acct, match)) {
if (match->cancel_all)
goto retry;
return;
}
}
raw_spin_unlock_irqrestore(&wqe->lock, flags);
raw_spin_unlock(&wqe->lock);
}
static void io_wqe_cancel_running_work(struct io_wqe *wqe,
......@@ -1081,9 +1101,28 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
return IO_WQ_CANCEL_NOTFOUND;
}
static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
int sync, void *key)
{
struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
int i;
list_del_init(&wait->entry);
rcu_read_lock();
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
struct io_wqe_acct *acct = &wqe->acct[i];
if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags))
io_wqe_activate_free_worker(wqe, acct);
}
rcu_read_unlock();
return 1;
}
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
int ret = -ENOMEM, node;
int ret, node, i;
struct io_wq *wq;
if (WARN_ON_ONCE(!data->free_work || !data->do_work))
......@@ -1091,24 +1130,18 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
if (WARN_ON_ONCE(!bounded))
return ERR_PTR(-EINVAL);
wq = kzalloc(sizeof(*wq), GFP_KERNEL);
wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL);
if (!wq)
return ERR_PTR(-ENOMEM);
wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
if (!wq->wqes)
goto err_wq;
ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
if (ret)
goto err_wqes;
goto err_wq;
refcount_inc(&data->hash->refs);
wq->hash = data->hash;
wq->free_work = data->free_work;
wq->do_work = data->do_work;
/* caller must already hold a reference to this */
wq->user = data->user;
ret = -ENOMEM;
for_each_node(node) {
struct io_wqe *wqe;
......@@ -1120,113 +1153,234 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
if (!wqe)
goto err;
wq->wqes[node] = wqe;
if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL))
goto err;
cpumask_copy(wqe->cpu_mask, cpumask_of_node(node));
wqe->node = alloc_node;
wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
if (wq->user) {
wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
task_rlimit(current, RLIMIT_NPROC);
INIT_LIST_HEAD(&wqe->wait.entry);
wqe->wait.func = io_wqe_hash_wake;
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
struct io_wqe_acct *acct = &wqe->acct[i];
acct->index = i;
atomic_set(&acct->nr_running, 0);
INIT_WQ_LIST(&acct->work_list);
}
atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
wqe->wq = wq;
raw_spin_lock_init(&wqe->lock);
INIT_WQ_LIST(&wqe->work_list);
INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
INIT_LIST_HEAD(&wqe->all_list);
}
init_completion(&wq->done);
wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
if (!IS_ERR(wq->manager)) {
wake_up_process(wq->manager);
wait_for_completion(&wq->done);
if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
ret = -ENOMEM;
goto err;
}
refcount_set(&wq->use_refs, 1);
reinit_completion(&wq->done);
return wq;
}
ret = PTR_ERR(wq->manager);
complete(&wq->done);
wq->task = get_task_struct(data->task);
atomic_set(&wq->worker_refs, 1);
init_completion(&wq->worker_done);
return wq;
err:
io_wq_put_hash(data->hash);
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
for_each_node(node)
for_each_node(node) {
if (!wq->wqes[node])
continue;
free_cpumask_var(wq->wqes[node]->cpu_mask);
kfree(wq->wqes[node]);
err_wqes:
kfree(wq->wqes);
}
err_wq:
kfree(wq);
return ERR_PTR(ret);
}
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
static bool io_task_work_match(struct callback_head *cb, void *data)
{
if (data->free_work != wq->free_work || data->do_work != wq->do_work)
struct io_worker *worker;
if (cb->func != create_worker_cb && cb->func != create_worker_cont)
return false;
worker = container_of(cb, struct io_worker, create_work);
return worker->wqe->wq == data;
}
void io_wq_exit_start(struct io_wq *wq)
{
set_bit(IO_WQ_BIT_EXIT, &wq->state);
}
static void io_wq_cancel_tw_create(struct io_wq *wq)
{
struct callback_head *cb;
return refcount_inc_not_zero(&wq->use_refs);
while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
struct io_worker *worker;
worker = container_of(cb, struct io_worker, create_work);
io_worker_cancel_cb(worker);
}
}
static void __io_wq_destroy(struct io_wq *wq)
static void io_wq_exit_workers(struct io_wq *wq)
{
int node;
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
if (!wq->task)
return;
set_bit(IO_WQ_BIT_EXIT, &wq->state);
if (wq->manager)
kthread_stop(wq->manager);
io_wq_cancel_tw_create(wq);
rcu_read_lock();
for_each_node(node)
io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
rcu_read_unlock();
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
wait_for_completion(&wq->done);
io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL);
}
rcu_read_unlock();
io_worker_ref_put(wq);
wait_for_completion(&wq->worker_done);
for_each_node(node)
kfree(wq->wqes[node]);
kfree(wq->wqes);
kfree(wq);
for_each_node(node) {
spin_lock_irq(&wq->hash->wait.lock);
list_del_init(&wq->wqes[node]->wait.entry);
spin_unlock_irq(&wq->hash->wait.lock);
}
put_task_struct(wq->task);
wq->task = NULL;
}
void io_wq_destroy(struct io_wq *wq)
static void io_wq_destroy(struct io_wq *wq)
{
if (refcount_dec_and_test(&wq->use_refs))
__io_wq_destroy(wq);
int node;
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
struct io_cb_cancel_data match = {
.fn = io_wq_work_match_all,
.cancel_all = true,
};
io_wqe_cancel_pending_work(wqe, &match);
free_cpumask_var(wqe->cpu_mask);
kfree(wqe);
}
io_wq_put_hash(wq->hash);
kfree(wq);
}
struct task_struct *io_wq_get_task(struct io_wq *wq)
void io_wq_put_and_exit(struct io_wq *wq)
{
return wq->manager;
WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state));
io_wq_exit_workers(wq);
io_wq_destroy(wq);
}
struct online_data {
unsigned int cpu;
bool online;
};
static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
{
struct task_struct *task = worker->task;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(task, &rf);
do_set_cpus_allowed(task, cpumask_of_node(worker->wqe->node));
task->flags |= PF_NO_SETAFFINITY;
task_rq_unlock(rq, task, &rf);
struct online_data *od = data;
if (od->online)
cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask);
else
cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask);
return false;
}
static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online)
{
struct online_data od = {
.cpu = cpu,
.online = online
};
int i;
rcu_read_lock();
for_each_node(i)
io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od);
rcu_read_unlock();
return 0;
}
static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
{
struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
return __io_wq_cpu_online(wq, cpu, true);
}
static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
return __io_wq_cpu_online(wq, cpu, false);
}
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
{
int i;
rcu_read_lock();
for_each_node(i)
io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, NULL);
for_each_node(i) {
struct io_wqe *wqe = wq->wqes[i];
if (mask)
cpumask_copy(wqe->cpu_mask, mask);
else
cpumask_copy(wqe->cpu_mask, cpumask_of_node(i));
}
rcu_read_unlock();
return 0;
}
/*
* Set max number of unbounded workers, returns old value. If new_count is 0,
* then just return the old value.
*/
int io_wq_max_workers(struct io_wq *wq, int *new_count)
{
int prev[IO_WQ_ACCT_NR];
bool first_node = true;
int i, node;
BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND);
BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2);
for (i = 0; i < 2; i++) {
if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
new_count[i] = task_rlimit(current, RLIMIT_NPROC);
}
for (i = 0; i < IO_WQ_ACCT_NR; i++)
prev[i] = 0;
rcu_read_lock();
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
struct io_wqe_acct *acct;
raw_spin_lock(&wqe->lock);
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
acct = &wqe->acct[i];
if (first_node)
prev[i] = max_t(int, acct->max_workers, prev[i]);
if (new_count[i])
acct->max_workers = new_count[i];
}
raw_spin_unlock(&wqe->lock);
first_node = false;
}
rcu_read_unlock();
for (i = 0; i < IO_WQ_ACCT_NR; i++)
new_count[i] = prev[i];
return 0;
}
......@@ -1235,7 +1389,7 @@ static __init int io_wq_init(void)
int ret;
ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
io_wq_cpu_online, NULL);
io_wq_cpu_online, io_wq_cpu_offline);
if (ret < 0)
return ret;
io_wq_online = ret;
......
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H
#include <linux/refcount.h>
#include <linux/io_uring.h>
struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HASHED = 2,
IO_WQ_WORK_UNBOUND = 4,
IO_WQ_WORK_NO_CANCEL = 8,
IO_WQ_WORK_CONCURRENT = 16,
IO_WQ_WORK_FILES = 32,
IO_WQ_WORK_FS = 64,
IO_WQ_WORK_MM = 128,
IO_WQ_WORK_CREDS = 256,
IO_WQ_WORK_BLKCG = 512,
IO_WQ_WORK_FSIZE = 1024,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
......@@ -52,6 +44,7 @@ static inline void wq_list_add_after(struct io_wq_work_node *node,
static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
node->next = NULL;
if (!list->first) {
list->last = node;
WRITE_ONCE(list->first, node);
......@@ -59,7 +52,6 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
list->last->next = node;
list->last = node;
}
node->next = NULL;
}
static inline void wq_list_cut(struct io_wq_work_list *list,
......@@ -95,7 +87,6 @@ static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work {
struct io_wq_work_node list;
struct io_identity *identity;
unsigned flags;
};
......@@ -107,37 +98,48 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
return container_of(work->list.next, struct io_wq_work, list);
}
typedef void (free_work_fn)(struct io_wq_work *);
typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *);
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef void (io_wq_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
struct io_wq_hash {
refcount_t refs;
unsigned long map;
struct wait_queue_head wait;
};
static inline void io_wq_put_hash(struct io_wq_hash *hash)
{
if (refcount_dec_and_test(&hash->refs))
kfree(hash);
}
struct io_wq_data {
struct io_wq_hash *hash;
struct task_struct *task;
io_wq_work_fn *do_work;
free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_exit_start(struct io_wq *wq);
void io_wq_put_and_exit(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
return work->flags & IO_WQ_WORK_HASHED;
}
void io_wq_cancel_all(struct io_wq *wq);
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data, bool cancel_all);
struct task_struct *io_wq_get_task(struct io_wq *wq);
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
......@@ -152,6 +154,7 @@ static inline void io_wq_worker_running(struct task_struct *tsk)
static inline bool io_wq_current_is_worker(void)
{
return in_task() && (current->flags & PF_IO_WORKER);
return in_task() && (current->flags & PF_IO_WORKER) &&
current->pf_io_worker;
}
#endif
因为 它太大了无法显示 source diff 。你可以改为 查看blob
......@@ -135,7 +135,15 @@ static __always_inline void exit_to_user_mode(void)
}
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal(struct pt_regs *regs) { }
void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
{
if (ti_work & _TIF_NOTIFY_SIGNAL)
tracehook_notify_signal();
arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
}
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
......@@ -157,8 +165,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
if (ti_work & _TIF_PATCH_PENDING)
klp_update_patch_state(current);
if (ti_work & _TIF_SIGPENDING)
arch_do_signal(regs);
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
handle_signal_work(regs, ti_work);
if (ti_work & _TIF_NOTIFY_RESUME) {
tracehook_notify_resume(regs);
......
......@@ -8,6 +8,9 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
do {
int ret;
if (ti_work & _TIF_NOTIFY_SIGNAL)
tracehook_notify_signal();
if (ti_work & _TIF_SIGPENDING) {
kvm_handle_signal_exit(vcpu);
return -EINTR;
......
......@@ -1973,7 +1973,7 @@ bool uprobe_deny_signal(void)
WARN_ON_ONCE(utask->state != UTASK_SSTEP);
if (signal_pending(t)) {
if (task_sigpending(t)) {
spin_lock_irq(&t->sighand->siglock);
clear_tsk_thread_flag(t, TIF_SIGPENDING);
spin_unlock_irq(&t->sighand->siglock);
......
......@@ -763,7 +763,7 @@ void __noreturn do_exit(long code)
schedule();
}
io_uring_files_cancel(tsk->files);
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
/* sync mm's RSS info before statistics gathering */
......
......@@ -948,6 +948,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
tsk->pf_io_worker = NULL;
account_kernel_stack(tsk, 1);
......@@ -1994,6 +1995,8 @@ static __latent_entropy struct task_struct *copy_process(
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
if (args->io_thread)
p->flags |= PF_IO_WORKER;
/*
* This _must_ happen before we call free_task(), i.e. before we jump
......@@ -2474,6 +2477,34 @@ struct mm_struct *copy_init_mm(void)
return dup_mm(NULL, &init_mm);
}
/*
* This is like kernel_clone(), but shaved down and tailored to just
* creating io_uring workers. It returns a created task, or an error pointer.
* The returned task is inactive, and the caller must fire it up through
* wake_up_new_task(p). All signals are blocked in the created task.
*/
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
CLONE_IO;
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.stack = (unsigned long)fn,
.stack_size = (unsigned long)arg,
.io_thread = 1,
};
struct task_struct *tsk;
tsk = copy_process(NULL, 0, node, &args);
if (!IS_ERR(tsk)) {
sigfillset(&tsk->blocked);
sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
}
return tsk;
}
/*
* Ok, this is the main fork-routine.
*
......
......@@ -986,6 +986,7 @@ void __rcu_irq_enter_check_tick(void)
}
raw_spin_unlock_rcu_node(rdp->mynode);
}
NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick);
#endif /* CONFIG_NO_HZ_FULL */
/**
......
......@@ -21,7 +21,7 @@
#include <asm/tlb.h>
#include "../workqueue_internal.h"
#include "../../fs/io-wq.h"
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
#include "pelt.h"
......
......@@ -986,7 +986,7 @@ static inline bool wants_signal(int sig, struct task_struct *p)
if (task_is_stopped_or_traced(p))
return false;
return task_curr(p) || !signal_pending(p);
return task_curr(p) || !task_sigpending(p);
}
static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
......@@ -2526,6 +2526,18 @@ bool get_signal(struct ksignal *ksig)
struct signal_struct *signal = current->signal;
int signr;
/*
* For non-generic architectures, check for TIF_NOTIFY_SIGNAL so
* that the arch handlers don't all have to do it. If we get here
* without TIF_SIGPENDING, just exit after running signal work.
*/
if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
if (test_thread_flag(TIF_NOTIFY_SIGNAL))
tracehook_notify_signal();
if (!task_sigpending(current))
return false;
}
if (unlikely(uprobe_deny_signal()))
return false;
......@@ -2819,7 +2831,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
/* Remove the signals this thread can handle. */
sigandsets(&retarget, &retarget, &t->blocked);
if (!signal_pending(t))
if (!task_sigpending(t))
signal_wake_up(t, 0);
if (sigisemptyset(&retarget))
......@@ -2853,7 +2865,7 @@ void exit_signals(struct task_struct *tsk)
cgroup_threadgroup_change_end(tsk);
if (!signal_pending(tsk))
if (!task_sigpending(tsk))
goto out;
unblocked = tsk->blocked;
......@@ -2897,7 +2909,7 @@ long do_no_restart_syscall(struct restart_block *param)
static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
{
if (signal_pending(tsk) && !thread_group_empty(tsk)) {
if (task_sigpending(tsk) && !thread_group_empty(tsk)) {
sigset_t newblocked;
/* A set of now blocked but previously unblocked signals. */
sigandnsets(&newblocked, newset, &current->blocked);
......
......@@ -70,18 +70,17 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
}
/**
* task_work_cancel - cancel a pending work added by task_work_add()
* task_work_cancel_match - cancel a pending work added by task_work_add()
* @task: the task which should execute the work
* @func: identifies the work to remove
*
* Find the last queued pending work with ->func == @func and remove
* it from queue.
* @match: match function to call
*
* RETURNS:
* The found work or NULL if not found.
*/
struct callback_head *
task_work_cancel(struct task_struct *task, task_work_func_t func)
task_work_cancel_match(struct task_struct *task,
bool (*match)(struct callback_head *, void *data),
void *data)
{
struct callback_head **pprev = &task->task_works;
struct callback_head *work;
......@@ -97,7 +96,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
while ((work = READ_ONCE(*pprev))) {
if (work->func != func)
if (!match(work, data))
pprev = &work->next;
else if (cmpxchg(pprev, work, work->next) == work)
break;
......@@ -107,6 +106,28 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
return work;
}
static bool task_work_func_match(struct callback_head *cb, void *data)
{
return cb->func == data;
}
/**
* task_work_cancel - cancel a pending work added by task_work_add()
* @task: the task which should execute the work
* @func: identifies the work to remove
*
* Find the last queued pending work with ->func == @func and remove
* it from queue.
*
* RETURNS:
* The found work or NULL if not found.
*/
struct callback_head *
task_work_cancel(struct task_struct *task, task_work_func_t func)
{
return task_work_cancel_match(task, task_work_func_match, func);
}
/**
* task_work_run - execute the works added by task_work_add()
*
......
......@@ -1836,24 +1836,38 @@ int import_single_range(int rw, void __user *buf, size_t len,
}
EXPORT_SYMBOL(import_single_range);
int iov_iter_for_each_range(struct iov_iter *i, size_t bytes,
int (*f)(struct kvec *vec, void *context),
void *context)
/**
* iov_iter_restore() - Restore a &struct iov_iter to the same state as when
* iov_iter_save_state() was called.
*
* @i: &struct iov_iter to restore
* @state: state to restore from
*
* Used after iov_iter_save_state() to bring restore @i, if operations may
* have advanced it.
*
* Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
*/
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
struct kvec w;
int err = -EINVAL;
if (!bytes)
return 0;
iterate_all_kinds(i, bytes, v, -EINVAL, ({
w.iov_base = kmap(v.bv_page) + v.bv_offset;
w.iov_len = v.bv_len;
err = f(&w, context);
kunmap(v.bv_page);
err;}), ({
w = v;
err = f(&w, context);})
)
return err;
if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
!iov_iter_is_kvec(i))
return;
i->iov_offset = state->iov_offset;
i->count = state->count;
/*
* For the *vec iters, nr_segs + iov is constant - if we increment
* the vec, then we also decrement the nr_segs count. Hence we don't
* need to track both of these, just one is enough and we can deduct
* the other from that. ITER_KVEC and ITER_IOVEC are the same struct
* size, so we can just increment the iov pointer as they are unionzed.
* ITER_BVEC _may_ be the same size on some archs, but on others it is
* not. Be safe and handle it separately.
*/
BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
if (iov_iter_is_bvec(i))
i->bvec -= state->nr_segs - i->nr_segs;
else
i->iov -= state->nr_segs - i->nr_segs;
i->nr_segs = state->nr_segs;
}
EXPORT_SYMBOL(iov_iter_for_each_range);
......@@ -1427,6 +1427,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev,
free:
kfree(table);
out:
mdev->sysctl = NULL;
return -ENOBUFS;
}
......@@ -1436,6 +1437,9 @@ static void mpls_dev_sysctl_unregister(struct net_device *dev,
struct net *net = dev_net(dev);
struct ctl_table *table;
if (!mdev->sysctl)
return;
table = mdev->sysctl->ctl_table_arg;
unregister_net_sysctl_table(mdev->sysctl);
kfree(table);
......
......@@ -1619,6 +1619,7 @@ static void taprio_reset(struct Qdisc *sch)
int i;
hrtimer_cancel(&q->advance_timer);
if (q->qdiscs) {
for (i = 0; i < dev->num_tx_queues; i++)
if (q->qdiscs[i])
......@@ -1642,6 +1643,7 @@ static void taprio_destroy(struct Qdisc *sch)
* happens in qdisc_create(), after taprio_init() has been called.
*/
hrtimer_cancel(&q->advance_timer);
qdisc_synchronize(sch);
taprio_disable_offload(dev, q, NULL);
......
......@@ -1688,30 +1688,22 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog)
return __sys_listen(fd, backlog);
}
int __sys_accept4_file(struct file *file, unsigned file_flags,
struct file *do_accept(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags,
unsigned long nofile)
int __user *upeer_addrlen, int flags)
{
struct socket *sock, *newsock;
struct file *newfile;
int err, len, newfd;
int err, len;
struct sockaddr_storage address;
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
sock = sock_from_file(file, &err);
if (!sock)
goto out;
return ERR_PTR(err);
err = -ENFILE;
newsock = sock_alloc();
if (!newsock)
goto out;
return ERR_PTR(-ENFILE);
newsock->type = sock->type;
newsock->ops = sock->ops;
......@@ -1722,18 +1714,9 @@ int __sys_accept4_file(struct file *file, unsigned file_flags,
*/
__module_get(newsock->ops->owner);
newfd = __get_unused_fd_flags(flags, nofile);
if (unlikely(newfd < 0)) {
err = newfd;
sock_release(newsock);
goto out;
}
newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
if (IS_ERR(newfile)) {
err = PTR_ERR(newfile);
put_unused_fd(newfd);
goto out;
}
if (IS_ERR(newfile))
return newfile;
err = security_socket_accept(sock, newsock);
if (err)
......@@ -1758,16 +1741,38 @@ int __sys_accept4_file(struct file *file, unsigned file_flags,
}
/* File flags are not inherited via accept() unlike another OSes. */
fd_install(newfd, newfile);
err = newfd;
out:
return err;
return newfile;
out_fd:
fput(newfile);
put_unused_fd(newfd);
goto out;
return ERR_PTR(err);
}
int __sys_accept4_file(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags,
unsigned long nofile)
{
struct file *newfile;
int newfd;
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
newfd = __get_unused_fd_flags(flags, nofile);
if (unlikely(newfd < 0))
return newfd;
newfile = do_accept(file, file_flags, upeer_sockaddr, upeer_addrlen,
flags);
if (IS_ERR(newfile)) {
put_unused_fd(newfd);
return PTR_ERR(newfile);
}
fd_install(newfd, newfile);
return newfd;
}
/*
......@@ -2181,6 +2186,17 @@ SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
* Shutdown a socket.
*/
int __sys_shutdown_sock(struct socket *sock, int how)
{
int err;
err = security_socket_shutdown(sock, how);
if (!err)
err = sock->ops->shutdown(sock, how);
return err;
}
int __sys_shutdown(int fd, int how)
{
int err, fput_needed;
......@@ -2188,9 +2204,7 @@ int __sys_shutdown(int fd, int how)
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock != NULL) {
err = security_socket_shutdown(sock, how);
if (!err)
err = sock->ops->shutdown(sock, how);
err = __sys_shutdown_sock(sock, how);
fput_light(sock->file, fput_needed);
}
return err;
......
......@@ -421,6 +421,9 @@ extern int __sys_accept4_file(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags,
unsigned long nofile);
extern struct file *do_accept(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_socket(int family, int type, int protocol);
......@@ -436,5 +439,6 @@ extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
int __user *usockaddr_len);
extern int __sys_socketpair(int family, int type, int protocol,
int __user *usockvec);
extern int __sys_shutdown_sock(struct socket *sock, int how);
extern int __sys_shutdown(int fd, int how);
#endif /* _LINUX_SOCKET_H */
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册