Unverified commit 1f74be50, authored by openeuler-ci-bot, committed by Gitee

!419 Backport CVEs and bugfixes

Merge Pull Request from: @zhangjialin11 
 
Pull new CVEs:
CVE-2023-26545
CVE-2023-0045
CVE-2023-20938
CVE-2023-0240

rcu bugfix from Zheng Yejian
net bugfixes from Zhengchao Shao
block bugfix from Zhong Jinghua
md/raid10 bugfixes from Li Nan
arm/kasan bugfix from Longlong Xia 
 
Link: https://gitee.com/openeuler/kernel/pulls/419

Reviewed-by: Zheng Zengkai <zhengzengkai@huawei.com> 
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com> 
......@@ -1129,7 +1129,7 @@ export MODORDER := $(extmod-prefix)modules.order
export MODULES_NSDEPS := $(extmod-prefix)modules.nsdeps
ifeq ($(KBUILD_EXTMOD),)
core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ io_uring/
vmlinux-dirs := $(patsubst %/,%,$(filter %/, \
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \
......
......@@ -137,6 +137,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
* thread information flags:
* TIF_USEDFPU - FPU was used by this task this quantum (SMP)
* TIF_POLLING_NRFLAG - true if poll_idle() is polling TIF_NEED_RESCHED
*
* Any bit in the range of 0..15 will cause do_work_pending() to be invoked.
*/
#define TIF_SIGPENDING 0 /* signal pending */
#define TIF_NEED_RESCHED 1 /* rescheduling necessary */
......@@ -147,6 +149,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
#define TIF_PATCH_PENDING 8 /* pending live patching update */
#define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */
#define TIF_USING_IWMMXT 17
#define TIF_MEMDIE 18 /* is terminating due to OOM killer */
......@@ -162,6 +165,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT)
#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
/* Checks for any syscall work in entry-common.S */
#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
......@@ -171,7 +175,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
* Change these and you break ASM code in entry-common.S
*/
#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_UPROBE)
_TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NOTIFY_SIGNAL)
#endif /* __KERNEL__ */
#endif /* __ASM_ARM_THREAD_INFO_H */
......@@ -54,7 +54,7 @@ __ret_fast_syscall:
cmp r2, r1
blne addr_limit_check_failed
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
movs r1, r1, lsl #16
bne fast_work_pending
......@@ -92,7 +92,7 @@ __ret_fast_syscall:
cmp r2, r1
blne addr_limit_check_failed
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
movs r1, r1, lsl #16
beq no_work_pending
UNWIND(.fnend )
ENDPROC(ret_fast_syscall)
......@@ -134,7 +134,7 @@ ENTRY(ret_to_user_from_irq)
cmp r2, r1
blne addr_limit_check_failed
ldr r1, [tsk, #TI_FLAGS]
tst r1, #_TIF_WORK_MASK
movs r1, r1, lsl #16
bne slow_work_pending
no_work_pending:
asm_trace_hardirqs_on save = 0
......
......@@ -59,7 +59,7 @@ __irq_entry:
get_thread_info tsk
ldr r2, [tsk, #TI_FLAGS]
tst r2, #_TIF_WORK_MASK
movs r2, r2, lsl #16
beq 2f @ no work pending
mov r0, #V7M_SCB_ICSR_PENDSVSET
str r0, [r1, V7M_SCB_ICSR] @ raise PendSV
......
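Editorial note: the "tst rN, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK" checks above are replaced by "movs rN, rN, lsl #16", which shifts thread_info flag bits 0..15 into the upper half-word and updates the condition flags, so the following branch fires whenever any flag in the 0..15 range is set. That is exactly what the new header comment ("Any bit in the range of 0..15 will cause do_work_pending() to be invoked") documents. A rough C equivalent, purely as a hedged illustration:

	/* illustration only; the real check is the assembly above */
	if (flags & 0xffff)	/* any TIF_* bit 0..15, now incl. TIF_NOTIFY_SIGNAL */
		do_work_pending(regs, flags, syscall);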
......@@ -655,7 +655,7 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
if (unlikely(!user_mode(regs)))
return 0;
local_irq_enable();
if (thread_flags & _TIF_SIGPENDING) {
if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
int restart = do_signal(regs, syscall);
if (unlikely(restart)) {
/*
......
......@@ -264,12 +264,17 @@ void __init kasan_init(void)
/*
* 1. The module global variables are in MODULES_VADDR ~ MODULES_END,
* so we need to map this area.
* so we need to map this area if CONFIG_KASAN_VMALLOC=n. With
* VMALLOC support KASAN will manage this region dynamically,
* refer to kasan_populate_vmalloc() and ARM's implementation of
* module_alloc().
* 2. PKMAP_BASE ~ PKMAP_BASE+PMD_SIZE's shadow and MODULES_VADDR
* ~ MODULES_END's shadow is in the same PMD_SIZE, so we can't
* use kasan_populate_zero_shadow.
*/
create_mapping((void *)MODULES_VADDR, (void *)(PKMAP_BASE + PMD_SIZE));
if (!IS_ENABLED(CONFIG_KASAN_VMALLOC) && IS_ENABLED(CONFIG_MODULES))
create_mapping((void *)MODULES_VADDR, (void *)(MODULES_END));
create_mapping((void *)PKMAP_BASE, (void *)(PKMAP_BASE + PMD_SIZE));
/*
* KAsan may reuse the contents of kasan_early_shadow_pte directly, so
......
......@@ -69,6 +69,7 @@ void arch_release_task_struct(struct task_struct *tsk);
#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
#define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */
#define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */
#define TIF_SYSCALL_TRACE 8 /* syscall trace active */
#define TIF_SYSCALL_AUDIT 9 /* syscall auditing */
#define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */
......@@ -101,13 +102,15 @@ void arch_release_task_struct(struct task_struct *tsk);
#define _TIF_32BIT (1 << TIF_32BIT)
#define _TIF_SVE (1 << TIF_SVE)
#define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
#define _TIF_32BIT_AARCH64 (1 << TIF_32BIT_AARCH64)
#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
_TIF_UPROBE | _TIF_MTE_ASYNC_FAULT)
_TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \
_TIF_NOTIFY_SIGNAL)
#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
_TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
......
......@@ -711,7 +711,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
(void __user *)NULL, current);
}
if (thread_flags & _TIF_SIGPENDING)
if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
do_signal(regs);
if (thread_flags & _TIF_NOTIFY_RESUME) {
......
......@@ -99,6 +99,7 @@ void arch_setup_new_exec(void);
#define TIF_SYSCALL_TRACE 0 /* syscall trace active */
#define TIF_SIGPENDING 1 /* signal pending */
#define TIF_NEED_RESCHED 2 /* rescheduling necessary */
#define TIF_NOTIFY_SIGNAL 3 /* signal notifications exist */
#define TIF_SYSCALL_EMU 4 /* syscall emulation active */
#define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */
#define TIF_PATCH_PENDING 6 /* pending live patching update */
......@@ -124,6 +125,7 @@ void arch_setup_new_exec(void);
#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
#define _TIF_NOTIFY_SIGNAL (1<<TIF_NOTIFY_SIGNAL)
#define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
#define _TIF_32BIT (1<<TIF_32BIT)
#define _TIF_RESTORE_TM (1<<TIF_RESTORE_TM)
......@@ -145,7 +147,8 @@ void arch_setup_new_exec(void);
#define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
_TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_RESTORE_TM | _TIF_PATCH_PENDING)
_TIF_RESTORE_TM | _TIF_PATCH_PENDING | \
_TIF_NOTIFY_SIGNAL)
#define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
/* Bits in local_flags */
......
......@@ -318,7 +318,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
if (thread_info_flags & _TIF_PATCH_PENDING)
klp_update_patch_state(current);
if (thread_info_flags & _TIF_SIGPENDING) {
if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
BUG_ON(regs != current->thread.regs);
do_signal(current);
}
......
......@@ -80,6 +80,7 @@ struct thread_info {
#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing */
#define TIF_SECCOMP 8 /* syscall secure computing */
#define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
......@@ -88,9 +89,11 @@ struct thread_info {
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
#define _TIF_WORK_MASK \
(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED | \
_TIF_NOTIFY_SIGNAL)
#define _TIF_SYSCALL_WORK \
(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \
......
......@@ -310,7 +310,7 @@ asmlinkage __visible void do_notify_resume(struct pt_regs *regs,
unsigned long thread_info_flags)
{
/* Handle pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
do_signal(regs);
if (thread_info_flags & _TIF_NOTIFY_RESUME)
......
......@@ -99,6 +99,7 @@ struct thread_info {
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* IA32 compatibility process */
#define TIF_SLD 18 /* Restore split lock detection on context switch */
#define TIF_NOTIFY_SIGNAL 19 /* signal notifications exist */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
......@@ -127,6 +128,7 @@ struct thread_info {
#define _TIF_NOCPUID (1 << TIF_NOCPUID)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
#define _TIF_SLD (1 << TIF_SLD)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
......
......@@ -1889,6 +1889,8 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (ctrl == PR_SPEC_FORCE_DISABLE)
task_set_spec_ib_force_disable(task);
task_update_spec_tif(task);
if (task == current)
indirect_branch_prediction_barrier();
break;
default:
return -ERANGE;
......
......@@ -798,11 +798,11 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
void arch_do_signal(struct pt_regs *regs)
void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
{
struct ksignal ksig;
if (get_signal(&ksig)) {
if (has_signal && get_signal(&ksig)) {
/* Whee! Actually deliver the signal. */
handle_signal(&ksig, regs);
return;
......
(This diff has been collapsed.)
......@@ -1771,7 +1771,6 @@ static int nbd_dev_add(int index)
struct gendisk *disk;
struct request_queue *q;
int err = -ENOMEM;
int first_minor = index << part_shift;
nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
if (!nbd)
......@@ -1835,7 +1834,7 @@ static int nbd_dev_add(int index)
refcount_set(&nbd->refs, 1);
INIT_LIST_HEAD(&nbd->list);
disk->major = NBD_MAJOR;
disk->first_minor = first_minor;
disk->first_minor = index << part_shift;
disk->fops = &nbd_fops;
disk->private_data = nbd;
sprintf(disk->disk_name, "nbd%d", index);
......
......@@ -1365,6 +1365,9 @@ __acquires(bitmap->lock)
sector_t csize;
int err;
if (page >= bitmap->pages)
return NULL;
err = md_bitmap_checkpage(bitmap, page, create, 0);
if (bitmap->bp[page].hijacked ||
......
......@@ -3846,35 +3846,51 @@ static int analyze_sbs(struct mddev *mddev)
*/
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
unsigned long result = 0;
long decimals = -1;
while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
if (*cp == '.')
decimals = 0;
else if (decimals < scale) {
unsigned int value;
value = *cp - '0';
result = result * 10 + value;
if (decimals >= 0)
decimals++;
}
cp++;
}
if (*cp == '\n')
cp++;
if (*cp)
unsigned long result = 0, decimals = 0;
char *pos, *str;
int rv;
str = kmemdup_nul(cp, strlen(cp), GFP_KERNEL);
if (!str)
return -ENOMEM;
pos = strchr(str, '.');
if (pos) {
int cnt = scale;
*pos = '\0';
while (isdigit(*(++pos))) {
if (cnt) {
decimals = decimals * 10 + *pos - '0';
cnt--;
}
}
if (*pos == '\n')
pos++;
if (*pos) {
kfree(str);
return -EINVAL;
}
decimals *= int_pow(10, cnt);
}
rv = kstrtoul(str, 10, &result);
kfree(str);
if (rv)
return rv;
if (result > (ULONG_MAX - decimals) / (unsigned int)int_pow(10, scale))
return -EINVAL;
if (decimals < 0)
decimals = 0;
*res = result * int_pow(10, scale - decimals);
return 0;
*res = result * int_pow(10, scale) + decimals;
return rv;
}
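Editorial note: a hedged, userspace-only sketch of the reworked parsing logic above (using strdup/strtoul in place of kmemdup_nul/kstrtoul, and omitting the overflow and trailing-character checks) to show the intended arithmetic: with scale = 3, the string "5.25" yields 5 * 10^3 + 25 * 10 = 5250.

	#include <ctype.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	static unsigned long upow10(int n)
	{
		unsigned long v = 1;

		while (n-- > 0)
			v *= 10;
		return v;
	}

	static int scaled_parse(const char *cp, unsigned long *res, int scale)
	{
		unsigned long result, decimals = 0;
		char *str = strdup(cp), *pos;

		if (!str)
			return -1;
		pos = strchr(str, '.');
		if (pos) {
			int cnt = scale;

			*pos = '\0';
			while (isdigit((unsigned char)*(++pos))) {
				if (cnt) {
					decimals = decimals * 10 + *pos - '0';
					cnt--;
				}
			}
			decimals *= upow10(cnt);
		}
		result = strtoul(str, NULL, 10);
		free(str);
		*res = result * upow10(scale) + decimals;
		return 0;
	}

	int main(void)
	{
		unsigned long v;

		scaled_parse("5.25", &v, 3);
		printf("%lu\n", v);	/* prints 5250 */
		return 0;
	}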
static ssize_t
safe_delay_show(struct mddev *mddev, char *page)
{
int msec = (mddev->safemode_delay*1000)/HZ;
return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
}
static ssize_t
safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
......@@ -3888,10 +3904,14 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
return -EINVAL;
if (msec > UINT_MAX)
return -EINVAL;
if (msec == 0)
mddev->safemode_delay = 0;
else {
unsigned long old_delay = mddev->safemode_delay;
/* HZ <= 1000, so new_delay < UINT_MAX, too */
unsigned long new_delay = (msec*HZ)/1000;
if (new_delay == 0)
......@@ -4543,7 +4563,7 @@ __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_stor
static ssize_t
max_corrected_read_errors_show(struct mddev *mddev, char *page) {
return sprintf(page, "%d\n",
return sprintf(page, "%u\n",
atomic_read(&mddev->max_corr_read_errors));
}
......
......@@ -2325,7 +2325,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
int sect = 0; /* Offset from r10_bio->sector */
int sectors = r10_bio->sectors;
struct md_rdev *rdev;
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
unsigned int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[r10_bio->read_slot].devnum;
/* still own a reference to this rdev, so it cannot
......@@ -2344,7 +2344,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
char b[BDEVNAME_SIZE];
bdevname(rdev->bdev, b);
pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %u:max %u]\n",
mdname(mddev), b,
atomic_read(&rdev->read_errors), max_read_errors);
pr_notice("md/raid10:%s: %s: Failing raid device\n",
......
......@@ -32,8 +32,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
......
......@@ -520,7 +520,7 @@ static bool dump_interrupted(void)
* but then we need to teach dump_write() to restart and clear
* TIF_SIGPENDING.
*/
return signal_pending(current);
return fatal_signal_pending(current) || freezing(current);
}
static void wait_for_dump_helpers(struct file *file)
......
......@@ -23,6 +23,8 @@
#include <linux/close_range.h>
#include <net/sock.h>
#include "internal.h"
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
......@@ -829,9 +831,8 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
}
/*
* variant of __close_fd that gets a ref on the file for later fput.
* The caller must ensure that filp_close() called on the file, and then
* an fput().
* See close_fd_get_file() below, this variant assumes current->files->file_lock
* is held.
*/
int __close_fd_get_file(unsigned int fd, struct file **res)
{
......@@ -839,26 +840,39 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
struct file *file;
struct fdtable *fdt;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
if (fd >= fdt->max_fds)
goto out_unlock;
goto out_err;
file = fdt->fd[fd];
if (!file)
goto out_unlock;
goto out_err;
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
get_file(file);
*res = file;
return 0;
out_unlock:
spin_unlock(&files->file_lock);
out_err:
*res = NULL;
return -ENOENT;
}
/*
* variant of close_fd that gets a ref on the file for later fput.
* The caller must ensure that filp_close() called on the file, and then
* an fput().
*/
int close_fd_get_file(unsigned int fd, struct file **res)
{
struct files_struct *files = current->files;
int ret;
spin_lock(&files->file_lock);
ret = __close_fd_get_file(fd, res);
spin_unlock(&files->file_lock);
return ret;
}
void do_close_on_exec(struct files_struct *files)
{
unsigned i;
......
......@@ -77,6 +77,8 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
long do_rmdir(int dfd, struct filename *name);
long do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct path *link);
int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
struct filename *newname, unsigned int flags);
/*
* namespace.c
......@@ -132,6 +134,7 @@ extern struct file *do_file_open_root(const struct path *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
extern int __close_fd_get_file(unsigned int fd, struct file **res);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
......
......@@ -533,6 +533,8 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
p->stack = p->internal;
p->dfd = dfd;
p->name = name;
p->path.mnt = NULL;
p->path.dentry = NULL;
p->total_link_count = old ? old->total_link_count : 0;
p->saved = old;
p->state = 0;
......@@ -607,6 +609,8 @@ static void terminate_walk(struct nameidata *nd)
rcu_read_unlock();
}
nd->depth = 0;
nd->path.mnt = NULL;
nd->path.dentry = NULL;
}
/* path_put is needed afterwards regardless of success or failure */
......@@ -635,6 +639,11 @@ static inline bool legitimize_path(struct nameidata *nd,
static bool legitimize_links(struct nameidata *nd)
{
int i;
if (unlikely(nd->flags & LOOKUP_CACHED)) {
drop_links(nd);
nd->depth = 0;
return false;
}
for (i = 0; i < nd->depth; i++) {
struct saved *last = nd->stack + i;
if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
......@@ -798,6 +807,7 @@ static int complete_walk(struct nameidata *nd)
if (!(nd->state & ND_ROOT_PRESET))
if (!(nd->flags & LOOKUP_IS_SCOPED))
nd->root.mnt = NULL;
nd->flags &= ~LOOKUP_CACHED;
if (!try_to_unlazy(nd))
return -ECHILD;
}
......@@ -2210,6 +2220,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
int error;
const char *s = nd->name->name;
/* LOOKUP_CACHED requires RCU, ask caller to retry */
if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
return ERR_PTR(-EAGAIN);
if (!*s)
flags &= ~LOOKUP_RCU;
if (flags & LOOKUP_RCU)
......@@ -2240,8 +2254,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
}
nd->root.mnt = NULL;
nd->path.mnt = NULL;
nd->path.dentry = NULL;
/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
......@@ -4353,8 +4365,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
EXPORT_SYMBOL(vfs_rename);
static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
const char __user *newname, unsigned int flags)
int do_renameat2(int olddfd, struct filename *from, int newdfd,
struct filename *to, unsigned int flags)
{
struct dentry *old_dentry, *new_dentry;
struct dentry *trap;
......@@ -4362,32 +4374,30 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
struct qstr old_last, new_last;
int old_type, new_type;
struct inode *delegated_inode = NULL;
struct filename *from;
struct filename *to;
unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
bool should_retry = false;
int error;
int error = -EINVAL;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
goto put_both;
if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
(flags & RENAME_EXCHANGE))
return -EINVAL;
goto put_both;
if (flags & RENAME_EXCHANGE)
target_flags = 0;
retry:
from = filename_parentat(olddfd, getname(oldname), lookup_flags,
&old_path, &old_last, &old_type);
from = filename_parentat(olddfd, from, lookup_flags, &old_path,
&old_last, &old_type);
if (IS_ERR(from)) {
error = PTR_ERR(from);
goto exit;
goto put_new;
}
to = filename_parentat(newdfd, getname(newname), lookup_flags,
&new_path, &new_last, &new_type);
to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
&new_type);
if (IS_ERR(to)) {
error = PTR_ERR(to);
goto exit1;
......@@ -4480,34 +4490,40 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
if (retry_estale(error, lookup_flags))
should_retry = true;
path_put(&new_path);
putname(to);
exit1:
path_put(&old_path);
putname(from);
if (should_retry) {
should_retry = false;
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
exit:
put_both:
if (!IS_ERR(from))
putname(from);
put_new:
if (!IS_ERR(to))
putname(to);
return error;
}
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname, unsigned int, flags)
{
return do_renameat2(olddfd, oldname, newdfd, newname, flags);
return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
flags);
}
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname)
{
return do_renameat2(olddfd, oldname, newdfd, newname, 0);
return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
0);
}
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
getname(newname), 0);
}
int readlink_copy(char __user *buffer, int buflen, const char *link)
......
......@@ -1099,6 +1099,12 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
lookup_flags |= LOOKUP_BENEATH;
if (how->resolve & RESOLVE_IN_ROOT)
lookup_flags |= LOOKUP_IN_ROOT;
if (how->resolve & RESOLVE_CACHED) {
/* Don't bother even trying for create/truncate/tmpfile open */
if (flags & (O_TRUNC | O_CREAT | O_TMPFILE))
return -EAGAIN;
lookup_flags |= LOOKUP_CACHED;
}
op->lookup_flags = lookup_flags;
return 0;
......
......@@ -70,7 +70,7 @@
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
_TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
ARCH_EXIT_TO_USER_MODE_WORK)
/**
......@@ -260,12 +260,13 @@ static __always_inline void arch_exit_to_user_mode(void) { }
#endif
/**
* arch_do_signal - Architecture specific signal delivery function
* arch_do_signal_or_restart - Architecture specific signal delivery function
* @regs: Pointer to currents pt_regs
* @has_signal: actual signal to handle
*
* Invoked from exit_to_user_mode_loop().
*/
void arch_do_signal(struct pt_regs *regs);
void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal);
/**
* arch_syscall_exit_tracehook - Wrapper around tracehook_report_syscall_exit()
......
......@@ -15,8 +15,8 @@
# define ARCH_XFER_TO_GUEST_MODE_WORK (0)
#endif
#define XFER_TO_GUEST_MODE_WORK \
(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
#define XFER_TO_GUEST_MODE_WORK \
(_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \
_TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK)
struct kvm_vcpu;
......
......@@ -19,7 +19,7 @@
/* List of all valid flags for the how->resolve argument: */
#define VALID_RESOLVE_FLAGS \
(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
RESOLVE_BENEATH | RESOLVE_IN_ROOT)
RESOLVE_BENEATH | RESOLVE_IN_ROOT | RESOLVE_CACHED)
/* List of all open_how "versions". */
#define OPEN_HOW_SIZE_VER0 24 /* sizeof first published struct */
......
......@@ -124,7 +124,7 @@ extern void __fd_install(struct files_struct *files,
extern int __close_fd(struct files_struct *files,
unsigned int fd);
extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
extern int __close_fd_get_file(unsigned int fd, struct file **res);
extern int close_fd_get_file(unsigned int fd, struct file **res);
extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
struct files_struct **new_fdp);
......
......@@ -22,33 +22,34 @@ struct io_identity {
refcount_t count;
};
#ifdef __GENKSYMS__
struct io_uring_task {
/* submission side */
struct xarray xa;
struct wait_queue_head wait;
struct file *last;
struct percpu_counter inflight;
struct io_identity __identity;
struct io_identity *identity;
atomic_t in_idle;
bool sqpoll;
struct xarray xa;
struct wait_queue_head wait;
struct file *last;
struct percpu_counter inflight;
struct io_identity __identity;
struct io_identity *identity;
atomic_t in_idle;
bool sqpoll;
};
#endif
#if defined(CONFIG_IO_URING)
struct sock *io_uring_get_socket(struct file *file);
void __io_uring_task_cancel(void);
void __io_uring_files_cancel(struct files_struct *files);
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
static inline void io_uring_task_cancel(void)
static inline void io_uring_files_cancel(void)
{
if (current->io_uring && !xa_empty(&current->io_uring->xa))
__io_uring_task_cancel();
if (current->io_uring)
__io_uring_cancel(false);
}
static inline void io_uring_files_cancel(struct files_struct *files)
static inline void io_uring_task_cancel(void)
{
if (current->io_uring && !xa_empty(&current->io_uring->xa))
__io_uring_files_cancel(files);
if (current->io_uring)
__io_uring_cancel(true);
}
static inline void io_uring_free(struct task_struct *tsk)
{
......@@ -63,7 +64,7 @@ static inline struct sock *io_uring_get_socket(struct file *file)
static inline void io_uring_task_cancel(void)
{
}
static inline void io_uring_files_cancel(struct files_struct *files)
static inline void io_uring_files_cancel(void)
{
}
static inline void io_uring_free(struct task_struct *tsk)
......
......@@ -43,6 +43,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
#define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */
#define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */
#define LOOKUP_IN_ROOT 0x100000 /* Treat dirfd as fs root. */
#define LOOKUP_CACHED 0x200000 /* Only do cached lookup */
/* LOOKUP_* flags which do scope-related checks based on the dirfd. */
#define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT)
......
......@@ -1399,7 +1399,7 @@ struct task_struct {
*/
randomized_struct_fields_end
KABI_RESERVE(1)
KABI_USE(1, void *pf_io_worker)
KABI_RESERVE(2)
KABI_RESERVE(3)
KABI_RESERVE(4)
......
......@@ -360,11 +360,23 @@ static inline int restart_syscall(void)
return -ERESTARTNOINTR;
}
static inline int signal_pending(struct task_struct *p)
static inline int task_sigpending(struct task_struct *p)
{
return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
}
static inline int signal_pending(struct task_struct *p)
{
/*
* TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same
* behavior in terms of ensuring that we break out of wait loops
* so that notify signal callbacks can be processed.
*/
if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL)))
return 1;
return task_sigpending(p);
}
static inline int __fatal_signal_pending(struct task_struct *p)
{
return unlikely(sigismember(&p->pending.signal, SIGKILL));
......@@ -372,7 +384,7 @@ static inline int __fatal_signal_pending(struct task_struct *p)
static inline int fatal_signal_pending(struct task_struct *p)
{
return signal_pending(p) && __fatal_signal_pending(p);
return task_sigpending(p) && __fatal_signal_pending(p);
}
static inline int signal_pending_state(long state, struct task_struct *p)
......@@ -509,7 +521,7 @@ extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);
static inline void restore_saved_sigmask_unless(bool interrupted)
{
if (interrupted)
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
WARN_ON(!signal_pending(current));
else
restore_saved_sigmask();
}
......
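Editorial note: a hedged sketch (not from this patch) of why TIF_NOTIFY_SIGNAL is folded into signal_pending(): a standard interruptible wait loop now breaks out when TWA_SIGNAL task_work is queued, even though no real signal is pending, so the queued work gets run on the way back to user mode.

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (condition)
			break;
		if (signal_pending(current))	/* also true for TIF_NOTIFY_SIGNAL now */
			return -ERESTARTSYS;
		schedule();
	}
	__set_current_state(TASK_RUNNING);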
......@@ -31,6 +31,7 @@ struct kernel_clone_args {
/* Number of elements in *set_tid */
size_t set_tid_size;
int cgroup;
int io_thread;
struct cgroup *cgrp;
struct css_set *cset;
};
......@@ -85,6 +86,7 @@ extern void exit_files(struct task_struct *);
extern void exit_itimers(struct task_struct *);
extern pid_t kernel_clone(struct kernel_clone_args *kargs);
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
struct mm_struct *copy_init_mm(void);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
......
......@@ -421,6 +421,9 @@ extern int __sys_accept4_file(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags,
unsigned long nofile);
extern struct file *do_accept(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_socket(int family, int type, int protocol);
......@@ -436,5 +439,6 @@ extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
int __user *usockaddr_len);
extern int __sys_socketpair(int family, int type, int protocol,
int __user *usockvec);
extern int __sys_shutdown_sock(struct socket *sock, int how);
extern int __sys_shutdown(int fd, int how);
#endif /* _LINUX_SOCKET_H */
......@@ -22,6 +22,8 @@ enum task_work_notify_mode {
int task_work_add(struct task_struct *task, struct callback_head *twork,
enum task_work_notify_mode mode);
struct callback_head *task_work_cancel_match(struct task_struct *task,
bool (*match)(struct callback_head *, void *data), void *data);
struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
void task_work_run(void);
......
......@@ -202,4 +202,27 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
}
/*
* called by exit_to_user_mode_loop() if ti_work & _TIF_NOTIFY_SIGNAL. This
* is currently used by TWA_SIGNAL based task_work, which requires breaking
* wait loops to ensure that task_work is noticed and run.
*/
static inline void tracehook_notify_signal(void)
{
clear_thread_flag(TIF_NOTIFY_SIGNAL);
smp_mb__after_atomic();
if (current->task_works)
task_work_run();
}
/*
* Called when we have work to process from exit_to_user_mode_loop()
*/
static inline void set_notify_signal(struct task_struct *task)
{
if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
!wake_up_state(task, TASK_INTERRUPTIBLE))
kick_process(task);
}
#endif /* <linux/tracehook.h> */
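Editorial note: a hedged sketch of the producer side that pairs with tracehook_notify_signal() above. Queueing task_work with TWA_SIGNAL ends up calling set_notify_signal() on the target, which sets TIF_NOTIFY_SIGNAL and wakes (or kicks) the task; the work is then drained on the exit-to-user path. The callback and helper names here are made up for illustration.

	static void my_cb(struct callback_head *head)
	{
		/* runs later, in the context of the target task */
	}

	static int queue_notify(struct task_struct *task, struct callback_head *work)
	{
		init_task_work(work, my_cb);
		/* TWA_SIGNAL breaks signal_pending()-style wait loops so the work
		 * is noticed promptly, without delivering a real signal */
		return task_work_add(task, work, TWA_SIGNAL);
	}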
......@@ -26,6 +26,12 @@ enum iter_type {
ITER_DISCARD = 64,
};
struct iov_iter_state {
size_t iov_offset;
size_t count;
unsigned long nr_segs;
};
struct iov_iter {
/*
* Bit 0 is the read/write bit, set if we're writing.
......@@ -55,6 +61,14 @@ static inline enum iter_type iov_iter_type(const struct iov_iter *i)
return i->type & ~(READ | WRITE);
}
static inline void iov_iter_save_state(struct iov_iter *iter,
struct iov_iter_state *state)
{
state->iov_offset = iter->iov_offset;
state->count = iter->count;
state->nr_segs = iter->nr_segs;
}
static inline bool iter_is_iovec(const struct iov_iter *i)
{
return iov_iter_type(i) == ITER_IOVEC;
......@@ -226,6 +240,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
size_t maxsize, size_t *start);
int iov_iter_npages(const struct iov_iter *i, int maxpages);
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
......
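Editorial note: a hedged sketch of the intended save/restore pattern (e.g. for a read retry path) using the helpers added above; everything except the new iov_iter_save_state()/iov_iter_restore() API is a placeholder.

	struct iov_iter_state state;
	ssize_t ret;

	iov_iter_save_state(iter, &state);
	ret = call_read_iter(file, kiocb, iter);
	if (ret == -EAGAIN) {
		/* the filesystem may have advanced the iterator; rewind it */
		iov_iter_restore(iter, &state);
		/* ... arrange a blocking retry ... */
	}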
......@@ -1334,4 +1334,11 @@ static inline int skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res)
return res->ingress ? netif_receive_skb(skb) : dev_queue_xmit(skb);
}
/* Make sure qdisc is no longer in SCHED state. */
static inline void qdisc_synchronize(const struct Qdisc *q)
{
while (test_bit(__QDISC_STATE_SCHED, &q->state))
msleep(1);
}
#endif
......@@ -12,11 +12,11 @@ struct io_wq_work;
/**
* io_uring_create - called after a new io_uring context was prepared
*
* @fd: corresponding file descriptor
* @ctx: pointer to a ring context structure
* @fd: corresponding file descriptor
* @ctx: pointer to a ring context structure
* @sq_entries: actual SQ size
* @cq_entries: actual CQ size
* @flags: SQ ring flags, provided to io_uring_setup(2)
* @flags: SQ ring flags, provided to io_uring_setup(2)
*
* Allows to trace io_uring creation and provide pointer to a context, that can
* be used later to find correlated events.
......@@ -49,15 +49,15 @@ TRACE_EVENT(io_uring_create,
);
/**
* io_uring_register - called after a buffer/file/eventfd was succesfully
* io_uring_register - called after a buffer/file/eventfd was successfully
* registered for a ring
*
* @ctx: pointer to a ring context structure
* @opcode: describes which operation to perform
* @ctx: pointer to a ring context structure
* @opcode: describes which operation to perform
* @nr_user_files: number of registered files
* @nr_user_bufs: number of registered buffers
* @cq_ev_fd: whether eventfs registered or not
* @ret: return code
* @ret: return code
*
* Allows to trace fixed files/buffers/eventfds, that could be registered to
* avoid an overhead of getting references to them for every operation. This
......@@ -142,16 +142,16 @@ TRACE_EVENT(io_uring_queue_async_work,
TP_ARGS(ctx, rw, req, work, flags),
TP_STRUCT__entry (
__field( void *, ctx )
__field( int, rw )
__field( void *, req )
__field( void *, ctx )
__field( int, rw )
__field( void *, req )
__field( struct io_wq_work *, work )
__field( unsigned int, flags )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->rw = rw;
__entry->rw = rw;
__entry->req = req;
__entry->work = work;
__entry->flags = flags;
......@@ -196,10 +196,10 @@ TRACE_EVENT(io_uring_defer,
/**
* io_uring_link - called before the io_uring request added into link_list of
* another request
* another request
*
* @ctx: pointer to a ring context structure
* @req: pointer to a linked request
* @ctx: pointer to a ring context structure
* @req: pointer to a linked request
* @target_req: pointer to a previous request, that would contain @req
*
* Allows to track linked requests, to understand dependencies between requests
......@@ -212,8 +212,8 @@ TRACE_EVENT(io_uring_link,
TP_ARGS(ctx, req, target_req),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( void *, ctx )
__field( void *, req )
__field( void *, target_req )
),
......@@ -244,7 +244,7 @@ TRACE_EVENT(io_uring_cqring_wait,
TP_ARGS(ctx, min_events),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, ctx )
__field( int, min_events )
),
......@@ -272,7 +272,7 @@ TRACE_EVENT(io_uring_fail_link,
TP_ARGS(req, link),
TP_STRUCT__entry (
__field( void *, req )
__field( void *, req )
__field( void *, link )
),
......@@ -290,38 +290,42 @@ TRACE_EVENT(io_uring_fail_link,
* @ctx: pointer to a ring context structure
* @user_data: user data associated with the request
* @res: result of the request
* @cflags: completion flags
*
*/
TRACE_EVENT(io_uring_complete,
TP_PROTO(void *ctx, u64 user_data, long res),
TP_PROTO(void *ctx, u64 user_data, int res, unsigned cflags),
TP_ARGS(ctx, user_data, res),
TP_ARGS(ctx, user_data, res, cflags),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u64, user_data )
__field( long, res )
__field( int, res )
__field( unsigned, cflags )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->user_data = user_data;
__entry->res = res;
__entry->cflags = cflags;
),
TP_printk("ring %p, user_data 0x%llx, result %ld",
TP_printk("ring %p, user_data 0x%llx, result %d, cflags %x",
__entry->ctx, (unsigned long long)__entry->user_data,
__entry->res)
__entry->res, __entry->cflags)
);
/**
* io_uring_submit_sqe - called before submitting one SQE
*
* @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
* @opcode: opcode of request
* @user_data: user data associated with the request
* @flags request flags
* @force_nonblock: whether a context blocking or not
* @sq_thread: true if sq_thread has submitted this SQE
*
......@@ -330,41 +334,60 @@ TRACE_EVENT(io_uring_complete,
*/
TRACE_EVENT(io_uring_submit_sqe,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock,
bool sq_thread),
TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data, u32 flags,
bool force_nonblock, bool sq_thread),
TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread),
TP_ARGS(ctx, req, opcode, user_data, flags, force_nonblock, sq_thread),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( u8, opcode )
__field( u64, user_data )
__field( u32, flags )
__field( bool, force_nonblock )
__field( bool, sq_thread )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->flags = flags;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = sq_thread;
),
TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->force_nonblock, __entry->sq_thread)
TP_printk("ring %p, req %p, op %d, data 0x%llx, flags %u, "
"non block %d, sq_thread %d", __entry->ctx, __entry->req,
__entry->opcode, (unsigned long long)__entry->user_data,
__entry->flags, __entry->force_nonblock, __entry->sq_thread)
);
/*
* io_uring_poll_arm - called after arming a poll wait if successful
*
* @ctx: pointer to a ring context structure
* @req: pointer to the armed request
* @opcode: opcode of request
* @user_data: user data associated with the request
* @mask: request poll events mask
* @events: registered events of interest
*
* Allows to track which fds are waiting for and what are the events of
* interest.
*/
TRACE_EVENT(io_uring_poll_arm,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events),
TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data,
int mask, int events),
TP_ARGS(ctx, opcode, user_data, mask, events),
TP_ARGS(ctx, req, opcode, user_data, mask, events),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
......@@ -373,16 +396,17 @@ TRACE_EVENT(io_uring_poll_arm,
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
__entry->events = events;
),
TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask, __entry->events)
TP_printk("ring %p, req %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
__entry->ctx, __entry->req, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask, __entry->events)
);
TRACE_EVENT(io_uring_poll_wake,
......@@ -437,27 +461,40 @@ TRACE_EVENT(io_uring_task_add,
__entry->mask)
);
/*
* io_uring_task_run - called when task_work_run() executes the poll events
* notification callbacks
*
* @ctx: pointer to a ring context structure
* @req: pointer to the armed request
* @opcode: opcode of request
* @user_data: user data associated with the request
*
* Allows to track when notified poll events are processed
*/
TRACE_EVENT(io_uring_task_run,
TP_PROTO(void *ctx, u8 opcode, u64 user_data),
TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data),
TP_ARGS(ctx, opcode, user_data),
TP_ARGS(ctx, req, opcode, user_data),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( u8, opcode )
__field( u64, user_data )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->opcode = opcode;
__entry->user_data = user_data;
),
TP_printk("ring %p, op %d, data 0x%llx",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data)
TP_printk("ring %p, req %p, op %d, data 0x%llx",
__entry->ctx, __entry->req, __entry->opcode,
(unsigned long long) __entry->user_data)
);
#endif /* _TRACE_IO_URING_H */
......
......@@ -42,23 +42,25 @@ struct io_uring_sqe {
__u32 statx_flags;
__u32 fadvise_advice;
__u32 splice_flags;
__u32 rename_flags;
__u32 unlink_flags;
__u32 hardlink_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
union {
struct {
/* pack this to avoid bogus arm OABI complaints */
union {
/* index into fixed buffers, if used */
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
} __attribute__((packed));
/* personality to use, if used */
__u16 personality;
__s32 splice_fd_in;
};
__u64 __pad2[3];
/* index into fixed buffers, if used */
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
} __attribute__((packed));
/* personality to use, if used */
__u16 personality;
union {
__s32 splice_fd_in;
__u32 file_index;
};
__u64 __pad2[2];
};
enum {
......@@ -132,6 +134,9 @@ enum {
IORING_OP_PROVIDE_BUFFERS,
IORING_OP_REMOVE_BUFFERS,
IORING_OP_TEE,
IORING_OP_SHUTDOWN,
IORING_OP_RENAMEAT,
IORING_OP_UNLINKAT,
/* this goes last, obviously */
IORING_OP_LAST,
......@@ -145,14 +150,34 @@ enum {
/*
* sqe->timeout_flags
*/
#define IORING_TIMEOUT_ABS (1U << 0)
#define IORING_TIMEOUT_ABS (1U << 0)
#define IORING_TIMEOUT_UPDATE (1U << 1)
#define IORING_TIMEOUT_BOOTTIME (1U << 2)
#define IORING_TIMEOUT_REALTIME (1U << 3)
#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
* sqe->splice_flags
* extends splice(2) flags
*/
#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */
/*
* POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
* command flags for POLL_ADD are stored in sqe->len.
*
* IORING_POLL_ADD_MULTI Multishot poll. Sets IORING_CQE_F_MORE if
* the poll handler will continue to report
* CQEs on behalf of the same SQE.
*
* IORING_POLL_UPDATE Update existing poll request, matching
* sqe->addr as the old user_data field.
*/
#define IORING_POLL_ADD_MULTI (1U << 0)
#define IORING_POLL_UPDATE_EVENTS (1U << 1)
#define IORING_POLL_UPDATE_USER_DATA (1U << 2)
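Editorial note: a hedged userspace sketch of arming a multishot poll with the raw SQE fields (liburing would normally do this). It assumes the poll_events member of the sqe union from the full uapi header, which is not visible in this excerpt, plus <poll.h> and <string.h>.

	static void prep_poll_multishot(struct io_uring_sqe *sqe, int fd, __u64 data)
	{
		memset(sqe, 0, sizeof(*sqe));
		sqe->opcode = IORING_OP_POLL_ADD;
		sqe->fd = fd;
		sqe->poll_events = POLLIN;	  /* events of interest */
		sqe->len = IORING_POLL_ADD_MULTI; /* CQEs will carry IORING_CQE_F_MORE */
		sqe->user_data = data;
	}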
/*
* IO completion data structure (Completion Queue Entry)
*/
......@@ -166,8 +191,10 @@ struct io_uring_cqe {
* cqe->flags
*
* IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
* IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries
*/
#define IORING_CQE_F_BUFFER (1U << 0)
#define IORING_CQE_F_MORE (1U << 1)
enum {
IORING_CQE_BUFFER_SHIFT = 16,
......@@ -226,6 +253,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_GETEVENTS (1U << 0)
#define IORING_ENTER_SQ_WAKEUP (1U << 1)
#define IORING_ENTER_SQ_WAIT (1U << 2)
#define IORING_ENTER_EXT_ARG (1U << 3)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
......@@ -253,6 +281,10 @@ struct io_uring_params {
#define IORING_FEAT_CUR_PERSONALITY (1U << 4)
#define IORING_FEAT_FAST_POLL (1U << 5)
#define IORING_FEAT_POLL_32BITS (1U << 6)
#define IORING_FEAT_SQPOLL_NONFIXED (1U << 7)
#define IORING_FEAT_EXT_ARG (1U << 8)
#define IORING_FEAT_NATIVE_WORKERS (1U << 9)
#define IORING_FEAT_RSRC_TAGS (1U << 10)
/*
* io_uring_register(2) opcodes and arguments
......@@ -272,16 +304,62 @@ enum {
IORING_REGISTER_RESTRICTIONS = 11,
IORING_REGISTER_ENABLE_RINGS = 12,
/* extended with tagging */
IORING_REGISTER_FILES2 = 13,
IORING_REGISTER_FILES_UPDATE2 = 14,
IORING_REGISTER_BUFFERS2 = 15,
IORING_REGISTER_BUFFERS_UPDATE = 16,
/* set/clear io-wq thread affinities */
IORING_REGISTER_IOWQ_AFF = 17,
IORING_UNREGISTER_IOWQ_AFF = 18,
/* set/get max number of io-wq workers */
IORING_REGISTER_IOWQ_MAX_WORKERS = 19,
/* this goes last */
IORING_REGISTER_LAST
};
/* io-wq worker categories */
enum {
IO_WQ_BOUND,
IO_WQ_UNBOUND,
};
/* deprecated, see struct io_uring_rsrc_update */
struct io_uring_files_update {
__u32 offset;
__u32 resv;
__aligned_u64 /* __s32 * */ fds;
};
struct io_uring_rsrc_register {
__u32 nr;
__u32 resv;
__u64 resv2;
__aligned_u64 data;
__aligned_u64 tags;
};
struct io_uring_rsrc_update {
__u32 offset;
__u32 resv;
__aligned_u64 data;
};
struct io_uring_rsrc_update2 {
__u32 offset;
__u32 resv;
__aligned_u64 data;
__aligned_u64 tags;
__u32 nr;
__u32 resv2;
};
/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP (-2)
#define IO_URING_OP_SUPPORTED (1U << 0)
struct io_uring_probe_op {
......@@ -329,4 +407,11 @@ enum {
IORING_RESTRICTION_LAST
};
struct io_uring_getevents_arg {
__u64 sigmask;
__u32 sigmask_sz;
__u32 pad;
__u64 ts;
};
#endif
......@@ -35,5 +35,9 @@ struct open_how {
#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".."
be scoped inside the dirfd
(similar to chroot(2)). */
#define RESOLVE_CACHED 0x20 /* Only complete if resolution can be
completed through cached lookup. May
return -EAGAIN if that's not
possible. */
#endif /* _UAPI_LINUX_OPENAT2_H */
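Editorial note: a hedged userspace sketch of the new RESOLVE_CACHED flag with openat2(2), assuming a libc that exposes SYS_openat2; on -EAGAIN the caller falls back to a normal, potentially blocking lookup.

	#include <errno.h>
	#include <fcntl.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/openat2.h>

	static int open_cached_or_fallback(const char *path)
	{
		struct open_how how;
		long fd;

		memset(&how, 0, sizeof(how));
		how.flags = O_RDONLY;
		how.resolve = RESOLVE_CACHED;

		fd = syscall(SYS_openat2, AT_FDCWD, path, &how, sizeof(how));
		if (fd < 0 && errno == EAGAIN) {
			/* not resolvable from the dcache alone; retry without the flag */
			how.resolve = 0;
			fd = syscall(SYS_openat2, AT_FDCWD, path, &how, sizeof(how));
		}
		return (int)fd;
	}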
# SPDX-License-Identifier: GPL-2.0
#
# Makefile for io_uring
obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_IO_WQ) += io-wq.o
(This diff has been collapsed.)
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H
#include <linux/refcount.h>
#include <linux/io_uring.h>
struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HASHED = 2,
IO_WQ_WORK_UNBOUND = 4,
IO_WQ_WORK_NO_CANCEL = 8,
IO_WQ_WORK_CONCURRENT = 16,
IO_WQ_WORK_FILES = 32,
IO_WQ_WORK_FS = 64,
IO_WQ_WORK_MM = 128,
IO_WQ_WORK_CREDS = 256,
IO_WQ_WORK_BLKCG = 512,
IO_WQ_WORK_FSIZE = 1024,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
......@@ -52,6 +44,7 @@ static inline void wq_list_add_after(struct io_wq_work_node *node,
static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
node->next = NULL;
if (!list->first) {
list->last = node;
WRITE_ONCE(list->first, node);
......@@ -59,7 +52,6 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
list->last->next = node;
list->last = node;
}
node->next = NULL;
}
static inline void wq_list_cut(struct io_wq_work_list *list,
......@@ -95,7 +87,6 @@ static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work {
struct io_wq_work_node list;
struct io_identity *identity;
unsigned flags;
};
......@@ -107,37 +98,48 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
return container_of(work->list.next, struct io_wq_work, list);
}
typedef void (free_work_fn)(struct io_wq_work *);
typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *);
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef void (io_wq_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
struct io_wq_hash {
refcount_t refs;
unsigned long map;
struct wait_queue_head wait;
};
static inline void io_wq_put_hash(struct io_wq_hash *hash)
{
if (refcount_dec_and_test(&hash->refs))
kfree(hash);
}
struct io_wq_data {
struct io_wq_hash *hash;
struct task_struct *task;
io_wq_work_fn *do_work;
free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_exit_start(struct io_wq *wq);
void io_wq_put_and_exit(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
return work->flags & IO_WQ_WORK_HASHED;
}
void io_wq_cancel_all(struct io_wq *wq);
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data, bool cancel_all);
struct task_struct *io_wq_get_task(struct io_wq *wq);
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
......@@ -152,6 +154,7 @@ static inline void io_wq_worker_running(struct task_struct *tsk)
static inline bool io_wq_current_is_worker(void)
{
return in_task() && (current->flags & PF_IO_WORKER);
return in_task() && (current->flags & PF_IO_WORKER) &&
current->pf_io_worker;
}
#endif
......@@ -135,7 +135,15 @@ static __always_inline void exit_to_user_mode(void)
}
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal(struct pt_regs *regs) { }
void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
{
if (ti_work & _TIF_NOTIFY_SIGNAL)
tracehook_notify_signal();
arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
}
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
......@@ -157,8 +165,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
if (ti_work & _TIF_PATCH_PENDING)
klp_update_patch_state(current);
if (ti_work & _TIF_SIGPENDING)
arch_do_signal(regs);
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
handle_signal_work(regs, ti_work);
if (ti_work & _TIF_NOTIFY_RESUME) {
tracehook_notify_resume(regs);
......
......@@ -8,6 +8,9 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
do {
int ret;
if (ti_work & _TIF_NOTIFY_SIGNAL)
tracehook_notify_signal();
if (ti_work & _TIF_SIGPENDING) {
kvm_handle_signal_exit(vcpu);
return -EINTR;
......
......@@ -1973,7 +1973,7 @@ bool uprobe_deny_signal(void)
WARN_ON_ONCE(utask->state != UTASK_SSTEP);
if (signal_pending(t)) {
if (task_sigpending(t)) {
spin_lock_irq(&t->sighand->siglock);
clear_tsk_thread_flag(t, TIF_SIGPENDING);
spin_unlock_irq(&t->sighand->siglock);
......
......@@ -763,7 +763,7 @@ void __noreturn do_exit(long code)
schedule();
}
io_uring_files_cancel(tsk->files);
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
/* sync mm's RSS info before statistics gathering */
......
......@@ -948,6 +948,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
tsk->pf_io_worker = NULL;
account_kernel_stack(tsk, 1);
......@@ -1994,6 +1995,8 @@ static __latent_entropy struct task_struct *copy_process(
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
if (args->io_thread)
p->flags |= PF_IO_WORKER;
/*
* This _must_ happen before we call free_task(), i.e. before we jump
......@@ -2474,6 +2477,34 @@ struct mm_struct *copy_init_mm(void)
return dup_mm(NULL, &init_mm);
}
/*
* This is like kernel_clone(), but shaved down and tailored to just
* creating io_uring workers. It returns a created task, or an error pointer.
* The returned task is inactive, and the caller must fire it up through
* wake_up_new_task(p). All signals are blocked in the created task.
*/
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
CLONE_IO;
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.stack = (unsigned long)fn,
.stack_size = (unsigned long)arg,
.io_thread = 1,
};
struct task_struct *tsk;
tsk = copy_process(NULL, 0, node, &args);
if (!IS_ERR(tsk)) {
sigfillset(&tsk->blocked);
sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
}
return tsk;
}
/*
* Ok, this is the main fork-routine.
*
......
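Editorial note: a hedged sketch of how a caller such as io_uring's worker setup might use create_io_thread(); the worker function and thread name are placeholders. As the comment above notes, the returned task stays inactive until wake_up_new_task() is called.

	static int my_worker_fn(void *data)
	{
		/* worker loop would live here */
		return 0;
	}

	static struct task_struct *start_io_worker(void *data, int node)
	{
		struct task_struct *tsk;

		tsk = create_io_thread(my_worker_fn, data, node);
		if (IS_ERR(tsk))
			return tsk;
		set_task_comm(tsk, "iou-wrk");
		wake_up_new_task(tsk);
		return tsk;
	}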
......@@ -986,6 +986,7 @@ void __rcu_irq_enter_check_tick(void)
}
raw_spin_unlock_rcu_node(rdp->mynode);
}
NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick);
#endif /* CONFIG_NO_HZ_FULL */
/**
......
......@@ -21,7 +21,7 @@
#include <asm/tlb.h>
#include "../workqueue_internal.h"
#include "../../fs/io-wq.h"
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
#include "pelt.h"
......
......@@ -986,7 +986,7 @@ static inline bool wants_signal(int sig, struct task_struct *p)
if (task_is_stopped_or_traced(p))
return false;
return task_curr(p) || !signal_pending(p);
return task_curr(p) || !task_sigpending(p);
}
static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
......@@ -2526,6 +2526,18 @@ bool get_signal(struct ksignal *ksig)
struct signal_struct *signal = current->signal;
int signr;
/*
* For non-generic architectures, check for TIF_NOTIFY_SIGNAL so
* that the arch handlers don't all have to do it. If we get here
* without TIF_SIGPENDING, just exit after running signal work.
*/
if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
if (test_thread_flag(TIF_NOTIFY_SIGNAL))
tracehook_notify_signal();
if (!task_sigpending(current))
return false;
}
if (unlikely(uprobe_deny_signal()))
return false;
......@@ -2819,7 +2831,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
/* Remove the signals this thread can handle. */
sigandsets(&retarget, &retarget, &t->blocked);
if (!signal_pending(t))
if (!task_sigpending(t))
signal_wake_up(t, 0);
if (sigisemptyset(&retarget))
......@@ -2853,7 +2865,7 @@ void exit_signals(struct task_struct *tsk)
cgroup_threadgroup_change_end(tsk);
if (!signal_pending(tsk))
if (!task_sigpending(tsk))
goto out;
unblocked = tsk->blocked;
......@@ -2897,7 +2909,7 @@ long do_no_restart_syscall(struct restart_block *param)
static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
{
if (signal_pending(tsk) && !thread_group_empty(tsk)) {
if (task_sigpending(tsk) && !thread_group_empty(tsk)) {
sigset_t newblocked;
/* A set of now blocked but previously unblocked signals. */
sigandnsets(&newblocked, newset, &current->blocked);
......
......@@ -70,18 +70,17 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
}
/**
* task_work_cancel - cancel a pending work added by task_work_add()
* task_work_cancel_match - cancel a pending work added by task_work_add()
* @task: the task which should execute the work
* @func: identifies the work to remove
*
* Find the last queued pending work with ->func == @func and remove
* it from queue.
* @match: match function to call
*
* RETURNS:
* The found work or NULL if not found.
*/
struct callback_head *
task_work_cancel(struct task_struct *task, task_work_func_t func)
task_work_cancel_match(struct task_struct *task,
bool (*match)(struct callback_head *, void *data),
void *data)
{
struct callback_head **pprev = &task->task_works;
struct callback_head *work;
......@@ -97,7 +96,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
while ((work = READ_ONCE(*pprev))) {
if (work->func != func)
if (!match(work, data))
pprev = &work->next;
else if (cmpxchg(pprev, work, work->next) == work)
break;
......@@ -107,6 +106,28 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
return work;
}
static bool task_work_func_match(struct callback_head *cb, void *data)
{
return cb->func == data;
}
/**
* task_work_cancel - cancel a pending work added by task_work_add()
* @task: the task which should execute the work
* @func: identifies the work to remove
*
* Find the last queued pending work with ->func == @func and remove
* it from queue.
*
* RETURNS:
* The found work or NULL if not found.
*/
struct callback_head *
task_work_cancel(struct task_struct *task, task_work_func_t func)
{
return task_work_cancel_match(task, task_work_func_match, func);
}
/**
* task_work_run - execute the works added by task_work_add()
*
......
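Editorial note: a hedged sketch of cancelling queued task_work by payload rather than by function pointer, using the new task_work_cancel_match() helper; the match criterion here is made up for illustration.

	static bool match_by_head(struct callback_head *cb, void *data)
	{
		return cb == data;	/* cancel one specific queued callback_head */
	}

	static void cancel_my_work(struct task_struct *task, struct callback_head *work)
	{
		task_work_cancel_match(task, match_by_head, work);
	}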
......@@ -1836,24 +1836,38 @@ int import_single_range(int rw, void __user *buf, size_t len,
}
EXPORT_SYMBOL(import_single_range);
int iov_iter_for_each_range(struct iov_iter *i, size_t bytes,
int (*f)(struct kvec *vec, void *context),
void *context)
/**
* iov_iter_restore() - Restore a &struct iov_iter to the same state as when
* iov_iter_save_state() was called.
*
* @i: &struct iov_iter to restore
* @state: state to restore from
*
* Used after iov_iter_save_state() to bring restore @i, if operations may
* have advanced it.
*
* Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
*/
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
struct kvec w;
int err = -EINVAL;
if (!bytes)
return 0;
iterate_all_kinds(i, bytes, v, -EINVAL, ({
w.iov_base = kmap(v.bv_page) + v.bv_offset;
w.iov_len = v.bv_len;
err = f(&w, context);
kunmap(v.bv_page);
err;}), ({
w = v;
err = f(&w, context);})
)
return err;
if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
!iov_iter_is_kvec(i))
return;
i->iov_offset = state->iov_offset;
i->count = state->count;
/*
* For the *vec iters, nr_segs + iov is constant - if we increment
* the vec, then we also decrement the nr_segs count. Hence we don't
* need to track both of these, just one is enough and we can deduct
* the other from that. ITER_KVEC and ITER_IOVEC are the same struct
* size, so we can just increment the iov pointer as they are unionized.
* ITER_BVEC _may_ be the same size on some archs, but on others it is
* not. Be safe and handle it separately.
*/
BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
if (iov_iter_is_bvec(i))
i->bvec -= state->nr_segs - i->nr_segs;
else
i->iov -= state->nr_segs - i->nr_segs;
i->nr_segs = state->nr_segs;
}
EXPORT_SYMBOL(iov_iter_for_each_range);
......@@ -1427,6 +1427,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev,
free:
kfree(table);
out:
mdev->sysctl = NULL;
return -ENOBUFS;
}
......@@ -1436,6 +1437,9 @@ static void mpls_dev_sysctl_unregister(struct net_device *dev,
struct net *net = dev_net(dev);
struct ctl_table *table;
if (!mdev->sysctl)
return;
table = mdev->sysctl->ctl_table_arg;
unregister_net_sysctl_table(mdev->sysctl);
kfree(table);
......
......@@ -1619,6 +1619,7 @@ static void taprio_reset(struct Qdisc *sch)
int i;
hrtimer_cancel(&q->advance_timer);
if (q->qdiscs) {
for (i = 0; i < dev->num_tx_queues; i++)
if (q->qdiscs[i])
......@@ -1642,6 +1643,7 @@ static void taprio_destroy(struct Qdisc *sch)
* happens in qdisc_create(), after taprio_init() has been called.
*/
hrtimer_cancel(&q->advance_timer);
qdisc_synchronize(sch);
taprio_disable_offload(dev, q, NULL);
......
......@@ -1688,30 +1688,22 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog)
return __sys_listen(fd, backlog);
}
int __sys_accept4_file(struct file *file, unsigned file_flags,
struct file *do_accept(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags,
unsigned long nofile)
int __user *upeer_addrlen, int flags)
{
struct socket *sock, *newsock;
struct file *newfile;
int err, len, newfd;
int err, len;
struct sockaddr_storage address;
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
sock = sock_from_file(file, &err);
if (!sock)
goto out;
return ERR_PTR(err);
err = -ENFILE;
newsock = sock_alloc();
if (!newsock)
goto out;
return ERR_PTR(-ENFILE);
newsock->type = sock->type;
newsock->ops = sock->ops;
......@@ -1722,18 +1714,9 @@ int __sys_accept4_file(struct file *file, unsigned file_flags,
*/
__module_get(newsock->ops->owner);
newfd = __get_unused_fd_flags(flags, nofile);
if (unlikely(newfd < 0)) {
err = newfd;
sock_release(newsock);
goto out;
}
newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
if (IS_ERR(newfile)) {
err = PTR_ERR(newfile);
put_unused_fd(newfd);
goto out;
}
if (IS_ERR(newfile))
return newfile;
err = security_socket_accept(sock, newsock);
if (err)
......@@ -1758,16 +1741,38 @@ int __sys_accept4_file(struct file *file, unsigned file_flags,
}
/* File flags are not inherited via accept() unlike another OSes. */
fd_install(newfd, newfile);
err = newfd;
out:
return err;
return newfile;
out_fd:
fput(newfile);
put_unused_fd(newfd);
goto out;
return ERR_PTR(err);
}
int __sys_accept4_file(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags,
unsigned long nofile)
{
struct file *newfile;
int newfd;
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
newfd = __get_unused_fd_flags(flags, nofile);
if (unlikely(newfd < 0))
return newfd;
newfile = do_accept(file, file_flags, upeer_sockaddr, upeer_addrlen,
flags);
if (IS_ERR(newfile)) {
put_unused_fd(newfd);
return PTR_ERR(newfile);
}
fd_install(newfd, newfile);
return newfd;
}
/*
......@@ -2181,6 +2186,17 @@ SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
* Shutdown a socket.
*/
int __sys_shutdown_sock(struct socket *sock, int how)
{
int err;
err = security_socket_shutdown(sock, how);
if (!err)
err = sock->ops->shutdown(sock, how);
return err;
}
int __sys_shutdown(int fd, int how)
{
int err, fput_needed;
......@@ -2188,9 +2204,7 @@ int __sys_shutdown(int fd, int how)
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock != NULL) {
err = security_socket_shutdown(sock, how);
if (!err)
err = sock->ops->shutdown(sock, how);
err = __sys_shutdown_sock(sock, how);
fput_light(sock->file, fput_needed);
}
return err;
......
......@@ -421,6 +421,9 @@ extern int __sys_accept4_file(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags,
unsigned long nofile);
extern struct file *do_accept(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_socket(int family, int type, int protocol);
......@@ -436,5 +439,6 @@ extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
int __user *usockaddr_len);
extern int __sys_socketpair(int family, int type, int protocol,
int __user *usockvec);
extern int __sys_shutdown_sock(struct socket *sock, int how);
extern int __sys_shutdown(int fd, int how);
#endif /* _LINUX_SOCKET_H */