diff --git a/CREDITS b/CREDITS index 5befd2d714d0037548bed049a979dc4fcee1d300..ed93a49b8cbc9868d9fececcdfb8198e957bdd1d 100644 --- a/CREDITS +++ b/CREDITS @@ -3281,7 +3281,9 @@ S: France N: Aleksa Sarai E: cyphar@cyphar.com W: https://www.cyphar.com/ -D: `pids` cgroup subsystem +D: /sys/fs/cgroup/pids +D: openat2(2) +S: Sydney, Australia N: Dipankar Sarma E: dipankar@in.ibm.com diff --git a/MAINTAINERS b/MAINTAINERS index 3ac378aa5a8581f6f50a913de562c024e2bb9892..c2698684ca7cdb62bc51a90b49c43d1e703a1f31 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5695,6 +5695,7 @@ S: Maintained F: fs/* F: include/linux/fs.h F: include/uapi/linux/fs.h +F: include/uapi/linux/openat2.h FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER M: Riku Voipio diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index a5bce64f540e5fb71ef6745d011a8978855618f4..b9022ebfde17d9a4b461f21a2181fce767ab5d15 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -417,3 +417,4 @@ 425 common io_uring_setup sys_io_uring_setup 426 common io_uring_enter sys_io_uring_enter 427 common io_uring_register sys_io_uring_register +437 common openat2 sys_openat2 diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index b4ad426dc0959b49fddcb76c974c004697051e62..e7bcb929142fa26f4c9b2d62088e58cd431b8dba 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -44,7 +44,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 428 +#define __NR_compat_syscalls 438 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 3e40028c7f81ae6938f614e2c16ede8a827c0de8..ed109c48b5a7146fe242df34f8ab1aa6df09b09c 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -825,6 +825,8 @@ __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) #define __NR_io_uring_register 427 __SYSCALL(__NR_io_uring_register, sys_io_uring_register) +#define __NR_openat2 437 +__SYSCALL(__NR_openat2, sys_openat2) /* * Please add new compat syscalls above this comment and update diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index c67da38d5e428079115d8a1c6eaa7f486392eb71..17567ea6a3011b11ea2313fdb124b1e114b41fd5 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -394,3 +394,4 @@ 425 common io_uring_setup sys_io_uring_setup sys_io_uring_setup 426 common io_uring_enter sys_io_uring_enter sys_io_uring_enter 427 common io_uring_register sys_io_uring_register sys_io_uring_register +437 common openat2 sys_openat2 sys_openat2 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 2eefd2a7c1cef0b93723a44d71d811d85cbe8c1f..77c46280c67555a89f7c2cdb19c95f50914fdd79 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -401,3 +401,4 @@ 425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup 426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter 427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register +427 i386 openat2 sys_openat2 __ia32_sys_openat2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 65c026185e612f833daa22523d245d9cae4f8e34..831924d9385c9ac1864d54811db57ba502b0dc80 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -346,6 +346,7 @@ 425 common io_uring_setup __x64_sys_io_uring_setup 426 common io_uring_enter __x64_sys_io_uring_enter 427 common io_uring_register __x64_sys_io_uring_register +437 common openat2 __x64_sys_openat2 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/open.c b/fs/open.c index 878478745924b8ac6e324ab0bb7b2624a4d6684d..89be988a373fa7326da8db8b943da5f7a156ae60 100644 --- a/fs/open.c +++ b/fs/open.c @@ -946,48 +946,84 @@ struct file *open_with_fake_path(const struct path *path, int flags, } EXPORT_SYMBOL(open_with_fake_path); -static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) +#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) +#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) + +static inline struct open_how build_open_how(int flags, umode_t mode) +{ + struct open_how how = { + .flags = flags & VALID_OPEN_FLAGS, + .mode = mode & S_IALLUGO, + }; + + /* O_PATH beats everything else. */ + if (how.flags & O_PATH) + how.flags &= O_PATH_FLAGS; + /* Modes should only be set for create-like flags. */ + if (!WILL_CREATE(how.flags)) + how.mode = 0; + return how; +} + +static inline int build_open_flags(const struct open_how *how, + struct open_flags *op) { + int flags = how->flags; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); + /* Must never be set by userspace */ + flags &= ~(FMODE_NONOTIFY | O_CLOEXEC); + /* - * Clear out all open flags we don't know about so that we don't report - * them in fcntl(F_GETFD) or similar interfaces. + * Older syscalls implicitly clear all of the invalid flags or argument + * values before calling build_open_flags(), but openat2(2) checks all + * of its arguments. */ - flags &= VALID_OPEN_FLAGS; + if (flags & ~VALID_OPEN_FLAGS) + return -EINVAL; + if (how->resolve & ~VALID_RESOLVE_FLAGS) + return -EINVAL; - if (flags & (O_CREAT | __O_TMPFILE)) - op->mode = (mode & S_IALLUGO) | S_IFREG; - else + /* Deal with the mode. */ + if (WILL_CREATE(flags)) { + if (how->mode & ~S_IALLUGO) + return -EINVAL; + op->mode = how->mode | S_IFREG; + } else { + if (how->mode != 0) + return -EINVAL; op->mode = 0; - - /* Must never be set by userspace */ - flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC; + } /* - * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only - * check for O_DSYNC if the need any syncing at all we enforce it's - * always set instead of having to deal with possibly weird behaviour - * for malicious applications setting only __O_SYNC. + * In order to ensure programs get explicit errors when trying to use + * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it + * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we + * have to require userspace to explicitly set it. */ - if (flags & __O_SYNC) - flags |= O_DSYNC; - if (flags & __O_TMPFILE) { if ((flags & O_TMPFILE_MASK) != O_TMPFILE) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; - } else if (flags & O_PATH) { - /* - * If we have O_PATH in the open flag. Then we - * cannot have anything other than the below set of flags - */ - flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; + } + if (flags & O_PATH) { + /* O_PATH only permits certain other flags to be set. */ + if (flags & ~O_PATH_FLAGS) + return -EINVAL; acc_mode = 0; } + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. + */ + if (flags & __O_SYNC) + flags |= O_DSYNC; + op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ @@ -1013,6 +1049,18 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + + if (how->resolve & RESOLVE_NO_XDEV) + lookup_flags |= LOOKUP_NO_XDEV; + if (how->resolve & RESOLVE_NO_MAGICLINKS) + lookup_flags |= LOOKUP_NO_MAGICLINKS; + if (how->resolve & RESOLVE_NO_SYMLINKS) + lookup_flags |= LOOKUP_NO_SYMLINKS; + if (how->resolve & RESOLVE_BENEATH) + lookup_flags |= LOOKUP_BENEATH; + if (how->resolve & RESOLVE_IN_ROOT) + lookup_flags |= LOOKUP_IN_ROOT; + op->lookup_flags = lookup_flags; return 0; } @@ -1031,8 +1079,11 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, mode, &op); - return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op); + struct open_how how = build_open_how(flags, mode); + int err = build_open_flags(&how, &op); + if (err) + return ERR_PTR(err); + return do_filp_open(AT_FDCWD, name, &op); } /** @@ -1063,17 +1114,19 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, const char *filename, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, mode, &op); + struct open_how how = build_open_how(flags, mode); + int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_file_open_root(dentry, mnt, filename, &op); } EXPORT_SYMBOL(file_open_root); -long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) +static long do_sys_openat2(int dfd, const char __user *filename, + struct open_how *how) { struct open_flags op; - int fd = build_open_flags(flags, mode, &op); + int fd = build_open_flags(how, &op); struct filename *tmp; if (fd) @@ -1083,7 +1136,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) if (IS_ERR(tmp)) return PTR_ERR(tmp); - fd = get_unused_fd_flags(flags); + fd = get_unused_fd_flags(how->flags); if (fd >= 0) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { @@ -1098,12 +1151,16 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) return fd; } -SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) +long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { - if (force_o_largefile()) - flags |= O_LARGEFILE; + struct open_how how = build_open_how(flags, mode); + return do_sys_openat2(dfd, filename, &how); +} - return do_sys_open(AT_FDCWD, filename, flags, mode); + +SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) +{ + return ksys_open(filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, @@ -1111,10 +1168,32 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, { if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(dfd, filename, flags, mode); } +SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, + struct open_how __user *, how, size_t, usize) +{ + int err; + struct open_how tmp; + + BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); + + if (unlikely(usize < OPEN_HOW_SIZE_VER0)) + return -EINVAL; + + err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); + if (err) + return err; + + /* O_LARGEFILE is only allowed for non-O_PATH. */ + if (!(tmp.flags & O_PATH) && force_o_largefile()) + tmp.flags |= O_LARGEFILE; + + return do_sys_openat2(dfd, filename, &tmp); +} + #ifdef CONFIG_COMPAT /* * Exactly like sys_open(), except that it doesn't set the diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index 27dc7a60693e15bca6b2394381870f5d427c7fd2..8d01591456733187841b4423ac76a09935acab93 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -2,15 +2,29 @@ #ifndef _LINUX_FCNTL_H #define _LINUX_FCNTL_H +#include #include -/* list of all valid flags for the open/openat flags argument: */ +/* List of all valid flags for the open/openat flags argument: */ #define VALID_OPEN_FLAGS \ (O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \ O_APPEND | O_NDELAY | O_NONBLOCK | O_NDELAY | __O_SYNC | O_DSYNC | \ FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \ O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE) +/* List of all valid flags for the how->upgrade_mask argument: */ +#define VALID_UPGRADE_FLAGS \ + (UPGRADE_NOWRITE | UPGRADE_NOREAD) + +/* List of all valid flags for the how->resolve argument: */ +#define VALID_RESOLVE_FLAGS \ + (RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \ + RESOLVE_BENEATH | RESOLVE_IN_ROOT) + +/* List of all open_how "versions". */ +#define OPEN_HOW_SIZE_VER0 24 /* sizeof first published struct */ +#define OPEN_HOW_SIZE_LATEST OPEN_HOW_SIZE_VER0 + #ifndef force_o_largefile #define force_o_largefile() (BITS_PER_LONG != 32) #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a49a526740d73ef370e7f293c4476aa0e09c58bd..3915a3d466df7ed1983a351ce83dc9c1fb175415 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -70,6 +70,7 @@ struct sigaltstack; struct rseq; union bpf_attr; struct io_uring_params; +struct open_how; #include #include @@ -429,6 +430,8 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, umode_t mode); +asmlinkage long sys_openat2(int dfd, const char __user *filename, + struct open_how *how, size_t size); asmlinkage long sys_close(unsigned int fd); asmlinkage long sys_vhangup(void); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 7edead1eaff46b306114609b9e98ed31f4254a61..1104c817fe6542502230cd8c3207a6fbcf92b25a 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -743,8 +743,11 @@ __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) #define __NR_io_uring_register 427 __SYSCALL(__NR_io_uring_register, sys_io_uring_register) +#define __NR_openat2 437 +__SYSCALL(__NR_openat2, sys_openat2) + #undef __NR_syscalls -#define __NR_syscalls 428 +#define __NR_syscalls 438 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 6448cdd9a350d3a0c6513c7a76aad29a68689bd4..1d867375869c093678d910efdd221f51b5a5c556 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -3,6 +3,7 @@ #define _UAPI_LINUX_FCNTL_H #include +#include #define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0) #define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1) @@ -90,5 +91,4 @@ #define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */ #define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */ - #endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h new file mode 100644 index 0000000000000000000000000000000000000000..58b1eb71136007727e8535eaeb19e2ea4a00fb9a --- /dev/null +++ b/include/uapi/linux/openat2.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_OPENAT2_H +#define _UAPI_LINUX_OPENAT2_H + +#include + +/* + * Arguments for how openat2(2) should open the target path. If only @flags and + * @mode are non-zero, then openat2(2) operates very similarly to openat(2). + * + * However, unlike openat(2), unknown or invalid bits in @flags result in + * -EINVAL rather than being silently ignored. @mode must be zero unless one of + * {O_CREAT, O_TMPFILE} are set. + * + * @flags: O_* flags. + * @mode: O_CREAT/O_TMPFILE file mode. + * @resolve: RESOLVE_* flags. + */ +struct open_how { + __u64 flags; + __u64 mode; + __u64 resolve; +}; + +/* how->resolve flags for openat2(2). */ +#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings + (includes bind-mounts). */ +#define RESOLVE_NO_MAGICLINKS 0x02 /* Block traversal through procfs-style + "magic-links". */ +#define RESOLVE_NO_SYMLINKS 0x04 /* Block traversal through all symlinks + (implies OEXT_NO_MAGICLINKS) */ +#define RESOLVE_BENEATH 0x08 /* Block "lexical" trickery like + "..", symlinks, and absolute + paths which escape the dirfd. */ +#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".." + be scoped inside the dirfd + (similar to chroot(2)). */ + +#endif /* _UAPI_LINUX_OPENAT2_H */