diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt index 90912925425e43bffea3244f671c9573ceb34f41..70a3c94d19413b33d73b226e1b091698950343fe 100644 --- a/Documentation/vm/userfaultfd.txt +++ b/Documentation/vm/userfaultfd.txt @@ -46,11 +46,13 @@ is a corner case that would currently return -EBUSY). When first opened the userfaultfd must be enabled invoking the UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or a later API version) which will specify the read/POLLIN protocol -userland intends to speak on the UFFD. The UFFDIO_API ioctl if -successful (i.e. if the requested uffdio_api.api is spoken also by the -running kernel), will return into uffdio_api.features and -uffdio_api.ioctls two 64bit bitmasks of respectively the activated -feature of the read(2) protocol and the generic ioctl available. +userland intends to speak on the UFFD and the uffdio_api.features +userland requires. The UFFDIO_API ioctl if successful (i.e. if the +requested uffdio_api.api is spoken also by the running kernel and the +requested features are going to be enabled) will return into +uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of +respectively all the available features of the read(2) protocol and +the generic ioctl available. Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should be invoked (if present in the returned uffdio_api.ioctls bitmask) to diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0756d97b0666940ac96db0a9cbb8f9ba918cb979..1f2ddaaf3c03bd02ad43a4d20dfac1e0fe48ed04 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -50,7 +50,7 @@ struct userfaultfd_ctx { }; struct userfaultfd_wait_queue { - unsigned long address; + struct uffd_msg msg; wait_queue_t wq; bool pending; struct userfaultfd_ctx *ctx; @@ -77,7 +77,8 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, /* len == 0 means wake all */ start = range->start; len = range->len; - if (len && (start > uwq->address || start + len <= uwq->address)) + if (len && (start > uwq->msg.arg.pagefault.address || + start + len <= uwq->msg.arg.pagefault.address)) goto out; ret = wake_up_state(wq->private, mode); if (ret) @@ -135,28 +136,43 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) } } -static inline unsigned long userfault_address(unsigned long address, - unsigned int flags, - unsigned long reason) +static inline void msg_init(struct uffd_msg *msg) { - BUILD_BUG_ON(PAGE_SHIFT < UFFD_BITS); - address &= PAGE_MASK; + BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); + /* + * Must use memset to zero out the paddings or kernel data is + * leaked to userland. + */ + memset(msg, 0, sizeof(struct uffd_msg)); +} + +static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned int flags, + unsigned long reason) +{ + struct uffd_msg msg; + msg_init(&msg); + msg.event = UFFD_EVENT_PAGEFAULT; + msg.arg.pagefault.address = address; if (flags & FAULT_FLAG_WRITE) /* - * Encode "write" fault information in the LSB of the - * address read by userland, without depending on - * FAULT_FLAG_WRITE kernel internal value. + * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the + * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE + * was not set in a UFFD_EVENT_PAGEFAULT, it means it + * was a read fault, otherwise if set it means it's + * a write fault. */ - address |= UFFD_BIT_WRITE; + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; if (reason & VM_UFFD_WP) /* - * Encode "reason" fault information as bit number 1 - * in the address read by userland. If bit number 1 is - * clear it means the reason is a VM_FAULT_MISSING - * fault. + * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the + * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was + * not set in a UFFD_EVENT_PAGEFAULT, it means it was + * a missing fault, otherwise if set it means it's a + * write protect fault. */ - address |= UFFD_BIT_WP; - return address; + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + return msg; } /* @@ -242,7 +258,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.address = userfault_address(address, flags, reason); + uwq.msg = userfault_msg(address, flags, reason); uwq.pending = true; uwq.ctx = ctx; @@ -398,7 +414,7 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) } static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, - __u64 *addr) + struct uffd_msg *msg) { ssize_t ret; DECLARE_WAITQUEUE(wait, current); @@ -416,8 +432,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, * disappear from under us. */ uwq->pending = false; - /* careful to always initialize addr if ret == 0 */ - *addr = uwq->address; + /* careful to always initialize msg if ret == 0 */ + *msg = uwq->msg; spin_unlock(&ctx->fault_wqh.lock); ret = 0; break; @@ -447,8 +463,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, { struct userfaultfd_ctx *ctx = file->private_data; ssize_t _ret, ret = 0; - /* careful to always initialize addr if ret == 0 */ - __u64 uninitialized_var(addr); + struct uffd_msg msg; int no_wait = file->f_flags & O_NONBLOCK; if (ctx->state == UFFD_STATE_WAIT_API) @@ -456,16 +471,16 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, BUG_ON(ctx->state != UFFD_STATE_RUNNING); for (;;) { - if (count < sizeof(addr)) + if (count < sizeof(msg)) return ret ? ret : -EINVAL; - _ret = userfaultfd_ctx_read(ctx, no_wait, &addr); + _ret = userfaultfd_ctx_read(ctx, no_wait, &msg); if (_ret < 0) return ret ? ret : _ret; - if (put_user(addr, (__u64 __user *) buf)) + if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg))) return ret ? ret : -EFAULT; - ret += sizeof(addr); - buf += sizeof(addr); - count -= sizeof(addr); + ret += sizeof(msg); + buf += sizeof(msg); + count -= sizeof(msg); /* * Allow to read more than one fault at time but only * block if waiting for the very first one. @@ -873,17 +888,15 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, if (ctx->state != UFFD_STATE_WAIT_API) goto out; ret = -EFAULT; - if (copy_from_user(&uffdio_api, buf, sizeof(__u64))) + if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - if (uffdio_api.api != UFFD_API) { - /* careful not to leak info, we only read the first 8 bytes */ + if (uffdio_api.api != UFFD_API || uffdio_api.features) { memset(&uffdio_api, 0, sizeof(uffdio_api)); if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; ret = -EINVAL; goto out; } - /* careful not to leak info, we only read the first 8 bytes */ uffdio_api.features = UFFD_API_FEATURES; uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 330206016249c541db4b90c6b1d6591166324e44..a5f8825381ef07f07ef63d346f5d30c807428266 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -11,9 +11,15 @@ #include +#include + #define UFFD_API ((__u64)0xAA) -/* FIXME: add "|UFFD_FEATURE_WP" to UFFD_API_FEATURES after implementing it */ -#define UFFD_API_FEATURES (UFFD_FEATURE_WRITE_BIT) +/* + * After implementing the respective features it will become: + * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ + * UFFD_FEATURE_EVENT_FORK) + */ +#define UFFD_API_FEATURES (0) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -45,26 +51,60 @@ #define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ struct uffdio_range) -/* - * Valid bits below PAGE_SHIFT in the userfault address read through - * the read() syscall. - */ -#define UFFD_BIT_WRITE (1<<0) /* this was a write fault, MISSING or WP */ -#define UFFD_BIT_WP (1<<1) /* handle_userfault() reason VM_UFFD_WP */ -#define UFFD_BITS 2 /* two above bits used for UFFD_BIT_* mask */ +/* read() structure */ +struct uffd_msg { + __u8 event; + + __u8 reserved1; + __u16 reserved2; + __u32 reserved3; + + union { + struct { + __u64 flags; + __u64 address; + } pagefault; + + struct { + /* unused reserved fields */ + __u64 reserved1; + __u64 reserved2; + __u64 reserved3; + } reserved; + } arg; +} __packed; /* - * Features reported in uffdio_api.features field + * Start at 0x12 and not at 0 to be more strict against bugs. */ -#define UFFD_FEATURE_WRITE_BIT (1<<0) /* Corresponds to UFFD_BIT_WRITE */ -#define UFFD_FEATURE_WP_BIT (1<<1) /* Corresponds to UFFD_BIT_WP */ +#define UFFD_EVENT_PAGEFAULT 0x12 +#if 0 /* not available yet */ +#define UFFD_EVENT_FORK 0x13 +#endif + +/* flags for UFFD_EVENT_PAGEFAULT */ +#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ +#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ struct uffdio_api { - /* userland asks for an API number */ + /* userland asks for an API number and the features to enable */ __u64 api; - - /* kernel answers below with the available features for the API */ + /* + * Kernel answers below with the all available features for + * the API, this notifies userland of which events and/or + * which flags for each event are enabled in the current + * kernel. + * + * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE + * are to be considered implicitly always enabled in all kernels as + * long as the uffdio_api.api requested matches UFFD_API. + */ +#if 0 /* not available yet */ +#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) +#define UFFD_FEATURE_EVENT_FORK (1<<1) +#endif __u64 features; + __u64 ioctls; };