Commit b349b118 authored by Linus Torvalds

Merge tag 'for-5.20/io_uring-2022-07-29' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:

 - As per a (valid) complaint in the last merge window, fs/io_uring.c has
   grown quite large these days. io_uring isn't really tied to fs
   either, as it supports a wide variety of functionality outside of
   that.

   Move the code to io_uring/ and split it into files that each
   implement a specific request type, splitting some code out into
   helpers as well. The code is organized a lot better like this, and
   io_uring.c is now < 4K LOC (me).

 - Deprecate the epoll_ctl opcode. It'll still work, just trigger a
   warning once if used. If we don't get any complaints on this, and I
   don't expect any, then we can fully remove it in a future release
   (me).

 - Improve the cancel hash locking (Hao)

 - kbuf cleanups (Hao)

 - Efficiency improvements to the task_work handling (Dylan, Pavel)

 - Provided buffer improvements (Dylan)

 - Add multishot support for recv/recvmsg. This is similar to the
   multishot support we have for accept (or poll), where a single SQE
   can trigger every time data is received. For applications that
   expect to do more than a few receives on an instantiated socket, this
   greatly improves efficiency (Dylan). A usage sketch follows this list.

 - Efficiency improvements for poll handling (Pavel)

 - Poll cancelation improvements (Pavel)

 - Allow specifying a range for direct descriptor allocations (Pavel)

 - Cleanup the cqe32 handling (Pavel)

 - Move io_uring types to greatly cleanup the tracing (Pavel)

 - Tons of great code cleanups and improvements (Pavel)

 - Add a way to do sync cancelations rather than through the sqe -> cqe
   interface, as that's a lot easier to use for some use cases (me).

 - Add support to IORING_OP_MSG_RING for sending direct descriptors to a
   different ring. This avoids the usually problematic SCM case, as we
   disallow those. (me)

 - Make the per-command alloc cache we use for apoll generic, place
   limits on it, and use it for netmsg as well (me).

 - Various cleanups (me, Michal, Gustavo, Uros)
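
A minimal usage sketch for the multishot receive support mentioned above,
assuming liburing's io_uring_prep_recv_multishot() helper (liburing 2.3+),
an already initialized ring, and a registered provided-buffer group bgid:

    struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

    io_uring_prep_recv_multishot(sqe, sockfd, NULL, 0, 0);
    sqe->flags |= IOSQE_BUFFER_SELECT;  /* data lands in provided buffers */
    sqe->buf_group = bgid;
    io_uring_submit(&ring);

    /* a single SQE keeps posting CQEs while IORING_CQE_F_MORE is set */
    struct io_uring_cqe *cqe;
    while (io_uring_wait_cqe(&ring, &cqe) == 0) {
            if (!(cqe->flags & IORING_CQE_F_MORE)) {
                    /* multishot ended (error or out of buffers); re-arm if needed */
            }
            io_uring_cqe_seen(&ring, cqe);
    }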

* tag 'for-5.20/io_uring-2022-07-29' of git://git.kernel.dk/linux-block: (172 commits)
  io_uring: ensure REQ_F_ISREG is set async offload
  net: fix compat pointer in get_compat_msghdr()
  io_uring: Don't require reinitable percpu_ref
  io_uring: fix types in io_recvmsg_multishot_overflow
  io_uring: Use atomic_long_try_cmpxchg in __io_account_mem
  io_uring: support multishot in recvmsg
  net: copy from user before calling __get_compat_msghdr
  net: copy from user before calling __copy_msghdr
  io_uring: support 0 length iov in buffer select in compat
  io_uring: fix multishot ending when not polled
  io_uring: add netmsg cache
  io_uring: impose max limit on apoll cache
  io_uring: add abstraction around apoll cache
  io_uring: move apoll cache to poll.c
  io_uring: consolidate hash_locked io-wq handling
  io_uring: clear REQ_F_HASH_LOCKED on hash removal
  io_uring: don't race double poll setting REQ_F_ASYNC_DATA
  io_uring: don't miss setting REQ_F_DOUBLE_POLL
  io_uring: disable multishot recvmsg
  io_uring: only trace one of complete or overflow
  ...
......@@ -7811,9 +7811,6 @@ F: include/linux/fs.h
F: include/linux/fs_types.h
F: include/uapi/linux/fs.h
F: include/uapi/linux/openat2.h
X: fs/io-wq.c
X: fs/io-wq.h
X: fs/io_uring.c
FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
M: Riku Voipio <riku.voipio@iki.fi>
......@@ -10521,9 +10518,7 @@ L: io-uring@vger.kernel.org
S: Maintained
T: git git://git.kernel.dk/linux-block
T: git git://git.kernel.dk/liburing
F: fs/io-wq.c
F: fs/io-wq.h
F: fs/io_uring.c
F: io_uring/
F: include/linux/io_uring.h
F: include/uapi/linux/io_uring.h
F: tools/io_uring/
......
......@@ -1097,6 +1097,7 @@ export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps
ifeq ($(KBUILD_EXTMOD),)
core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/
core-$(CONFIG_BLOCK) += block/
core-$(CONFIG_IO_URING) += io_uring/
vmlinux-dirs := $(patsubst %/,%,$(filter %/, \
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \
......
......@@ -34,8 +34,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
......
[This diff is collapsed.]
#ifndef IO_URING_TYPES_H
#define IO_URING_TYPES_H
#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
#include <uapi/linux/io_uring.h>
struct io_wq_work_node {
struct io_wq_work_node *next;
};
struct io_wq_work_list {
struct io_wq_work_node *first;
struct io_wq_work_node *last;
};
struct io_wq_work {
struct io_wq_work_node list;
unsigned flags;
/* place it here instead of io_kiocb as it fills padding and saves 4B */
int cancel_seq;
};
struct io_fixed_file {
/* file * with additional FFS_* flags */
unsigned long file_ptr;
};
struct io_file_table {
struct io_fixed_file *files;
unsigned long *bitmap;
unsigned int alloc_hint;
};
struct io_hash_bucket {
spinlock_t lock;
struct hlist_head list;
} ____cacheline_aligned_in_smp;
struct io_hash_table {
struct io_hash_bucket *hbs;
unsigned hash_bits;
};
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
u32 tail ____cacheline_aligned_in_smp;
};
/*
* This data is shared with the application through the mmap at offsets
* IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
*
* The offsets to the member fields are published through struct
* io_sqring_offsets when calling io_uring_setup.
*/
struct io_rings {
/*
* Head and tail offsets into the ring; the offsets need to be
* masked to get valid indices.
*
* The kernel controls head of the sq ring and the tail of the cq ring,
* and the application controls tail of the sq ring and the head of the
* cq ring.
*/
struct io_uring sq, cq;
/*
* Bitmasks to apply to head and tail offsets (constant, equals
* ring_entries - 1)
*/
u32 sq_ring_mask, cq_ring_mask;
/* Ring sizes (constant, power of 2) */
u32 sq_ring_entries, cq_ring_entries;
/*
* Number of invalid entries dropped by the kernel due to
* invalid index stored in array
*
* Written by the kernel, shouldn't be modified by the
* application (i.e. get number of "new events" by comparing to
* cached value).
*
* After a new SQ head value was read by the application this
* counter includes all submissions that were dropped reaching
* the new SQ head (and possibly more).
*/
u32 sq_dropped;
/*
* Runtime SQ flags
*
* Written by the kernel, shouldn't be modified by the
* application.
*
* The application needs a full memory barrier before checking
* for IORING_SQ_NEED_WAKEUP after updating the sq tail.
*/
atomic_t sq_flags;
/*
* Runtime CQ flags
*
* Written by the application, shouldn't be modified by the
* kernel.
*/
u32 cq_flags;
/*
* Number of completion events lost because the queue was full;
* this should be avoided by the application by making sure
* there are not more requests pending than there is space in
* the completion queue.
*
* Written by the kernel, shouldn't be modified by the
* application (i.e. get number of "new events" by comparing to
* cached value).
*
* As completion events come in out of order this counter is not
* ordered with any other data.
*/
u32 cq_overflow;
/*
* Ring buffer of completion events.
*
* The kernel writes completion events fresh every time they are
* produced, so the application is allowed to modify pending
* entries.
*/
struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
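/*
 * Hypothetical userspace sketch (not part of this header): how an application
 * maps this shared structure via the IORING_OFF_SQ_RING / IORING_OFF_CQ_RING
 * offsets published by io_uring_setup(2). io_uring_setup() here stands for a
 * raw syscall wrapper and QUEUE_DEPTH is a placeholder.
 */
struct io_uring_params p = { 0 };
int ring_fd = io_uring_setup(QUEUE_DEPTH, &p);

size_t sq_sz = p.sq_off.array + p.sq_entries * sizeof(__u32);
void *sq_ptr = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);

size_t cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
void *cq_ptr = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_CQ_RING);

/* the sq/cq heads and tails live at the published offsets, e.g.: */
unsigned *cq_head = (unsigned *)((char *)cq_ptr + p.cq_off.head);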
struct io_restriction {
DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
u8 sqe_flags_allowed;
u8 sqe_flags_required;
bool registered;
};
struct io_submit_link {
struct io_kiocb *head;
struct io_kiocb *last;
};
struct io_submit_state {
/* inline/task_work completion list, under ->uring_lock */
struct io_wq_work_node free_list;
/* batch completion logic */
struct io_wq_work_list compl_reqs;
struct io_submit_link link;
bool plug_started;
bool need_plug;
unsigned short submit_nr;
struct blk_plug plug;
};
struct io_ev_fd {
struct eventfd_ctx *cq_ev_fd;
unsigned int eventfd_async: 1;
struct rcu_head rcu;
};
struct io_alloc_cache {
struct hlist_head list;
unsigned int nr_cached;
};
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
struct percpu_ref refs;
struct io_rings *rings;
unsigned int flags;
enum task_work_notify_mode notify_method;
unsigned int compat: 1;
unsigned int drain_next: 1;
unsigned int restricted: 1;
unsigned int off_timeout_used: 1;
unsigned int drain_active: 1;
unsigned int drain_disabled: 1;
unsigned int has_evfd: 1;
unsigned int syscall_iopoll: 1;
} ____cacheline_aligned_in_smp;
/* submission data */
struct {
struct mutex uring_lock;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
* mmapped by the application using the IORING_OFF_SQES offset.
*
* This indirection could e.g. be used to assign fixed
* io_uring_sqe entries to operations and only submit them to
* the queue when needed.
*
* The kernel modifies neither the indices array nor the entries
* array.
*/
u32 *sq_array;
struct io_uring_sqe *sq_sqes;
unsigned cached_sq_head;
unsigned sq_entries;
/*
* Fixed resources fast path, should be accessed only under
* uring_lock, and updated through io_uring_register(2)
*/
struct io_rsrc_node *rsrc_node;
int rsrc_cached_refs;
atomic_t cancel_seq;
struct io_file_table file_table;
unsigned nr_user_files;
unsigned nr_user_bufs;
struct io_mapped_ubuf **user_bufs;
struct io_submit_state submit_state;
struct io_buffer_list *io_bl;
struct xarray io_bl_xa;
struct list_head io_buffers_cache;
struct io_hash_table cancel_table_locked;
struct list_head cq_overflow_list;
struct io_alloc_cache apoll_cache;
struct io_alloc_cache netmsg_cache;
} ____cacheline_aligned_in_smp;
/* IRQ completion list, under ->completion_lock */
struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
unsigned long check_cq;
unsigned int file_alloc_start;
unsigned int file_alloc_end;
struct xarray personalities;
u32 pers_next;
struct {
/*
* We cache a range of free CQEs we can use, once exhausted it
* should go through a slower range setup, see __io_get_cqe()
*/
struct io_uring_cqe *cqe_cached;
struct io_uring_cqe *cqe_sentinel;
unsigned cached_cq_tail;
unsigned cq_entries;
struct io_ev_fd __rcu *io_ev_fd;
struct wait_queue_head cq_wait;
unsigned cq_extra;
} ____cacheline_aligned_in_smp;
struct {
spinlock_t completion_lock;
/*
* ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
struct io_wq_work_list iopoll_list;
struct io_hash_table cancel_table;
bool poll_multi_queue;
struct list_head io_buffers_comp;
} ____cacheline_aligned_in_smp;
/* timeouts */
struct {
spinlock_t timeout_lock;
atomic_t cq_timeouts;
struct list_head timeout_list;
struct list_head ltimeout_list;
unsigned cq_last_tm_flush;
} ____cacheline_aligned_in_smp;
/* Keep this last, we don't need it for the fast path */
struct io_restriction restrictions;
struct task_struct *submitter_task;
/* slow path rsrc auxilary data, used by update/register */
struct io_rsrc_node *rsrc_backup_node;
struct io_mapped_ubuf *dummy_ubuf;
struct io_rsrc_data *file_data;
struct io_rsrc_data *buf_data;
struct delayed_work rsrc_put_work;
struct llist_head rsrc_put_llist;
struct list_head rsrc_ref_list;
spinlock_t rsrc_ref_lock;
struct list_head io_buffers_pages;
#if defined(CONFIG_UNIX)
struct socket *ring_sock;
#endif
/* hashed buffered write serialization */
struct io_wq_hash *hash_map;
/* Only used for accounting purposes */
struct user_struct *user;
struct mm_struct *mm_account;
/* ctx exit and cancelation */
struct llist_head fallback_llist;
struct delayed_work fallback_work;
struct work_struct exit_work;
struct list_head tctx_list;
struct completion ref_comp;
/* io-wq management, e.g. thread count */
u32 iowq_limits[2];
bool iowq_limits_set;
struct list_head defer_list;
unsigned sq_thread_idle;
/* protected by ->completion_lock */
unsigned evfd_last_cq_tail;
};
enum {
REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,
/* first byte is taken by user flags, shift it to not overlap */
REQ_F_FAIL_BIT = 8,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
REQ_F_LINK_TIMEOUT_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_BUFFER_RING_BIT,
REQ_F_REISSUE_BIT,
REQ_F_CREDS_BIT,
REQ_F_REFCOUNT_BIT,
REQ_F_ARM_LTIMEOUT_BIT,
REQ_F_ASYNC_DATA_BIT,
REQ_F_SKIP_LINK_CQES_BIT,
REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT,
REQ_F_PARTIAL_IO_BIT,
REQ_F_CQE32_INIT_BIT,
REQ_F_APOLL_MULTISHOT_BIT,
REQ_F_CLEAR_POLLIN_BIT,
REQ_F_HASH_LOCKED_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
};
enum {
/* ctx owns file */
REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
/* drain existing IO first */
REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
/* linked sqes */
REQ_F_LINK = BIT(REQ_F_LINK_BIT),
/* doesn't sever on completion < 0 */
REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
/* IOSQE_ASYNC */
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
/* IOSQE_CQE_SKIP_SUCCESS */
REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),
/* fail rest of links */
REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
/* on inflight list, should be cancelled and waited on exit reliably */
REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
/* read/write uses file position */
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
/* must not punt to workers */
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
/* has or had linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
/* needs cleanup */
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
/* already went through poll handler */
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
/* buffer selected from ring, needs commit */
REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
/* caller should reissue async */
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
/* supports async reads/writes */
REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
/* has creds assigned */
REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
/* skip refcounting if not set */
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
/* there is a linked timeout that has to be armed */
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
/* ->async_data allocated */
REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
/* don't post CQEs while failing linked requests */
REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
/* single poll may be active */
REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
/* double poll may be active */
REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
/* request has already done partial IO */
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
/* fast poll multishot mode */
REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
/* ->extra1 and ->extra2 are initialised */
REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),
/* recvmsg special flag, clear EPOLLIN */
REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),
/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
struct io_task_work {
struct llist_node node;
io_req_tw_func_t func;
};
struct io_cqe {
__u64 user_data;
__s32 res;
/* fd initially, then cflags for completion */
union {
__u32 flags;
int fd;
};
};
/*
* Each request type overlays its private data structure on top of this one.
* They must not exceed this one in size.
*/
struct io_cmd_data {
struct file *file;
/* each command gets 56 bytes of data */
__u8 data[56];
};
#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd)
#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr)
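/*
 * Hypothetical illustration (not part of this header): a request type keeps
 * its private state by overlaying io_cmd_data, with the struct file * first
 * so the union in io_kiocb below stays valid. "io_frobnicate" and its fields
 * are made-up names; see io_madvise/io_fadvise in advise.c for real users.
 */
struct io_frobnicate {
	struct file	*file;	/* must stay the first member */
	u64		addr;
	u32		len;
};
/* the overlay must not exceed io_cmd_data */
static_assert(sizeof(struct io_frobnicate) <= sizeof(struct io_cmd_data));
/* handlers recover their view via: struct io_frobnicate *f = io_kiocb_to_cmd(req); */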
struct io_kiocb {
union {
/*
* NOTE! Each of the io_kiocb union members has the file pointer
* as the first entry in their struct definition. So you can
* access the file pointer through any of the sub-structs,
* or directly as just 'file' in this struct.
*/
struct file *file;
struct io_cmd_data cmd;
};
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
/*
* Can be either a fixed buffer index, or used with provided buffers.
* For the latter, before issue it points to the buffer group ID,
* and after selection it points to the buffer ID itself.
*/
u16 buf_index;
unsigned int flags;
struct io_cqe cqe;
struct io_ring_ctx *ctx;
struct task_struct *task;
struct io_rsrc_node *rsrc_node;
union {
/* store used ubuf, so we can prevent reloading */
struct io_mapped_ubuf *imu;
/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
struct io_buffer *kbuf;
/*
* stores buffer ID for ring provided buffers, valid IFF
* REQ_F_BUFFER_RING is set.
*/
struct io_buffer_list *buf_list;
};
union {
/* used by request caches, completion batching and iopoll */
struct io_wq_work_node comp_list;
/* cache ->apoll->events */
__poll_t apoll_events;
};
atomic_t refs;
atomic_t poll_refs;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
union {
struct hlist_node hash_node;
struct {
u64 extra1;
u64 extra2;
};
};
/* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
void *async_data;
/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
struct io_kiocb *link;
/* custom credentials, valid IFF REQ_F_CREDS is set */
const struct cred *creds;
struct io_wq_work work;
};
struct io_overflow_cqe {
struct list_head list;
struct io_uring_cqe cqe;
};
#endif
......@@ -416,10 +416,9 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg,
struct user_msghdr __user *umsg, unsigned flags,
struct sockaddr __user **uaddr,
struct iovec **iov);
extern int __copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec __user **uiov, size_t *nsegs);
extern int __copy_msghdr(struct msghdr *kmsg,
struct user_msghdr *umsg,
struct sockaddr __user **save_addr);
/* helpers which do the actual work for syscalls */
extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
......
......@@ -46,9 +46,8 @@ struct compat_rtentry {
unsigned short rt_irtt; /* Initial RTT */
};
int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr, compat_uptr_t *ptr,
compat_size_t *len);
int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr *msg,
struct sockaddr __user **save_addr);
int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *,
struct sockaddr __user **, struct iovec **);
int put_cmsg_compat(struct msghdr*, int, int, int, void *);
......
......@@ -7,6 +7,7 @@
#include <linux/tracepoint.h>
#include <uapi/linux/io_uring.h>
#include <linux/io_uring_types.h>
#include <linux/io_uring.h>
struct io_wq_work;
......@@ -97,9 +98,7 @@ TRACE_EVENT(io_uring_register,
/**
* io_uring_file_get - called before getting references to an SQE file
*
* @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
* @user_data: user data associated with the request
* @fd: SQE file descriptor
*
* Allows to trace out how often an SQE file reference is obtained, which can
......@@ -108,9 +107,9 @@ TRACE_EVENT(io_uring_register,
*/
TRACE_EVENT(io_uring_file_get,
TP_PROTO(void *ctx, void *req, unsigned long long user_data, int fd),
TP_PROTO(struct io_kiocb *req, int fd),
TP_ARGS(ctx, req, user_data, fd),
TP_ARGS(req, fd),
TP_STRUCT__entry (
__field( void *, ctx )
......@@ -120,9 +119,9 @@ TRACE_EVENT(io_uring_file_get,
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->user_data = req->cqe.user_data;
__entry->fd = fd;
),
......@@ -133,22 +132,16 @@ TRACE_EVENT(io_uring_file_get,
/**
* io_uring_queue_async_work - called before submitting a new async work
*
* @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
* @user_data: user data associated with the request
* @opcode: opcode of request
* @flags request flags
* @work: pointer to a submitted io_wq_work
* @rw: type of workqueue, hashed or normal
*
* Allows to trace asynchronous work submission.
*/
TRACE_EVENT(io_uring_queue_async_work,
TP_PROTO(void *ctx, void * req, unsigned long long user_data, u8 opcode,
unsigned int flags, struct io_wq_work *work, int rw),
TP_PROTO(struct io_kiocb *req, int rw),
TP_ARGS(ctx, req, user_data, opcode, flags, work, rw),
TP_ARGS(req, rw),
TP_STRUCT__entry (
__field( void *, ctx )
......@@ -159,19 +152,19 @@ TRACE_EVENT(io_uring_queue_async_work,
__field( struct io_wq_work *, work )
__field( int, rw )
__string( op_str, io_uring_get_opcode(opcode) )
__string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->flags = flags;
__entry->opcode = opcode;
__entry->work = work;
__entry->user_data = req->cqe.user_data;
__entry->flags = req->flags;
__entry->opcode = req->opcode;
__entry->work = &req->work;
__entry->rw = rw;
__assign_str(op_str, io_uring_get_opcode(opcode));
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
......@@ -183,19 +176,16 @@ TRACE_EVENT(io_uring_queue_async_work,
/**
* io_uring_defer - called when an io_uring request is deferred
*
* @ctx: pointer to a ring context structure
* @req: pointer to a deferred request
* @user_data: user data associated with the request
* @opcode: opcode of request
*
* Allows to track deferred requests, to get an insight about what requests are
* not started immediately.
*/
TRACE_EVENT(io_uring_defer,
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode),
TP_PROTO(struct io_kiocb *req),
TP_ARGS(ctx, req, user_data, opcode),
TP_ARGS(req),
TP_STRUCT__entry (
__field( void *, ctx )
......@@ -203,16 +193,16 @@ TRACE_EVENT(io_uring_defer,
__field( unsigned long long, data )
__field( u8, opcode )
__string( op_str, io_uring_get_opcode(opcode) )
__string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->data = user_data;
__entry->opcode = opcode;
__entry->data = req->cqe.user_data;
__entry->opcode = req->opcode;
__assign_str(op_str, io_uring_get_opcode(opcode));
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s",
......@@ -224,7 +214,6 @@ TRACE_EVENT(io_uring_defer,
* io_uring_link - called before the io_uring request added into link_list of
* another request
*
* @ctx: pointer to a ring context structure
* @req: pointer to a linked request
* @target_req: pointer to a previous request, that would contain @req
*
......@@ -233,9 +222,9 @@ TRACE_EVENT(io_uring_defer,
*/
TRACE_EVENT(io_uring_link,
TP_PROTO(void *ctx, void *req, void *target_req),
TP_PROTO(struct io_kiocb *req, struct io_kiocb *target_req),
TP_ARGS(ctx, req, target_req),
TP_ARGS(req, target_req),
TP_STRUCT__entry (
__field( void *, ctx )
......@@ -244,7 +233,7 @@ TRACE_EVENT(io_uring_link,
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->target_req = target_req;
),
......@@ -285,10 +274,7 @@ TRACE_EVENT(io_uring_cqring_wait,
/**
* io_uring_fail_link - called before failing a linked request
*
* @ctx: pointer to a ring context structure
* @req: request, which links were cancelled
* @user_data: user data associated with the request
* @opcode: opcode of request
* @link: cancelled link
*
* Allows to track linked requests cancellation, to see not only that some work
......@@ -296,9 +282,9 @@ TRACE_EVENT(io_uring_cqring_wait,
*/
TRACE_EVENT(io_uring_fail_link,
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, void *link),
TP_PROTO(struct io_kiocb *req, struct io_kiocb *link),
TP_ARGS(ctx, req, user_data, opcode, link),
TP_ARGS(req, link),
TP_STRUCT__entry (
__field( void *, ctx )
......@@ -307,17 +293,17 @@ TRACE_EVENT(io_uring_fail_link,
__field( u8, opcode )
__field( void *, link )
__string( op_str, io_uring_get_opcode(opcode) )
__string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->opcode = opcode;
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->link = link;
__assign_str(op_str, io_uring_get_opcode(opcode));
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p",
......@@ -376,23 +362,17 @@ TRACE_EVENT(io_uring_complete,
/**
* io_uring_submit_sqe - called before submitting one SQE
*
* @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
* @user_data: user data associated with the request
* @opcode: opcode of request
* @flags request flags
* @force_nonblock: whether a context blocking or not
* @sq_thread: true if sq_thread has submitted this SQE
*
* Allows to track SQE submitting, to understand what was the source of it, SQ
* thread or io_uring_enter call.
*/
TRACE_EVENT(io_uring_submit_sqe,
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, u32 flags,
bool force_nonblock, bool sq_thread),
TP_PROTO(struct io_kiocb *req, bool force_nonblock),
TP_ARGS(ctx, req, user_data, opcode, flags, force_nonblock, sq_thread),
TP_ARGS(req, force_nonblock),
TP_STRUCT__entry (
__field( void *, ctx )
......@@ -403,19 +383,19 @@ TRACE_EVENT(io_uring_submit_sqe,
__field( bool, force_nonblock )
__field( bool, sq_thread )
__string( op_str, io_uring_get_opcode(opcode) )
__string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->opcode = opcode;
__entry->flags = flags;
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->flags = req->flags;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = sq_thread;
__entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;
__assign_str(op_str, io_uring_get_opcode(opcode));
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
......@@ -427,10 +407,7 @@ TRACE_EVENT(io_uring_submit_sqe,
/*
* io_uring_poll_arm - called after arming a poll wait if successful
*
* @ctx: pointer to a ring context structure
* @req: pointer to the armed request
* @user_data: user data associated with the request
* @opcode: opcode of request
* @mask: request poll events mask
* @events: registered events of interest
*
......@@ -439,10 +416,9 @@ TRACE_EVENT(io_uring_submit_sqe,
*/
TRACE_EVENT(io_uring_poll_arm,
TP_PROTO(void *ctx, void *req, u64 user_data, u8 opcode,
int mask, int events),
TP_PROTO(struct io_kiocb *req, int mask, int events),
TP_ARGS(ctx, req, user_data, opcode, mask, events),
TP_ARGS(req, mask, events),
TP_STRUCT__entry (
__field( void *, ctx )
......@@ -452,18 +428,18 @@ TRACE_EVENT(io_uring_poll_arm,
__field( int, mask )
__field( int, events )
__string( op_str, io_uring_get_opcode(opcode) )
__string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->opcode = opcode;
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->mask = mask;
__entry->events = events;
__assign_str(op_str, io_uring_get_opcode(opcode));
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x",
......@@ -475,18 +451,15 @@ TRACE_EVENT(io_uring_poll_arm,
/*
* io_uring_task_add - called after adding a task
*
* @ctx: pointer to a ring context structure
* @req: pointer to request
* @user_data: user data associated with the request
* @opcode: opcode of request
* @mask: request poll events mask
*
*/
TRACE_EVENT(io_uring_task_add,
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, int mask),
TP_PROTO(struct io_kiocb *req, int mask),
TP_ARGS(ctx, req, user_data, opcode, mask),
TP_ARGS(req, mask),
TP_STRUCT__entry (
__field( void *, ctx )
......@@ -495,17 +468,17 @@ TRACE_EVENT(io_uring_task_add,
__field( u8, opcode )
__field( int, mask )
__string( op_str, io_uring_get_opcode(opcode) )
__string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->opcode = opcode;
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->mask = mask;
__assign_str(op_str, io_uring_get_opcode(opcode));
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x",
......@@ -518,7 +491,6 @@ TRACE_EVENT(io_uring_task_add,
* io_uring_req_failed - called when an sqe is errored during submission
*
* @sqe: pointer to the io_uring_sqe that failed
* @ctx: pointer to a ring context structure
* @req: pointer to request
* @error: error it failed with
*
......@@ -526,9 +498,9 @@ TRACE_EVENT(io_uring_task_add,
*/
TRACE_EVENT(io_uring_req_failed,
TP_PROTO(const struct io_uring_sqe *sqe, void *ctx, void *req, int error),
TP_PROTO(const struct io_uring_sqe *sqe, struct io_kiocb *req, int error),
TP_ARGS(sqe, ctx, req, error),
TP_ARGS(sqe, req, error),
TP_STRUCT__entry (
__field( void *, ctx )
......@@ -552,7 +524,7 @@ TRACE_EVENT(io_uring_req_failed,
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = sqe->user_data;
__entry->opcode = sqe->opcode;
......@@ -622,12 +594,42 @@ TRACE_EVENT(io_uring_cqe_overflow,
__entry->ocqe = ocqe;
),
TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, "
TP_printk("ring %p, user_data 0x%llx, res %d, cflags 0x%x, "
"overflow_cqe %p",
__entry->ctx, __entry->user_data, __entry->res,
__entry->cflags, __entry->ocqe)
);
/*
* io_uring_task_work_run - ran task work
*
* @tctx: pointer to a io_uring_task
* @count: how many functions it ran
* @loops: how many loops it ran
*
*/
TRACE_EVENT(io_uring_task_work_run,
TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
TP_ARGS(tctx, count, loops),
TP_STRUCT__entry (
__field( void *, tctx )
__field( unsigned int, count )
__field( unsigned int, loops )
),
TP_fast_assign(
__entry->tctx = tctx;
__entry->count = count;
__entry->loops = loops;
),
TP_printk("tctx %p, count %u, loops %u",
__entry->tctx, __entry->count, __entry->loops)
);
#endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */
......
......@@ -10,6 +10,7 @@
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/time_types.h>
/*
* IO submission data structure (Submission Queue Entry)
......@@ -50,6 +51,7 @@ struct io_uring_sqe {
__u32 unlink_flags;
__u32 hardlink_flags;
__u32 xattr_flags;
__u32 msg_ring_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
......@@ -140,9 +142,12 @@ enum {
* IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN.
*/
#define IORING_SETUP_TASKRUN_FLAG (1U << 9)
#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */
#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */
/*
* Only one task is allowed to submit requests
*/
#define IORING_SETUP_SINGLE_ISSUER (1U << 12)
enum io_uring_op {
IORING_OP_NOP,
......@@ -229,10 +234,13 @@ enum io_uring_op {
*
* IORING_POLL_UPDATE Update existing poll request, matching
* sqe->addr as the old user_data field.
*
* IORING_POLL_LEVEL Level triggered poll.
*/
#define IORING_POLL_ADD_MULTI (1U << 0)
#define IORING_POLL_UPDATE_EVENTS (1U << 1)
#define IORING_POLL_UPDATE_USER_DATA (1U << 2)
#define IORING_POLL_ADD_LEVEL (1U << 3)
/*
* ASYNC_CANCEL flags.
......@@ -241,10 +249,12 @@ enum io_uring_op {
* IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the
* request 'user_data'
* IORING_ASYNC_CANCEL_ANY Match any request
* IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor
*/
#define IORING_ASYNC_CANCEL_ALL (1U << 0)
#define IORING_ASYNC_CANCEL_FD (1U << 1)
#define IORING_ASYNC_CANCEL_ANY (1U << 2)
#define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3)
/*
* send/sendmsg and recv/recvmsg flags (sqe->ioprio)
......@@ -253,14 +263,35 @@ enum io_uring_op {
* or receive and arm poll if that yields an
* -EAGAIN result, arm poll upfront and skip
* the initial transfer attempt.
*
* IORING_RECV_MULTISHOT Multishot recv. Sets IORING_CQE_F_MORE if
* the handler will continue to report
* CQEs on behalf of the same SQE.
*/
#define IORING_RECVSEND_POLL_FIRST (1U << 0)
#define IORING_RECV_MULTISHOT (1U << 1)
/*
* accept flags stored in sqe->ioprio
*/
#define IORING_ACCEPT_MULTISHOT (1U << 0)
/*
* IORING_OP_MSG_RING command types, stored in sqe->addr
*/
enum {
IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */
IORING_MSG_SEND_FD, /* send a registered fd to another ring */
};
/*
* IORING_OP_MSG_RING flags (sqe->msg_ring_flags)
*
* IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. Not
* applicable for IORING_MSG_DATA, obviously.
*/
#define IORING_MSG_RING_CQE_SKIP (1U << 0)
/*
* IO completion data structure (Completion Queue Entry)
*/
......@@ -420,6 +451,12 @@ enum {
IORING_REGISTER_PBUF_RING = 22,
IORING_UNREGISTER_PBUF_RING = 23,
/* sync cancelation API */
IORING_REGISTER_SYNC_CANCEL = 24,
/* register a range of fixed file slots for automatic slot allocation */
IORING_REGISTER_FILE_ALLOC_RANGE = 25,
/* this goes last */
IORING_REGISTER_LAST
};
......@@ -483,7 +520,7 @@ struct io_uring_probe {
__u8 ops_len; /* length of ops[] array below */
__u16 resv;
__u32 resv2[3];
struct io_uring_probe_op ops[0];
struct io_uring_probe_op ops[];
};
struct io_uring_restriction {
......@@ -555,4 +592,32 @@ struct io_uring_getevents_arg {
__u64 ts;
};
/*
* Argument for IORING_REGISTER_SYNC_CANCEL
*/
struct io_uring_sync_cancel_reg {
__u64 addr;
__s32 fd;
__u32 flags;
struct __kernel_timespec timeout;
__u64 pad[4];
};
/*
* Argument for IORING_REGISTER_FILE_ALLOC_RANGE
* The range is specified as [off, off + len)
*/
struct io_uring_file_index_range {
__u32 off;
__u32 len;
__u64 resv;
};
struct io_uring_recvmsg_out {
__u32 namelen;
__u32 controllen;
__u32 payloadlen;
__u32 flags;
};
#endif
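/*
 * Hypothetical userspace sketch (not part of this header): synchronously
 * cancel a request matching a given user_data through the new
 * IORING_REGISTER_SYNC_CANCEL opcode, assuming liburing's
 * io_uring_register_sync_cancel() wrapper. "user_data_to_cancel" is a
 * placeholder.
 */
struct io_uring_sync_cancel_reg reg = {
	.addr		= user_data_to_cancel,			/* matches sqe->user_data */
	.timeout	= { .tv_sec = -1, .tv_nsec = -1 },	/* -1/-1: wait without a timeout */
};
int ret = io_uring_register_sync_cancel(&ring, &reg);
/* 0 on success, -ENOENT if nothing matched, -ETIME if a timeout was set and expired */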
# SPDX-License-Identifier: GPL-2.0
#
# Makefile for io_uring
obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
sync.o advise.o filetable.o \
openclose.o uring_cmd.o epoll.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o
obj-$(CONFIG_IO_WQ) += io-wq.o
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/io_uring.h>
#include <uapi/linux/fadvise.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "advise.h"
struct io_fadvise {
struct file *file;
u64 offset;
u32 len;
u32 advice;
};
struct io_madvise {
struct file *file;
u64 addr;
u32 len;
u32 advice;
};
int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = io_kiocb_to_cmd(req);
if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
return -EINVAL;
ma->addr = READ_ONCE(sqe->addr);
ma->len = READ_ONCE(sqe->len);
ma->advice = READ_ONCE(sqe->fadvise_advice);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
io_req_set_res(req, ret, 0);
return IOU_OK;
#else
return -EOPNOTSUPP;
#endif
}
int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_fadvise *fa = io_kiocb_to_cmd(req);
if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
return -EINVAL;
fa->offset = READ_ONCE(sqe->off);
fa->len = READ_ONCE(sqe->len);
fa->advice = READ_ONCE(sqe->fadvise_advice);
return 0;
}
int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_fadvise *fa = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK) {
switch (fa->advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
case POSIX_FADV_SEQUENTIAL:
break;
default:
return -EAGAIN;
}
}
ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
// SPDX-License-Identifier: GPL-2.0
int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_madvise(struct io_kiocb *req, unsigned int issue_flags);
int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_fadvise(struct io_kiocb *req, unsigned int issue_flags);
#ifndef IOU_ALLOC_CACHE_H
#define IOU_ALLOC_CACHE_H
/*
* Don't allow the cache to grow beyond this size.
*/
#define IO_ALLOC_CACHE_MAX 512
struct io_cache_entry {
struct hlist_node node;
};
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
struct io_cache_entry *entry)
{
if (cache->nr_cached < IO_ALLOC_CACHE_MAX) {
cache->nr_cached++;
hlist_add_head(&entry->node, &cache->list);
return true;
}
return false;
}
static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
{
if (!hlist_empty(&cache->list)) {
struct hlist_node *node = cache->list.first;
hlist_del(node);
return container_of(node, struct io_cache_entry, node);
}
return NULL;
}
static inline void io_alloc_cache_init(struct io_alloc_cache *cache)
{
INIT_HLIST_HEAD(&cache->list);
cache->nr_cached = 0;
}
static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
void (*free)(struct io_cache_entry *))
{
while (!hlist_empty(&cache->list)) {
struct hlist_node *node = cache->list.first;
hlist_del(node);
free(container_of(node, struct io_cache_entry, node));
}
cache->nr_cached = 0;
}
#endif
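/*
 * Hypothetical caller sketch (not part of this header): a subsystem embeds
 * struct io_cache_entry in its per-request state and recycles allocations
 * through the cache, falling back to kmalloc()/kfree() once the cache is
 * empty or holds IO_ALLOC_CACHE_MAX entries. "my_state" is a made-up name;
 * the apoll and netmsg caches follow this pattern.
 */
struct my_state {
	struct io_cache_entry	cache;
	/* ... per-request fields ... */
};

static struct my_state *my_state_get(struct io_alloc_cache *cache)
{
	struct io_cache_entry *entry = io_alloc_cache_get(cache);

	if (entry)
		return container_of(entry, struct my_state, cache);
	return kmalloc(sizeof(struct my_state), GFP_KERNEL);
}

static void my_state_put(struct io_alloc_cache *cache, struct my_state *st)
{
	if (!io_alloc_cache_put(cache, &st->cache))
		kfree(st);
}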
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "tctx.h"
#include "poll.h"
#include "timeout.h"
#include "cancel.h"
struct io_cancel {
struct file *file;
u64 addr;
u32 flags;
s32 fd;
};
#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED)
static bool io_cancel_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_cancel_data *cd = data;
if (req->ctx != cd->ctx)
return false;
if (cd->flags & IORING_ASYNC_CANCEL_ANY) {
;
} else if (cd->flags & IORING_ASYNC_CANCEL_FD) {
if (req->file != cd->file)
return false;
} else {
if (req->cqe.user_data != cd->data)
return false;
}
if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
if (cd->seq == req->work.cancel_seq)
return false;
req->work.cancel_seq = cd->seq;
}
return true;
}
static int io_async_cancel_one(struct io_uring_task *tctx,
struct io_cancel_data *cd)
{
enum io_wq_cancel cancel_ret;
int ret = 0;
bool all;
if (!tctx || !tctx->io_wq)
return -ENOENT;
all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all);
switch (cancel_ret) {
case IO_WQ_CANCEL_OK:
ret = 0;
break;
case IO_WQ_CANCEL_RUNNING:
ret = -EALREADY;
break;
case IO_WQ_CANCEL_NOTFOUND:
ret = -ENOENT;
break;
}
return ret;
}
int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
unsigned issue_flags)
{
struct io_ring_ctx *ctx = cd->ctx;
int ret;
WARN_ON_ONCE(!io_wq_current_is_worker() && tctx != current->io_uring);
ret = io_async_cancel_one(tctx, cd);
/*
* Fall-through even for -EALREADY, as we may have poll armed
* that need unarming.
*/
if (!ret)
return 0;
ret = io_poll_cancel(ctx, cd, issue_flags);
if (ret != -ENOENT)
return ret;
spin_lock(&ctx->completion_lock);
if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
ret = io_timeout_cancel(ctx, cd);
spin_unlock(&ctx->completion_lock);
return ret;
}
int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_cancel *cancel = io_kiocb_to_cmd(req);
if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
if (sqe->off || sqe->len || sqe->splice_fd_in)
return -EINVAL;
cancel->addr = READ_ONCE(sqe->addr);
cancel->flags = READ_ONCE(sqe->cancel_flags);
if (cancel->flags & ~CANCEL_FLAGS)
return -EINVAL;
if (cancel->flags & IORING_ASYNC_CANCEL_FD) {
if (cancel->flags & IORING_ASYNC_CANCEL_ANY)
return -EINVAL;
cancel->fd = READ_ONCE(sqe->fd);
}
return 0;
}
static int __io_async_cancel(struct io_cancel_data *cd,
struct io_uring_task *tctx,
unsigned int issue_flags)
{
bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
struct io_ring_ctx *ctx = cd->ctx;
struct io_tctx_node *node;
int ret, nr = 0;
do {
ret = io_try_cancel(tctx, cd, issue_flags);
if (ret == -ENOENT)
break;
if (!all)
return ret;
nr++;
} while (1);
/* slow path, try all io-wq's */
io_ring_submit_lock(ctx, issue_flags);
ret = -ENOENT;
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
ret = io_async_cancel_one(tctx, cd);
if (ret != -ENOENT) {
if (!all)
break;
nr++;
}
}
io_ring_submit_unlock(ctx, issue_flags);
return all ? nr : ret;
}
int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_cancel *cancel = io_kiocb_to_cmd(req);
struct io_cancel_data cd = {
.ctx = req->ctx,
.data = cancel->addr,
.flags = cancel->flags,
.seq = atomic_inc_return(&req->ctx->cancel_seq),
};
struct io_uring_task *tctx = req->task->io_uring;
int ret;
if (cd.flags & IORING_ASYNC_CANCEL_FD) {
if (req->flags & REQ_F_FIXED_FILE ||
cd.flags & IORING_ASYNC_CANCEL_FD_FIXED) {
req->flags |= REQ_F_FIXED_FILE;
req->file = io_file_get_fixed(req, cancel->fd,
issue_flags);
} else {
req->file = io_file_get_normal(req, cancel->fd);
}
if (!req->file) {
ret = -EBADF;
goto done;
}
cd.file = req->file;
}
ret = __io_async_cancel(&cd, tctx, issue_flags);
done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void init_hash_table(struct io_hash_table *table, unsigned size)
{
unsigned int i;
for (i = 0; i < size; i++) {
spin_lock_init(&table->hbs[i].lock);
INIT_HLIST_HEAD(&table->hbs[i].list);
}
}
static int __io_sync_cancel(struct io_uring_task *tctx,
struct io_cancel_data *cd, int fd)
{
struct io_ring_ctx *ctx = cd->ctx;
/* fixed must be grabbed every time since we drop the uring_lock */
if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
(cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
unsigned long file_ptr;
if (unlikely(fd > ctx->nr_user_files))
return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
cd->file = (struct file *) (file_ptr & FFS_MASK);
if (!cd->file)
return -EBADF;
}
return __io_async_cancel(cd, tctx, 0);
}
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
__must_hold(&ctx->uring_lock)
{
struct io_cancel_data cd = {
.ctx = ctx,
.seq = atomic_inc_return(&ctx->cancel_seq),
};
ktime_t timeout = KTIME_MAX;
struct io_uring_sync_cancel_reg sc;
struct fd f = { };
DEFINE_WAIT(wait);
int ret;
if (copy_from_user(&sc, arg, sizeof(sc)))
return -EFAULT;
if (sc.flags & ~CANCEL_FLAGS)
return -EINVAL;
if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3])
return -EINVAL;
cd.data = sc.addr;
cd.flags = sc.flags;
/* we can grab a normal file descriptor upfront */
if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
!(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
f = fdget(sc.fd);
if (!f.file)
return -EBADF;
cd.file = f.file;
}
ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
/* found something, done! */
if (ret != -EALREADY)
goto out;
if (sc.timeout.tv_sec != -1UL || sc.timeout.tv_nsec != -1UL) {
struct timespec64 ts = {
.tv_sec = sc.timeout.tv_sec,
.tv_nsec = sc.timeout.tv_nsec
};
timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
}
/*
* Keep looking until we get -ENOENT. We'll get woken every time
* a request completes and will retry the cancelation.
*/
do {
cd.seq = atomic_inc_return(&ctx->cancel_seq);
prepare_to_wait(&ctx->cq_wait, &wait, TASK_INTERRUPTIBLE);
ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
if (ret != -EALREADY)
break;
mutex_unlock(&ctx->uring_lock);
ret = io_run_task_work_sig();
if (ret < 0) {
mutex_lock(&ctx->uring_lock);
break;
}
ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS);
mutex_lock(&ctx->uring_lock);
if (!ret) {
ret = -ETIME;
break;
}
} while (1);
finish_wait(&ctx->cq_wait, &wait);
if (ret == -ENOENT || ret > 0)
ret = 0;
out:
fdput(f);
return ret;
}
// SPDX-License-Identifier: GPL-2.0
#include <linux/io_uring_types.h>
struct io_cancel_data {
struct io_ring_ctx *ctx;
union {
u64 data;
struct file *file;
};
u32 flags;
int seq;
};
int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags);
int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
unsigned int issue_flags);
void init_hash_table(struct io_hash_table *table, unsigned size);
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/io_uring.h>
#include <linux/eventpoll.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "epoll.h"
#if defined(CONFIG_EPOLL)
struct io_epoll {
struct file *file;
int epfd;
int op;
int fd;
struct epoll_event event;
};
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_epoll *epoll = io_kiocb_to_cmd(req);
pr_warn_once("%s: epoll_ctl support in io_uring is deprecated and will "
"be removed in a future Linux kernel version.\n",
current->comm);
if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
epoll->epfd = READ_ONCE(sqe->fd);
epoll->op = READ_ONCE(sqe->len);
epoll->fd = READ_ONCE(sqe->off);
if (ep_op_has_event(epoll->op)) {
struct epoll_event __user *ev;
ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
if (copy_from_user(&epoll->event, ev, sizeof(*ev)))
return -EFAULT;
}
return 0;
}
int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_epoll *ie = io_kiocb_to_cmd(req);
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
if (force_nonblock && ret == -EAGAIN)
return -EAGAIN;
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
#endif
// SPDX-License-Identifier: GPL-2.0
#if defined(CONFIG_EPOLL)
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags);
#endif
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "cancel.h"
#include "rsrc.h"
#ifdef CONFIG_PROC_FS
static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
const struct cred *cred)
{
struct user_namespace *uns = seq_user_ns(m);
struct group_info *gi;
kernel_cap_t cap;
unsigned __capi;
int g;
seq_printf(m, "%5d\n", id);
seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
seq_puts(m, "\n\tGroups:\t");
gi = cred->group_info;
for (g = 0; g < gi->ngroups; g++) {
seq_put_decimal_ull(m, g ? " " : "",
from_kgid_munged(uns, gi->gid[g]));
}
seq_puts(m, "\n\tCapEff:\t");
cap = cred->cap_effective;
CAP_FOR_EACH_U32(__capi)
seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
seq_putc(m, '\n');
return 0;
}
static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
struct seq_file *m)
{
struct io_sq_data *sq = NULL;
struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings;
unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
unsigned int sq_head = READ_ONCE(r->sq.head);
unsigned int sq_tail = READ_ONCE(r->sq.tail);
unsigned int cq_head = READ_ONCE(r->cq.head);
unsigned int cq_tail = READ_ONCE(r->cq.tail);
unsigned int cq_shift = 0;
unsigned int sq_entries, cq_entries;
bool has_lock;
bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
unsigned int i;
if (is_cqe32)
cq_shift = 1;
/*
* We may get imprecise sqe and cqe info if the ring is actively running,
* since we read cached_sq_head and cached_cq_tail without uring_lock and
* sq_tail and cq_head are changed by userspace. But that's OK, since this
* info is usually only looked at when the ring is stuck.
*/
seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
seq_printf(m, "SqHead:\t%u\n", sq_head);
seq_printf(m, "SqTail:\t%u\n", sq_tail);
seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
seq_printf(m, "CqHead:\t%u\n", cq_head);
seq_printf(m, "CqTail:\t%u\n", cq_tail);
seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
for (i = 0; i < sq_entries; i++) {
unsigned int entry = i + sq_head;
unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
struct io_uring_sqe *sqe;
if (sq_idx > sq_mask)
continue;
sqe = &ctx->sq_sqes[sq_idx];
seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
sq_idx, sqe->opcode, sqe->fd, sqe->flags,
sqe->user_data);
}
seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
for (i = 0; i < cq_entries; i++) {
unsigned int entry = i + cq_head;
struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
if (!is_cqe32) {
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
entry & cq_mask, cqe->user_data, cqe->res,
cqe->flags);
} else {
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, "
"extra1:%llu, extra2:%llu\n",
entry & cq_mask, cqe->user_data, cqe->res,
cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]);
}
}
/*
* Avoid ABBA deadlock between the seq lock and the io_uring mutex,
* since fdinfo case grabs it in the opposite direction of normal use
* cases. If we fail to get the lock, we just don't iterate any
* structures that could be going away outside the io_uring mutex.
*/
has_lock = mutex_trylock(&ctx->uring_lock);
if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
sq = ctx->sq_data;
if (!sq->thread)
sq = NULL;
}
seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
struct file *f = io_file_from_index(&ctx->file_table, i);
if (f)
seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
else
seq_printf(m, "%5u: <none>\n", i);
}
seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
struct io_mapped_ubuf *buf = ctx->user_bufs[i];
unsigned int len = buf->ubuf_end - buf->ubuf;
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
}
if (has_lock && !xa_empty(&ctx->personalities)) {
unsigned long index;
const struct cred *cred;
seq_printf(m, "Personalities:\n");
xa_for_each(&ctx->personalities, index, cred)
io_uring_show_cred(m, index, cred);
}
if (has_lock)
mutex_unlock(&ctx->uring_lock);
seq_puts(m, "PollList:\n");
for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) {
struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
struct io_kiocb *req;
spin_lock(&hb->lock);
hlist_for_each_entry(req, &hb->list, hash_node)
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
task_work_pending(req->task));
spin_unlock(&hb->lock);
}
seq_puts(m, "CqOverflowList:\n");
spin_lock(&ctx->completion_lock);
list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
struct io_uring_cqe *cqe = &ocqe->cqe;
seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
cqe->user_data, cqe->res, cqe->flags);
}
spin_unlock(&ctx->completion_lock);
}
__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
struct io_ring_ctx *ctx = f->private_data;
if (percpu_ref_tryget(&ctx->refs)) {
__io_uring_show_fdinfo(ctx, m);
percpu_ref_put(&ctx->refs);
}
}
#endif
// SPDX-License-Identifier: GPL-2.0
void io_uring_show_fdinfo(struct seq_file *m, struct file *f);
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "rsrc.h"
#include "filetable.h"
static int io_file_bitmap_get(struct io_ring_ctx *ctx)
{
struct io_file_table *table = &ctx->file_table;
unsigned long nr = ctx->file_alloc_end;
int ret;
do {
ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint);
if (ret != nr)
return ret;
if (table->alloc_hint == ctx->file_alloc_start)
break;
nr = table->alloc_hint;
table->alloc_hint = ctx->file_alloc_start;
} while (1);
return -ENFILE;
}
bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
table->files = kvcalloc(nr_files, sizeof(table->files[0]),
GFP_KERNEL_ACCOUNT);
if (unlikely(!table->files))
return false;
table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
if (unlikely(!table->bitmap)) {
kvfree(table->files);
return false;
}
return true;
}
void io_free_file_tables(struct io_file_table *table)
{
kvfree(table->files);
bitmap_free(table->bitmap);
table->files = NULL;
table->bitmap = NULL;
}
static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
u32 slot_index)
__must_hold(&req->ctx->uring_lock)
{
bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret;
if (io_is_uring_fops(file))
return -EBADF;
if (!ctx->file_data)
return -ENXIO;
if (slot_index >= ctx->nr_user_files)
return -EINVAL;
slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
if (file_slot->file_ptr) {
struct file *old_file;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
goto err;
old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
ctx->rsrc_node, old_file);
if (ret)
goto err;
file_slot->file_ptr = 0;
io_file_bitmap_clear(&ctx->file_table, slot_index);
needs_switch = true;
}
ret = io_scm_file_account(ctx, file);
if (!ret) {
*io_get_tag_slot(ctx->file_data, slot_index) = 0;
io_fixed_file_set(file_slot, file);
io_file_bitmap_set(&ctx->file_table, slot_index);
}
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
if (ret)
fput(file);
return ret;
}
int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,
unsigned int file_slot)
{
bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC;
int ret;
if (alloc_slot) {
ret = io_file_bitmap_get(ctx);
if (unlikely(ret < 0))
return ret;
file_slot = ret;
} else {
file_slot--;
}
ret = io_install_fixed_file(ctx, file, file_slot);
if (!ret && alloc_slot)
ret = file_slot;
return ret;
}
/*
* Note: when io_fixed_fd_install() returns an error value, it ensures
* that fput() is called correspondingly.
*/
int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
struct file *file, unsigned int file_slot)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
io_ring_submit_lock(ctx, issue_flags);
ret = __io_fixed_fd_install(ctx, file, file_slot);
io_ring_submit_unlock(ctx, issue_flags);
if (unlikely(ret < 0))
fput(file);
return ret;
}
int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
{
struct io_fixed_file *file_slot;
struct file *file;
int ret;
if (unlikely(!ctx->file_data))
return -ENXIO;
if (offset >= ctx->nr_user_files)
return -EINVAL;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
offset = array_index_nospec(offset, ctx->nr_user_files);
file_slot = io_fixed_file_slot(&ctx->file_table, offset);
if (!file_slot->file_ptr)
return -EBADF;
file = (struct file *)(file_slot->file_ptr & FFS_MASK);
ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
if (ret)
return ret;
file_slot->file_ptr = 0;
io_file_bitmap_clear(&ctx->file_table, offset);
io_rsrc_node_switch(ctx, ctx->file_data);
return 0;
}
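/*
 * Set the slot range that IORING_FILE_INDEX_ALLOC allocations may use. The
 * range must have a zero reserved field and must fit entirely within the
 * registered file table.
 */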
int io_register_file_alloc_range(struct io_ring_ctx *ctx,
struct io_uring_file_index_range __user *arg)
{
struct io_uring_file_index_range range;
u32 end;
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
if (check_add_overflow(range.off, range.len, &end))
return -EOVERFLOW;
if (range.resv || end > ctx->nr_user_files)
return -EINVAL;
io_file_table_set_alloc_range(ctx, range.off, range.len);
return 0;
}
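/*
 * Illustrative userspace sketch (not part of this file; the exact register
 * call convention is an assumption): restrict IORING_FILE_INDEX_ALLOC
 * allocations to slots [16, 16 + 64) of a previously registered file table:
 *
 *	struct io_uring_file_index_range range = {
 *		.off = 16,
 *		.len = 64,
 *	};
 *	io_uring_register(ring_fd, IORING_REGISTER_FILE_ALLOC_RANGE, &range, 0);
 */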
// SPDX-License-Identifier: GPL-2.0
#ifndef IOU_FILE_TABLE_H
#define IOU_FILE_TABLE_H
#include <linux/file.h>
#include <linux/io_uring_types.h>
/*
* FFS_SCM is only available on 64-bit archs; for 32-bit we just define it as 0
* and define IO_URING_SCM_ALL. In that case we use SCM for all files, as we
* can't always safely dereference the file when the task has exited and ring
* cleanup is done. If a file is tracked and part of SCM, then unix gc on
* process exit may reap it before __io_sqe_files_unregister() is run.
*/
#define FFS_NOWAIT 0x1UL
#define FFS_ISREG 0x2UL
#if defined(CONFIG_64BIT)
#define FFS_SCM 0x4UL
#else
#define IO_URING_SCM_ALL
#define FFS_SCM 0x0UL
#endif
#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM)
bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files);
void io_free_file_tables(struct io_file_table *table);
int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
struct file *file, unsigned int file_slot);
int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,
unsigned int file_slot);
int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset);
int io_register_file_alloc_range(struct io_ring_ctx *ctx,
struct io_uring_file_index_range __user *arg);
unsigned int io_file_get_flags(struct file *file);
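/*
 * Bitmap helpers for the fixed file table: clearing a bit records it as the
 * next allocation hint so freed slots are reused quickly, while setting a
 * bit advances the hint past the slot that was just taken.
 */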
static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
{
__clear_bit(bit, table->bitmap);
table->alloc_hint = bit;
}
static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
{
WARN_ON_ONCE(test_bit(bit, table->bitmap));
__set_bit(bit, table->bitmap);
table->alloc_hint = bit + 1;
}
static inline struct io_fixed_file *
io_fixed_file_slot(struct io_file_table *table, unsigned i)
{
return &table->files[i];
}
static inline struct file *io_file_from_index(struct io_file_table *table,
int index)
{
struct io_fixed_file *slot = io_fixed_file_slot(table, index);
return (struct file *) (slot->file_ptr & FFS_MASK);
}
static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
struct file *file)
{
unsigned long file_ptr = (unsigned long) file;
file_ptr |= io_file_get_flags(file);
file_slot->file_ptr = file_ptr;
}
static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx)
{
ctx->file_table.alloc_hint = ctx->file_alloc_start;
}
static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx,
unsigned off, unsigned len)
{
ctx->file_alloc_start = off;
ctx->file_alloc_end = off + len;
io_reset_alloc_hint(ctx);
}
#endif
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "../fs/internal.h"
#include "io_uring.h"
#include "fs.h"
struct io_rename {
struct file *file;
int old_dfd;
int new_dfd;
struct filename *oldpath;
struct filename *newpath;
int flags;
};
struct io_unlink {
struct file *file;
int dfd;
int flags;
struct filename *filename;
};
struct io_mkdir {
struct file *file;
int dfd;
umode_t mode;
struct filename *filename;
};
struct io_link {
struct file *file;
int old_dfd;
int new_dfd;
struct filename *oldpath;
struct filename *newpath;
int flags;
};
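/*
 * All of these ops resolve their user paths with getname() at prep time and
 * mark the request with REQ_F_NEED_CLEANUP so the filenames are released even
 * if the request never gets issued. The actual work is never done inline for
 * nonblocking issue; -EAGAIN punts it to io-wq.
 */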
int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_rename *ren = io_kiocb_to_cmd(req);
const char __user *oldf, *newf;
if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
ren->old_dfd = READ_ONCE(sqe->fd);
oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
ren->new_dfd = READ_ONCE(sqe->len);
ren->flags = READ_ONCE(sqe->rename_flags);
ren->oldpath = getname(oldf);
if (IS_ERR(ren->oldpath))
return PTR_ERR(ren->oldpath);
ren->newpath = getname(newf);
if (IS_ERR(ren->newpath)) {
putname(ren->oldpath);
return PTR_ERR(ren->newpath);
}
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_rename *ren = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
ren->newpath, ren->flags);
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void io_renameat_cleanup(struct io_kiocb *req)
{
struct io_rename *ren = io_kiocb_to_cmd(req);
putname(ren->oldpath);
putname(ren->newpath);
}
int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_unlink *un = io_kiocb_to_cmd(req);
const char __user *fname;
if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
un->dfd = READ_ONCE(sqe->fd);
un->flags = READ_ONCE(sqe->unlink_flags);
if (un->flags & ~AT_REMOVEDIR)
return -EINVAL;
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
un->filename = getname(fname);
if (IS_ERR(un->filename))
return PTR_ERR(un->filename);
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_unlink *un = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
if (un->flags & AT_REMOVEDIR)
ret = do_rmdir(un->dfd, un->filename);
else
ret = do_unlinkat(un->dfd, un->filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void io_unlinkat_cleanup(struct io_kiocb *req)
{
struct io_unlink *ul = io_kiocb_to_cmd(req);
putname(ul->filename);
}
int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_mkdir *mkd = io_kiocb_to_cmd(req);
const char __user *fname;
if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
mkd->dfd = READ_ONCE(sqe->fd);
mkd->mode = READ_ONCE(sqe->len);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
mkd->filename = getname(fname);
if (IS_ERR(mkd->filename))
return PTR_ERR(mkd->filename);
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_mkdir *mkd = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void io_mkdirat_cleanup(struct io_kiocb *req)
{
struct io_mkdir *md = io_kiocb_to_cmd(req);
putname(md->filename);
}
int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_link *sl = io_kiocb_to_cmd(req);
const char __user *oldpath, *newpath;
if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
sl->new_dfd = READ_ONCE(sqe->fd);
oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
sl->oldpath = getname(oldpath);
if (IS_ERR(sl->oldpath))
return PTR_ERR(sl->oldpath);
sl->newpath = getname(newpath);
if (IS_ERR(sl->newpath)) {
putname(sl->oldpath);
return PTR_ERR(sl->newpath);
}
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_link *sl = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
}
int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_link *lnk = io_kiocb_to_cmd(req);
const char __user *oldf, *newf;
if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
lnk->old_dfd = READ_ONCE(sqe->fd);
lnk->new_dfd = READ_ONCE(sqe->len);
oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
lnk->flags = READ_ONCE(sqe->hardlink_flags);
lnk->oldpath = getname(oldf);
if (IS_ERR(lnk->oldpath))
return PTR_ERR(lnk->oldpath);
lnk->newpath = getname(newf);
if (IS_ERR(lnk->newpath)) {
putname(lnk->oldpath);
return PTR_ERR(lnk->newpath);
}
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_link *lnk = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
lnk->newpath, lnk->flags);
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void io_link_cleanup(struct io_kiocb *req)
{
struct io_link *sl = io_kiocb_to_cmd(req);
putname(sl->oldpath);
putname(sl->newpath);
}
// SPDX-License-Identifier: GPL-2.0
int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_renameat(struct io_kiocb *req, unsigned int issue_flags);
void io_renameat_cleanup(struct io_kiocb *req);
int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags);
void io_unlinkat_cleanup(struct io_kiocb *req);
int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags);
void io_mkdirat_cleanup(struct io_kiocb *req);
int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags);
int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_linkat(struct io_kiocb *req, unsigned int issue_flags);
void io_link_cleanup(struct io_kiocb *req);
......@@ -18,6 +18,8 @@
#include <uapi/linux/io_uring.h>
#include "io-wq.h"
#include "slist.h"
#include "io_uring.h"
#define WORKER_IDLE_TIMEOUT (5 * HZ)
......@@ -518,23 +520,11 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
return NULL;
}
static bool io_flush_signals(void)
{
if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) {
__set_current_state(TASK_RUNNING);
clear_notify_signal();
if (task_work_pending(current))
task_work_run();
return true;
}
return false;
}
static void io_assign_current_work(struct io_worker *worker,
struct io_wq_work *work)
{
if (work) {
io_flush_signals();
io_run_task_work();
cond_resched();
}
......@@ -654,7 +644,7 @@ static int io_wqe_worker(void *data)
last_timeout = false;
__io_worker_idle(wqe, worker);
raw_spin_unlock(&wqe->lock);
if (io_flush_signals())
if (io_run_task_work())
continue;
ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
if (signal_pending(current)) {
......
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H
#include <linux/refcount.h>
#include <linux/io_uring_types.h>
struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HASHED = 2,
IO_WQ_WORK_UNBOUND = 4,
IO_WQ_WORK_CONCURRENT = 16,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
enum io_wq_cancel {
IO_WQ_CANCEL_OK, /* cancelled before started */
IO_WQ_CANCEL_RUNNING, /* found, running, and attempted cancelled */
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef void (io_wq_work_fn)(struct io_wq_work *);
struct io_wq_hash {
refcount_t refs;
unsigned long map;
struct wait_queue_head wait;
};
static inline void io_wq_put_hash(struct io_wq_hash *hash)
{
if (refcount_dec_and_test(&hash->refs))
kfree(hash);
}
struct io_wq_data {
struct io_wq_hash *hash;
struct task_struct *task;
io_wq_work_fn *do_work;
free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
void io_wq_exit_start(struct io_wq *wq);
void io_wq_put_and_exit(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
return work->flags & IO_WQ_WORK_HASHED;
}
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data, bool cancel_all);
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
#else
static inline void io_wq_worker_sleeping(struct task_struct *tsk)
{
}
static inline void io_wq_worker_running(struct task_struct *tsk)
{
}
#endif
static inline bool io_wq_current_is_worker(void)
{
return in_task() && (current->flags & PF_IO_WORKER) &&
current->worker_private;
}
#endif
This diff has been collapsed.
#ifndef IOU_CORE_H
#define IOU_CORE_H
#include <linux/errno.h>
#include <linux/lockdep.h>
#include <linux/io_uring_types.h>
#include "io-wq.h"
#include "slist.h"
#include "filetable.h"
#ifndef CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
#endif
enum {
IOU_OK = 0,
IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,
/*
* Intended only when both REQ_F_POLLED and REQ_F_APOLL_MULTISHOT
* are set to indicate to the poll runner that multishot should be
* removed and the result is set on req->cqe.res.
*/
IOU_STOP_MULTISHOT = -ECANCELED,
};
struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx);
bool io_req_cqe_overflow(struct io_kiocb *req);
int io_run_task_work_sig(void);
void io_req_complete_failed(struct io_kiocb *req, s32 res);
void __io_req_complete(struct io_kiocb *req, unsigned issue_flags);
void io_req_complete_post(struct io_kiocb *req);
void __io_req_complete_post(struct io_kiocb *req);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
bool allow_overflow);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
return req->flags & REQ_F_FIXED_FILE;
}
bool io_is_uring_fops(struct file *file);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_work_add(struct io_kiocb *req);
void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
void io_req_task_queue(struct io_kiocb *req);
void io_queue_iowq(struct io_kiocb *req, bool *dont_use);
void io_req_task_complete(struct io_kiocb *req, bool *locked);
void io_req_task_queue_fail(struct io_kiocb *req, int ret);
void io_req_task_submit(struct io_kiocb *req, bool *locked);
void tctx_task_work(struct callback_head *cb);
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
int io_uring_alloc_task_context(struct task_struct *task,
struct io_ring_ctx *ctx);
int io_poll_issue(struct io_kiocb *req, bool *locked);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node);
int io_req_prep_async(struct io_kiocb *req);
struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
void io_wq_submit_work(struct io_wq_work *work);
void io_free_req(struct io_kiocb *req);
void io_queue_next(struct io_kiocb *req);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all);
#define io_for_each_link(pos, head) \
for (pos = (head); pos; pos = pos->link)
static inline void io_cq_lock(struct io_ring_ctx *ctx)
__acquires(ctx->completion_lock)
{
spin_lock(&ctx->completion_lock);
}
void io_cq_unlock_post(struct io_ring_ctx *ctx);
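/*
 * Fast path for grabbing a CQE slot: hand out entries from the cached range
 * while it lasts (CQE32 rings consume two slots per entry), falling back to
 * __io_get_cqe() once the cached sentinel is reached.
 */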
static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
{
if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
struct io_uring_cqe *cqe = ctx->cqe_cached;
ctx->cached_cq_tail++;
ctx->cqe_cached++;
if (ctx->flags & IORING_SETUP_CQE32)
ctx->cqe_cached++;
return cqe;
}
return __io_get_cqe(ctx);
}
static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
struct io_uring_cqe *cqe;
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
* the ring.
*/
cqe = io_get_cqe(ctx);
if (unlikely(!cqe))
return io_req_cqe_overflow(req);
trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
req->cqe.res, req->cqe.flags,
(req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0,
(req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0);
memcpy(cqe, &req->cqe, sizeof(*cqe));
if (ctx->flags & IORING_SETUP_CQE32) {
u64 extra1 = 0, extra2 = 0;
if (req->flags & REQ_F_CQE32_INIT) {
extra1 = req->extra1;
extra2 = req->extra2;
}
WRITE_ONCE(cqe->big_cqe[0], extra1);
WRITE_ONCE(cqe->big_cqe[1], extra2);
}
return true;
}
static inline void req_set_fail(struct io_kiocb *req)
{
req->flags |= REQ_F_FAIL;
if (req->flags & REQ_F_CQE_SKIP) {
req->flags &= ~REQ_F_CQE_SKIP;
req->flags |= REQ_F_SKIP_LINK_CQES;
}
}
static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
{
req->cqe.res = res;
req->cqe.flags = cflags;
}
static inline bool req_has_async_data(struct io_kiocb *req)
{
return req->flags & REQ_F_ASYNC_DATA;
}
static inline void io_put_file(struct file *file)
{
if (file)
fput(file);
}
static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx,
unsigned issue_flags)
{
lockdep_assert_held(&ctx->uring_lock);
if (issue_flags & IO_URING_F_UNLOCKED)
mutex_unlock(&ctx->uring_lock);
}
static inline void io_ring_submit_lock(struct io_ring_ctx *ctx,
unsigned issue_flags)
{
/*
* "Normal" inline submissions always hold the uring_lock, since we
* grab it from the system call. Same is true for the SQPOLL offload.
* The only exception is when we've detached the request and issue it
* from an async worker thread; grab the lock in that case.
*/
if (issue_flags & IO_URING_F_UNLOCKED)
mutex_lock(&ctx->uring_lock);
lockdep_assert_held(&ctx->uring_lock);
}
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
/* order cqe stores with ring update */
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
/*
* wake_up_all() may seem excessive, but io_wake_function() and
* io_should_wake() handle the termination of the loop and only
* wake as many waiters as we need to.
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
}
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
struct io_rings *r = ctx->rings;
return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
}
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
/* make sure SQ entry isn't read before tail */
return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}
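/*
 * Run any task_work that was signalled via TIF_NOTIFY_SIGNAL. Returns true
 * if there was a notification to clear, so callers can loop rather than
 * going to sleep.
 */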
static inline bool io_run_task_work(void)
{
if (test_thread_flag(TIF_NOTIFY_SIGNAL)) {
__set_current_state(TASK_RUNNING);
clear_notify_signal();
if (task_work_pending(current))
task_work_run();
return true;
}
return false;
}
static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
if (!*locked) {
mutex_lock(&ctx->uring_lock);
*locked = true;
}
}
/*
* Don't complete immediately but use deferred completion infrastructure.
* Protected by ->uring_lock and can only be used either with
* IO_URING_F_COMPLETE_DEFER or inside a tw handler holding the mutex.
*/
static inline void io_req_complete_defer(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
struct io_submit_state *state = &req->ctx->submit_state;
lockdep_assert_held(&req->ctx->uring_lock);
wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd))
__io_commit_cqring_flush(ctx);
}
#endif
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
#define BGID_ARRAY 64
struct io_provide_buf {
struct file *file;
__u64 addr;
__u32 len;
__u32 bgid;
__u16 nbufs;
__u16 bid;
};
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
if (ctx->io_bl && bgid < BGID_ARRAY)
return &ctx->io_bl[bgid];
return xa_load(&ctx->io_bl_xa, bgid);
}
static int io_buffer_add_list(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned int bgid)
{
bl->bgid = bgid;
if (bgid < BGID_ARRAY)
return 0;
return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
struct io_buffer *buf;
/*
* For legacy provided buffer mode, don't recycle if we already did
* IO to this buffer. For ring-mapped provided buffer mode, we should
* increment ring->head to explicitly monopolize the buffer to avoid
* multiple use.
*/
if (req->flags & REQ_F_PARTIAL_IO)
return;
io_ring_submit_lock(ctx, issue_flags);
buf = req->kbuf;
bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
req->buf_index = buf->bgid;
io_ring_submit_unlock(ctx, issue_flags);
return;
}
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
unsigned int cflags;
/*
* We can add this buffer back to two lists:
*
* 1) The io_buffers_cache list. This one is protected by the
* ctx->uring_lock. If we already hold this lock, add back to this
* list as we can grab it from issue as well.
* 2) The io_buffers_comp list. This one is protected by the
* ctx->completion_lock.
*
* We migrate buffers from the comp_list to the issue cache list
* when we need one.
*/
if (req->flags & REQ_F_BUFFER_RING) {
/* no buffers to recycle for this case */
cflags = __io_put_kbuf_list(req, NULL);
} else if (issue_flags & IO_URING_F_UNLOCKED) {
struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock);
cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
spin_unlock(&ctx->completion_lock);
} else {
lockdep_assert_held(&req->ctx->uring_lock);
cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
}
return cflags;
}
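/*
 * Classic provided buffers: pop the first buffer off the group's list, clamp
 * *len to the buffer size, and stash the buffer on the request so it can be
 * returned or recycled later.
 */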
static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
struct io_buffer_list *bl)
{
if (!list_empty(&bl->buf_list)) {
struct io_buffer *kbuf;
kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&kbuf->list);
if (*len == 0 || *len > kbuf->len)
*len = kbuf->len;
req->flags |= REQ_F_BUFFER_SELECTED;
req->kbuf = kbuf;
req->buf_index = kbuf->bid;
return u64_to_user_ptr(kbuf->addr);
}
return NULL;
}
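/*
 * Ring-mapped provided buffers: the tail is advanced by userspace (hence the
 * acquire load), while the head is only advanced here once a buffer has been
 * committed to a request.
 */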
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
struct io_buffer_list *bl,
unsigned int issue_flags)
{
struct io_uring_buf_ring *br = bl->buf_ring;
struct io_uring_buf *buf;
__u16 head = bl->head;
if (unlikely(smp_load_acquire(&br->tail) == head))
return NULL;
head &= bl->mask;
if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
buf = &br->bufs[head];
} else {
int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
buf = page_address(bl->buf_pages[index]);
buf += off;
}
if (*len == 0 || *len > buf->len)
*len = buf->len;
req->flags |= REQ_F_BUFFER_RING;
req->buf_list = bl;
req->buf_index = buf->bid;
if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
/*
* If we came in unlocked, we have no choice but to consume the
* buffer here, otherwise nothing ensures that the buffer won't
* get used by others. This does mean it'll be pinned until the
* IO completes; coming in unlocked means we're being called from
* io-wq context and there may be further retries in async hybrid
* mode. For the locked case, the caller must call commit when
* the transfer completes (or if we get -EAGAIN and must poll or
* retry).
*/
req->buf_list = NULL;
bl->head++;
}
return u64_to_user_ptr(buf->addr);
}
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
void __user *ret = NULL;
io_ring_submit_lock(req->ctx, issue_flags);
bl = io_buffer_get_list(ctx, req->buf_index);
if (likely(bl)) {
if (bl->buf_nr_pages)
ret = io_ring_buffer_select(req, len, bl, issue_flags);
else
ret = io_provided_buffer_select(req, len, bl);
}
io_ring_submit_unlock(req->ctx, issue_flags);
return ret;
}
static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
int i;
ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
GFP_KERNEL);
if (!ctx->io_bl)
return -ENOMEM;
for (i = 0; i < BGID_ARRAY; i++) {
INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
ctx->io_bl[i].bgid = i;
}
return 0;
}
static int __io_remove_buffers(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned nbufs)
{
unsigned i = 0;
/* shouldn't happen */
if (!nbufs)
return 0;
if (bl->buf_nr_pages) {
int j;
i = bl->buf_ring->tail - bl->head;
for (j = 0; j < bl->buf_nr_pages; j++)
unpin_user_page(bl->buf_pages[j]);
kvfree(bl->buf_pages);
bl->buf_pages = NULL;
bl->buf_nr_pages = 0;
/* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list);
return i;
}
/* the head kbuf is the list itself */
while (!list_empty(&bl->buf_list)) {
struct io_buffer *nxt;
nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&nxt->list);
if (++i == nbufs)
return i;
cond_resched();
}
i++;
return i;
}
void io_destroy_buffers(struct io_ring_ctx *ctx)
{
struct io_buffer_list *bl;
unsigned long index;
int i;
for (i = 0; i < BGID_ARRAY; i++) {
if (!ctx->io_bl)
break;
__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
}
xa_for_each(&ctx->io_bl_xa, index, bl) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
__io_remove_buffers(ctx, bl, -1U);
kfree(bl);
}
while (!list_empty(&ctx->io_buffers_pages)) {
struct page *page;
page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
list_del_init(&page->lru);
__free_page(page);
}
}
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req);
u64 tmp;
if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
sqe->splice_fd_in)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
if (!tmp || tmp > USHRT_MAX)
return -EINVAL;
memset(p, 0, sizeof(*p));
p->nbufs = tmp;
p->bgid = READ_ONCE(sqe->buf_group);
return 0;
}
int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req);
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
int ret = 0;
io_ring_submit_lock(ctx, issue_flags);
ret = -ENOENT;
bl = io_buffer_get_list(ctx, p->bgid);
if (bl) {
ret = -EINVAL;
/* can't use provide/remove buffers command on mapped buffers */
if (!bl->buf_nr_pages)
ret = __io_remove_buffers(ctx, bl, p->nbufs);
}
if (ret < 0)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
io_req_set_res(req, ret, 0);
__io_req_complete(req, issue_flags);
io_ring_submit_unlock(ctx, issue_flags);
return IOU_ISSUE_SKIP_COMPLETE;
}
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
unsigned long size, tmp_check;
struct io_provide_buf *p = io_kiocb_to_cmd(req);
u64 tmp;
if (sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
if (!tmp || tmp > USHRT_MAX)
return -E2BIG;
p->nbufs = tmp;
p->addr = READ_ONCE(sqe->addr);
p->len = READ_ONCE(sqe->len);
if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
&size))
return -EOVERFLOW;
if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
return -EOVERFLOW;
size = (unsigned long)p->len * p->nbufs;
if (!access_ok(u64_to_user_ptr(p->addr), size))
return -EFAULT;
p->bgid = READ_ONCE(sqe->buf_group);
tmp = READ_ONCE(sqe->off);
if (tmp > USHRT_MAX)
return -E2BIG;
p->bid = tmp;
return 0;
}
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
struct io_buffer *buf;
struct page *page;
int bufs_in_page;
/*
* Completions that don't happen inline (e.g. not under uring_lock) will
* add to ->io_buffers_comp. If we don't have any free buffers, check
* the completion list and splice those entries first.
*/
if (!list_empty_careful(&ctx->io_buffers_comp)) {
spin_lock(&ctx->completion_lock);
if (!list_empty(&ctx->io_buffers_comp)) {
list_splice_init(&ctx->io_buffers_comp,
&ctx->io_buffers_cache);
spin_unlock(&ctx->completion_lock);
return 0;
}
spin_unlock(&ctx->completion_lock);
}
/*
* No free buffers and no completion entries either. Allocate a new
* page worth of buffer entries and add those to our freelist.
*/
page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!page)
return -ENOMEM;
list_add(&page->lru, &ctx->io_buffers_pages);
buf = page_address(page);
bufs_in_page = PAGE_SIZE / sizeof(*buf);
while (bufs_in_page) {
list_add_tail(&buf->list, &ctx->io_buffers_cache);
buf++;
bufs_in_page--;
}
return 0;
}
static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
struct io_buffer_list *bl)
{
struct io_buffer *buf;
u64 addr = pbuf->addr;
int i, bid = pbuf->bid;
for (i = 0; i < pbuf->nbufs; i++) {
if (list_empty(&ctx->io_buffers_cache) &&
io_refill_buffer_cache(ctx))
break;
buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
list);
list_move_tail(&buf->list, &bl->buf_list);
buf->addr = addr;
buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
buf->bid = bid;
buf->bgid = pbuf->bgid;
addr += pbuf->len;
bid++;
cond_resched();
}
return i ? 0 : -ENOMEM;
}
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req);
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
int ret = 0;
io_ring_submit_lock(ctx, issue_flags);
if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
ret = io_init_bl_list(ctx);
if (ret)
goto err;
}
bl = io_buffer_get_list(ctx, p->bgid);
if (unlikely(!bl)) {
bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl) {
ret = -ENOMEM;
goto err;
}
INIT_LIST_HEAD(&bl->buf_list);
ret = io_buffer_add_list(ctx, bl, p->bgid);
if (ret) {
kfree(bl);
goto err;
}
}
/* can't add buffers via this command for a mapped buffer ring */
if (bl->buf_nr_pages) {
ret = -EINVAL;
goto err;
}
ret = io_add_buffers(ctx, p, bl);
err:
if (ret < 0)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
io_req_set_res(req, ret, 0);
__io_req_complete(req, issue_flags);
io_ring_submit_unlock(ctx, issue_flags);
return IOU_ISSUE_SKIP_COMPLETE;
}
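/*
 * Register a ring-mapped provided buffer ring for a buffer group: the ring
 * must be page aligned with a power-of-2 entry count below 65536, the pages
 * are pinned here, and the group must not already contain classic provided
 * buffers or another mapped ring.
 */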
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_ring *br;
struct io_uring_buf_reg reg;
struct io_buffer_list *bl, *free_bl = NULL;
struct page **pages;
int nr_pages;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
if (!reg.ring_addr)
return -EFAULT;
if (reg.ring_addr & ~PAGE_MASK)
return -EINVAL;
if (!is_power_of_2(reg.ring_entries))
return -EINVAL;
/* cannot disambiguate full vs empty due to head/tail size */
if (reg.ring_entries >= 65536)
return -EINVAL;
if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
int ret = io_init_bl_list(ctx);
if (ret)
return ret;
}
bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */
if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
return -EEXIST;
} else {
free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl)
return -ENOMEM;
}
pages = io_pin_pages(reg.ring_addr,
struct_size(br, bufs, reg.ring_entries),
&nr_pages);
if (IS_ERR(pages)) {
kfree(free_bl);
return PTR_ERR(pages);
}
br = page_address(pages[0]);
bl->buf_pages = pages;
bl->buf_nr_pages = nr_pages;
bl->nr_entries = reg.ring_entries;
bl->buf_ring = br;
bl->mask = reg.ring_entries - 1;
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
}
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
bl = io_buffer_get_list(ctx, reg.bgid);
if (!bl)
return -ENOENT;
if (!bl->buf_nr_pages)
return -EINVAL;
__io_remove_buffers(ctx, bl, -1U);
if (bl->bgid >= BGID_ARRAY) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
kfree(bl);
}
return 0;
}
// SPDX-License-Identifier: GPL-2.0
#ifndef IOU_KBUF_H
#define IOU_KBUF_H
#include <uapi/linux/io_uring.h>
struct io_buffer_list {
/*
* If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
* then these are classic provided buffers and ->buf_list is used.
*/
union {
struct list_head buf_list;
struct {
struct page **buf_pages;
struct io_uring_buf_ring *buf_ring;
};
};
__u16 bgid;
/* below is for ring provided buffers */
__u16 buf_nr_pages;
__u16 nr_entries;
__u16 head;
__u16 mask;
};
struct io_buffer {
struct list_head list;
__u64 addr;
__u32 len;
__u16 bid;
__u16 bgid;
};
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
unsigned int issue_flags);
void io_destroy_buffers(struct io_ring_ctx *ctx);
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
static inline void io_kbuf_recycle_ring(struct io_kiocb *req)
{
/*
* We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
* the flag and hence ensure that bl->head doesn't get incremented.
* If the tail has already been incremented, hang on to it.
* The exception is partial io; in that case we should increment bl->head
* to monopolize the buffer.
*/
if (req->buf_list) {
if (req->flags & REQ_F_PARTIAL_IO) {
/*
* If we end up here, then the io_uring_lock has
* been kept held since we retrieved the buffer.
* For the io-wq case, we already cleared
* req->buf_list when the buffer was retrieved,
* hence it cannot be set here for that case.
*/
req->buf_list->head++;
req->buf_list = NULL;
} else {
req->buf_index = req->buf_list->bgid;
req->flags &= ~REQ_F_BUFFER_RING;
}
}
}
static inline bool io_do_buffer_select(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_BUFFER_SELECT))
return false;
return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING));
}
static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
{
/*
* READV uses fields in `struct io_rw` (len/addr) to stash the selected
* buffer data. However if that buffer is recycled the original request
* data stored in addr is lost. Therefore forbid recycling for now.
*/
if (req->opcode == IORING_OP_READV)
return;
if (req->flags & REQ_F_BUFFER_SELECTED)
io_kbuf_recycle_legacy(req, issue_flags);
if (req->flags & REQ_F_BUFFER_RING)
io_kbuf_recycle_ring(req);
}
static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req,
struct list_head *list)
{
unsigned int ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
if (req->flags & REQ_F_BUFFER_RING) {
if (req->buf_list) {
req->buf_index = req->buf_list->bgid;
req->buf_list->head++;
}
req->flags &= ~REQ_F_BUFFER_RING;
} else {
req->buf_index = req->kbuf->bgid;
list_add(&req->kbuf->list, list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
}
return ret;
}
static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
lockdep_assert_held(&req->ctx->completion_lock);
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
return __io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
}
static inline unsigned int io_put_kbuf(struct io_kiocb *req,
unsigned issue_flags)
{
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
return __io_put_kbuf(req, issue_flags);
}
#endif
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "rsrc.h"
#include "filetable.h"
#include "msg_ring.h"
struct io_msg {
struct file *file;
u64 user_data;
u32 len;
u32 cmd;
u32 src_fd;
u32 dst_fd;
u32 flags;
};
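/*
 * IORING_MSG_DATA: post a CQE on the target ring carrying the caller's
 * user_data and len. No fd fields or flags may be set for this sub-command.
 */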
static int io_msg_ring_data(struct io_kiocb *req)
{
struct io_ring_ctx *target_ctx = req->file->private_data;
struct io_msg *msg = io_kiocb_to_cmd(req);
if (msg->src_fd || msg->dst_fd || msg->flags)
return -EINVAL;
if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true))
return 0;
return -EOVERFLOW;
}
static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
struct io_ring_ctx *octx,
unsigned int issue_flags)
{
if (issue_flags & IO_URING_F_UNLOCKED)
mutex_unlock(&ctx->uring_lock);
mutex_unlock(&octx->uring_lock);
}
static int io_double_lock_ctx(struct io_ring_ctx *ctx,
struct io_ring_ctx *octx,
unsigned int issue_flags)
{
/*
* To ensure proper ordering between the two ctxs, we can only
* attempt a trylock on the target. If that fails and we already have
* the source ctx lock, punt to io-wq.
*/
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
if (!mutex_trylock(&octx->uring_lock))
return -EAGAIN;
return 0;
}
/* Always grab smallest value ctx first. We know ctx != octx. */
if (ctx < octx) {
mutex_lock(&ctx->uring_lock);
mutex_lock(&octx->uring_lock);
} else {
mutex_lock(&octx->uring_lock);
mutex_lock(&ctx->uring_lock);
}
return 0;
}
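/*
 * IORING_MSG_SEND_FD: take a reference to a fixed file in the source ring's
 * table and install it into the target ring's fixed file table, posting a
 * CQE on the target unless IORING_MSG_RING_CQE_SKIP was requested.
 */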
static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *target_ctx = req->file->private_data;
struct io_msg *msg = io_kiocb_to_cmd(req);
struct io_ring_ctx *ctx = req->ctx;
unsigned long file_ptr;
struct file *src_file;
int ret;
if (target_ctx == ctx)
return -EINVAL;
ret = io_double_lock_ctx(ctx, target_ctx, issue_flags);
if (unlikely(ret))
return ret;
ret = -EBADF;
if (unlikely(msg->src_fd >= ctx->nr_user_files))
goto out_unlock;
msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr;
src_file = (struct file *) (file_ptr & FFS_MASK);
get_file(src_file);
ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
if (ret < 0) {
fput(src_file);
goto out_unlock;
}
if (msg->flags & IORING_MSG_RING_CQE_SKIP)
goto out_unlock;
/*
* If this fails, the target still received the file descriptor but
* wasn't notified of the fact. This means that if this request
* completes with -EOVERFLOW, then the sender must ensure that a
* later IORING_OP_MSG_RING delivers the message.
*/
if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true))
ret = -EOVERFLOW;
out_unlock:
io_double_unlock_ctx(ctx, target_ctx, issue_flags);
return ret;
}
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_msg *msg = io_kiocb_to_cmd(req);
if (unlikely(sqe->buf_index || sqe->personality))
return -EINVAL;
msg->user_data = READ_ONCE(sqe->off);
msg->len = READ_ONCE(sqe->len);
msg->cmd = READ_ONCE(sqe->addr);
msg->src_fd = READ_ONCE(sqe->addr3);
msg->dst_fd = READ_ONCE(sqe->file_index);
msg->flags = READ_ONCE(sqe->msg_ring_flags);
if (msg->flags & ~IORING_MSG_RING_CQE_SKIP)
return -EINVAL;
return 0;
}
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_msg *msg = io_kiocb_to_cmd(req);
int ret;
ret = -EBADFD;
if (!io_is_uring_fops(req->file))
goto done;
switch (msg->cmd) {
case IORING_MSG_DATA:
ret = io_msg_ring_data(req);
break;
case IORING_MSG_SEND_FD:
ret = io_msg_send_fd(req, issue_flags);
break;
default:
ret = -EINVAL;
break;
}
done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
/* put file to avoid an attempt to IOPOLL the req */
io_put_file(req->file);
req->file = NULL;
return IOU_OK;
}
// SPDX-License-Identifier: GPL-2.0
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
This diff has been collapsed.
// SPDX-License-Identifier: GPL-2.0
#include <linux/net.h>
#include <linux/uio.h>
#include "alloc_cache.h"
#if defined(CONFIG_NET)
struct io_async_msghdr {
union {
struct iovec fast_iov[UIO_FASTIOV];
struct {
struct iovec fast_iov_one;
__kernel_size_t controllen;
int namelen;
__kernel_size_t payloadlen;
};
struct io_cache_entry cache;
};
/* points to an allocated iov, if NULL we use fast_iov instead */
struct iovec *free_iov;
struct sockaddr __user *uaddr;
struct msghdr msg;
struct sockaddr_storage addr;
};
struct io_async_connect {
struct sockaddr_storage address;
};
int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_shutdown(struct io_kiocb *req, unsigned int issue_flags);
int io_sendmsg_prep_async(struct io_kiocb *req);
void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req);
int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags);
int io_send(struct io_kiocb *req, unsigned int issue_flags);
int io_recvmsg_prep_async(struct io_kiocb *req);
int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags);
int io_recv(struct io_kiocb *req, unsigned int issue_flags);
int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_accept(struct io_kiocb *req, unsigned int issue_flags);
int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_socket(struct io_kiocb *req, unsigned int issue_flags);
int io_connect_prep_async(struct io_kiocb *req);
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_connect(struct io_kiocb *req, unsigned int issue_flags);
void io_netmsg_cache_free(struct io_cache_entry *entry);
#else
static inline void io_netmsg_cache_free(struct io_cache_entry *entry)
{
}
#endif
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "nop.h"
int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
return 0;
}
/*
* IORING_OP_NOP just posts a completion event, nothing else.
*/
int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
io_req_set_res(req, 0, 0);
return IOU_OK;
}
// SPDX-License-Identifier: GPL-2.0
int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_nop(struct io_kiocb *req, unsigned int issue_flags);
This diff has been collapsed.
// SPDX-License-Identifier: GPL-2.0
#ifndef IOU_OP_DEF_H
#define IOU_OP_DEF_H
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
/* should block plug */
unsigned plug : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
unsigned unbound_nonreg_file : 1;
/* set if opcode supports polled "wait" */
unsigned pollin : 1;
unsigned pollout : 1;
unsigned poll_exclusive : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
/* opcode is not supported by this kernel */
unsigned not_supported : 1;
/* skip auditing */
unsigned audit_skip : 1;
/* supports ioprio */
unsigned ioprio : 1;
/* supports iopoll */
unsigned iopoll : 1;
/* size of async data needed, if any */
unsigned short async_size;
const char *name;
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
int (*issue)(struct io_kiocb *, unsigned int);
int (*prep_async)(struct io_kiocb *);
void (*cleanup)(struct io_kiocb *);
};
extern const struct io_op_def io_op_defs[];
void io_uring_optable_init(void);
#endif
This diff has been collapsed.
// SPDX-License-Identifier: GPL-2.0
int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags,
unsigned int offset);
int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_openat(struct io_kiocb *req, unsigned int issue_flags);
void io_open_cleanup(struct io_kiocb *req);
int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_openat2(struct io_kiocb *req, unsigned int issue_flags);
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_close(struct io_kiocb *req, unsigned int issue_flags);
This diff has been collapsed.
// SPDX-License-Identifier: GPL-2.0
#include "alloc_cache.h"
enum {
IO_APOLL_OK,
IO_APOLL_ABORTED,
IO_APOLL_READY
};
struct io_poll {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
struct wait_queue_entry wait;
};
struct async_poll {
union {
struct io_poll poll;
struct io_cache_entry cache;
};
struct io_poll *double_poll;
};
int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_poll_add(struct io_kiocb *req, unsigned int issue_flags);
int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags);
struct io_cancel_data;
int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned issue_flags);
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags);
bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
bool cancel_all);
void io_apoll_cache_free(struct io_cache_entry *entry);
The remaining diffs have been collapsed.