// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
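
/*
 * Editorial sketch (not part of the kernel build): one way the userspace
 * side of the pairing described above can be written with C11 atomics in
 * place of the kernel's smp_* macros. The names cq_head, cq_tail, cq_mask
 * and cqes stand for the mmap'ed CQ ring fields and are assumptions made
 * for this example only.
 *
 *	unsigned head = *cq_head;
 *	unsigned tail = atomic_load_explicit(cq_tail, memory_order_acquire);
 *
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_mask];
 *
 *		consume(cqe);	// entry loads ordered before the head store
 *		head++;
 *	}
 *	// pairs with the control dependency in io_get_cqe()
 *	atomic_store_explicit(cq_head, head, memory_order_release);
 */
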
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/blk-mq.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/audit.h>
#include <linux/security.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "../fs/internal.h"
#include "io-wq.h"

#include "io_uring_types.h"
#include "io_uring.h"
#include "refs.h"
#include "sqpoll.h"

#include "xattr.h"
#include "nop.h"
#include "fs.h"
#include "splice.h"
#include "sync.h"
#include "advise.h"
#include "openclose.h"
#include "uring_cmd.h"
#include "epoll.h"
#include "statx.h"
#include "net.h"
#include "msg_ring.h"
#include "timeout.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)

#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)

#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)

#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
				REQ_F_ASYNC_DATA)

#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
				 IO_REQ_CLEAN_FLAGS)

#define IO_TCTX_REFS_CACHE_NR	(1U << 10)

struct io_mapped_ubuf {
	u64		ubuf;
	u64		ubuf_end;
	unsigned int	nr_bvecs;
	unsigned long	acct_pages;
	struct bio_vec	bvec[];
};

struct io_ring_ctx;

struct io_overflow_cqe {
	struct list_head list;
	struct io_uring_cqe cqe;
};

struct io_rsrc_put {
	struct list_head list;
	u64 tag;
	union {
		void *rsrc;
		struct file *file;
		struct io_mapped_ubuf *buf;
	};
};

struct io_rsrc_node {
	struct percpu_ref		refs;
	struct list_head		node;
	struct list_head		rsrc_list;
	struct io_rsrc_data		*rsrc_data;
	struct llist_node		llist;
	bool				done;
};

typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);

struct io_rsrc_data {
	struct io_ring_ctx		*ctx;

	u64				**tags;
	unsigned int			nr;
	rsrc_put_fn			*do_put;
	atomic_t			refs;
	struct completion		done;
	bool				quiesce;
};

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
struct io_buffer_list {
	/*
	 * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
	 * then these are classic provided buffers and ->buf_list is used.
	 */
	union {
		struct list_head buf_list;
		struct {
			struct page **buf_pages;
			struct io_uring_buf_ring *buf_ring;
		};
	};
	__u16 bgid;

	/* below is for ring provided buffers */
	__u16 buf_nr_pages;
	__u16 nr_entries;
	__u16 head;
	__u16 mask;
};
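
/*
 * Editorial note: the union above is discriminated by ->buf_nr_pages. A
 * hypothetical helper (not present in this file) that tells ring mapped
 * buffers apart from classic provided buffers could simply be:
 *
 *	static inline bool io_bl_is_ring_mapped(struct io_buffer_list *bl)
 *	{
 *		return bl->buf_nr_pages != 0;
 *	}
 */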

struct io_buffer {
	struct list_head list;
	__u64 addr;
	__u32 len;
	__u16 bid;
	__u16 bgid;
};

#define IO_COMPL_BATCH			32
#define IO_REQ_CACHE_SIZE		32
#define IO_REQ_ALLOC_BATCH		8

#define BGID_ARRAY			64

/*
 * Arbitrary limit, can be raised if need be
 */
#define IO_RINGFD_REG_MAX 16

struct io_uring_task {
	/* submission side */
	int			cached_refs;
	struct xarray		xa;
	struct wait_queue_head	wait;
	const struct io_ring_ctx *last;
	struct io_wq		*io_wq;
	struct percpu_counter	inflight;
	atomic_t		inflight_tracked;
	atomic_t		in_idle;

	spinlock_t		task_lock;
	struct io_wq_work_list	task_list;
	struct io_wq_work_list	prio_task_list;
	struct callback_head	task_work;
	struct file		**registered_rings;
	bool			task_running;
};

/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */
struct io_poll {
	struct file			*file;
	struct wait_queue_head		*head;
	__poll_t			events;
	struct wait_queue_entry		wait;
};

struct io_poll_update {
	struct file			*file;
	u64				old_user_data;
	u64				new_user_data;
	__poll_t			events;
	bool				update_events;
	bool				update_user_data;
};

struct io_cancel {
	struct file			*file;
	u64				addr;
	u32				flags;
	s32				fd;
};

struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u32				len;
	rwf_t				flags;
};

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};

struct io_rw_state {
	struct iov_iter			iter;
	struct iov_iter_state		iter_state;
	struct iovec			fast_iov[UIO_FASTIOV];
};

struct io_async_rw {
	struct io_rw_state		s;
	const struct iovec		*free_iovec;
	size_t				bytes_done;
	struct wait_page_queue		wpq;
};

struct async_poll {
	struct io_poll		poll;
	struct io_poll		*double_poll;
};

enum {
	IORING_RSRC_FILE		= 0,
	IORING_RSRC_BUFFER		= 1,
};

enum {
	IO_CHECK_CQ_OVERFLOW_BIT,
	IO_CHECK_CQ_DROPPED_BIT,
};

struct io_tctx_node {
	struct list_head	ctx_node;
	struct task_struct	*task;
	struct io_ring_ctx	*ctx;
};

struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};

struct io_op_def {
	/* needs req->file assigned */
	unsigned		needs_file : 1;
	/* should block plug */
	unsigned		plug : 1;
	/* hash wq insertion if file is a regular file */
	unsigned		hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned		unbound_nonreg_file : 1;
	/* set if opcode supports polled "wait" */
	unsigned		pollin : 1;
	unsigned		pollout : 1;
	unsigned		poll_exclusive : 1;
	/* op supports buffer selection */
	unsigned		buffer_select : 1;
	/* opcode is not supported by this kernel */
	unsigned		not_supported : 1;
	/* skip auditing */
	unsigned		audit_skip : 1;
	/* supports ioprio */
	unsigned		ioprio : 1;
	/* supports iopoll */
	unsigned		iopoll : 1;
	/* size of async data needed, if any */
	unsigned short		async_size;

	int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
	int (*issue)(struct io_kiocb *, unsigned int);
	int (*prep_async)(struct io_kiocb *);
	void (*cleanup)(struct io_kiocb *);
};

static const struct io_op_def io_op_defs[];
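
/*
 * Editorial sketch: a hypothetical io_op_defs[] entry, to illustrate how the
 * fields of struct io_op_def above are wired up per opcode. The handler
 * names io_nop_prep/io_nop are assumptions taken from the included nop.h;
 * the real table is defined elsewhere in this file and may differ.
 *
 *	[IORING_OP_NOP] = {
 *		.audit_skip		= 1,
 *		.iopoll			= 1,
 *		.prep			= io_nop_prep,
 *		.issue			= io_nop,
 *	},
 */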

/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)

static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);

static void io_dismantle_req(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static void io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static void io_req_task_queue(struct io_kiocb *req);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);

static void io_eventfd_signal(struct io_ring_ctx *ctx);

static struct kmem_cache *req_cachep;

static const struct file_operations io_uring_fops;

const char *io_uring_get_opcode(u8 opcode)
{
	switch ((enum io_uring_op)opcode) {
	case IORING_OP_NOP:
		return "NOP";
	case IORING_OP_READV:
		return "READV";
	case IORING_OP_WRITEV:
		return "WRITEV";
	case IORING_OP_FSYNC:
		return "FSYNC";
	case IORING_OP_READ_FIXED:
		return "READ_FIXED";
	case IORING_OP_WRITE_FIXED:
		return "WRITE_FIXED";
	case IORING_OP_POLL_ADD:
		return "POLL_ADD";
	case IORING_OP_POLL_REMOVE:
		return "POLL_REMOVE";
	case IORING_OP_SYNC_FILE_RANGE:
		return "SYNC_FILE_RANGE";
	case IORING_OP_SENDMSG:
		return "SENDMSG";
	case IORING_OP_RECVMSG:
		return "RECVMSG";
	case IORING_OP_TIMEOUT:
		return "TIMEOUT";
	case IORING_OP_TIMEOUT_REMOVE:
		return "TIMEOUT_REMOVE";
	case IORING_OP_ACCEPT:
		return "ACCEPT";
	case IORING_OP_ASYNC_CANCEL:
		return "ASYNC_CANCEL";
	case IORING_OP_LINK_TIMEOUT:
		return "LINK_TIMEOUT";
	case IORING_OP_CONNECT:
		return "CONNECT";
	case IORING_OP_FALLOCATE:
		return "FALLOCATE";
	case IORING_OP_OPENAT:
		return "OPENAT";
	case IORING_OP_CLOSE:
		return "CLOSE";
	case IORING_OP_FILES_UPDATE:
		return "FILES_UPDATE";
	case IORING_OP_STATX:
		return "STATX";
	case IORING_OP_READ:
		return "READ";
	case IORING_OP_WRITE:
		return "WRITE";
	case IORING_OP_FADVISE:
		return "FADVISE";
	case IORING_OP_MADVISE:
		return "MADVISE";
	case IORING_OP_SEND:
		return "SEND";
	case IORING_OP_RECV:
		return "RECV";
	case IORING_OP_OPENAT2:
		return "OPENAT2";
	case IORING_OP_EPOLL_CTL:
		return "EPOLL_CTL";
	case IORING_OP_SPLICE:
		return "SPLICE";
	case IORING_OP_PROVIDE_BUFFERS:
		return "PROVIDE_BUFFERS";
	case IORING_OP_REMOVE_BUFFERS:
		return "REMOVE_BUFFERS";
	case IORING_OP_TEE:
		return "TEE";
	case IORING_OP_SHUTDOWN:
		return "SHUTDOWN";
	case IORING_OP_RENAMEAT:
		return "RENAMEAT";
	case IORING_OP_UNLINKAT:
		return "UNLINKAT";
	case IORING_OP_MKDIRAT:
		return "MKDIRAT";
	case IORING_OP_SYMLINKAT:
		return "SYMLINKAT";
	case IORING_OP_LINKAT:
		return "LINKAT";
	case IORING_OP_MSG_RING:
		return "MSG_RING";
	case IORING_OP_FSETXATTR:
		return "FSETXATTR";
	case IORING_OP_SETXATTR:
		return "SETXATTR";
	case IORING_OP_FGETXATTR:
		return "FGETXATTR";
	case IORING_OP_GETXATTR:
		return "GETXATTR";
	case IORING_OP_SOCKET:
		return "SOCKET";
	case IORING_OP_URING_CMD:
		return "URING_CMD";
	case IORING_OP_LAST:
		return "INVALID";
	}
	return "INVALID";
}

bool io_is_uring_fops(struct file *file)
{
	return file->f_op == &io_uring_fops;
}

struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
	if (io_is_uring_fops(file)) {
		struct io_ring_ctx *ctx = file->private_data;

		return ctx->ring_sock->sk;
	}
#endif
	return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);

#if defined(CONFIG_UNIX)
static inline bool io_file_need_scm(struct file *filp)
{
#if defined(IO_URING_SCM_ALL)
	return true;
#else
	return !!unix_get_socket(filp);
#endif
}
#else
static inline bool io_file_need_scm(struct file *filp)
{
	return false;
}
#endif

static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
	if (!*locked) {
		mutex_lock(&ctx->uring_lock);
		*locked = true;
	}
}

static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
	if (!wq_list_empty(&ctx->submit_state.compl_reqs))
		__io_submit_flush_completions(ctx);
}

#define IO_RSRC_REF_BATCH	100

static void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
{
	percpu_ref_put_many(&node->refs, nr);
}

static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
					  struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct io_rsrc_node *node = req->rsrc_node;

	if (node) {
		if (node == ctx->rsrc_node)
			ctx->rsrc_cached_refs++;
		else
			io_rsrc_put_node(node, 1);
	}
}

571
static inline void io_req_put_rsrc(struct io_kiocb *req)
572
{
573
	if (req->rsrc_node)
574
		io_rsrc_put_node(req->rsrc_node, 1);
575 576 577 578 579 580
}

static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	if (ctx->rsrc_cached_refs) {
581
		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
582 583 584 585 586 587 588 589 590 591
		ctx->rsrc_cached_refs = 0;
	}
}

static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
}
592

593
static inline void io_req_set_rsrc_node(struct io_kiocb *req,
594 595
					struct io_ring_ctx *ctx,
					unsigned int issue_flags)
596
{
597 598
	if (!req->rsrc_node) {
		req->rsrc_node = ctx->rsrc_node;
599 600 601 602 603 604 605

		if (!(issue_flags & IO_URING_F_UNLOCKED)) {
			lockdep_assert_held(&ctx->uring_lock);
			ctx->rsrc_cached_refs--;
			if (unlikely(ctx->rsrc_cached_refs < 0))
				io_rsrc_refs_refill(ctx);
		} else {
			percpu_ref_get(&req->rsrc_node->refs);
		}
	}
}

static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
{
	if (req->flags & REQ_F_BUFFER_RING) {
		if (req->buf_list)
			req->buf_list->head++;
		req->flags &= ~REQ_F_BUFFER_RING;
	} else {
		list_add(&req->kbuf->list, list);
		req->flags &= ~REQ_F_BUFFER_SELECTED;
	}

	return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
}

static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
	lockdep_assert_held(&req->ctx->completion_lock);

	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
		return 0;
	return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
}

inline unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
	unsigned int cflags;

	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
		return 0;

	/*
	 * We can add this buffer back to two lists:
	 *
	 * 1) The io_buffers_cache list. This one is protected by the
	 *    ctx->uring_lock. If we already hold this lock, add back to this
	 *    list as we can grab it from issue as well.
	 * 2) The io_buffers_comp list. This one is protected by the
	 *    ctx->completion_lock.
	 *
	 * We migrate buffers from the comp_list to the issue cache list
	 * when we need one.
	 */
	if (req->flags & REQ_F_BUFFER_RING) {
		/* no buffers to recycle for this case */
		cflags = __io_put_kbuf(req, NULL);
	} else if (issue_flags & IO_URING_F_UNLOCKED) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock(&ctx->completion_lock);
		cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
	}

	return cflags;
}
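
/*
 * Editorial usage note: the value returned by io_put_kbuf() is a CQE flags
 * word. A typical (hedged) caller pattern, mirroring io_req_complete_failed()
 * further down in this file, is:
 *
 *	io_req_set_res(req, res, io_put_kbuf(req, issue_flags));
 *	io_req_complete_post(req);
 */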

static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
						 unsigned int bgid)
{
	if (ctx->io_bl && bgid < BGID_ARRAY)
		return &ctx->io_bl[bgid];

	return xa_load(&ctx->io_bl_xa, bgid);
}

static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
		return;
	/*
	 * For legacy provided buffer mode, don't recycle if we already did
	 * IO to this buffer. For ring-mapped provided buffer mode, we should
	 * increment ring->head to explicitly monopolize the buffer to avoid
	 * multiple use.
	 */
	if ((req->flags & REQ_F_BUFFER_SELECTED) &&
	    (req->flags & REQ_F_PARTIAL_IO))
		return;

	/*
	 * READV uses fields in `struct io_rw` (len/addr) to stash the selected
	 * buffer data. However if that buffer is recycled the original request
	 * data stored in addr is lost. Therefore forbid recycling for now.
	 */
	if (req->opcode == IORING_OP_READV)
		return;

	/*
	 * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
	 * the flag and hence ensure that bl->head doesn't get incremented.
	 * If the tail has already been incremented, hang on to it.
	 */
	if (req->flags & REQ_F_BUFFER_RING) {
		if (req->buf_list) {
			if (req->flags & REQ_F_PARTIAL_IO) {
				req->buf_list->head++;
				req->buf_list = NULL;
			} else {
				req->buf_index = req->buf_list->bgid;
				req->flags &= ~REQ_F_BUFFER_RING;
			}
		}
		return;
	}

	io_ring_submit_lock(ctx, issue_flags);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->buf_index = buf->bgid;

	io_ring_submit_unlock(ctx, issue_flags);
}

static bool io_match_linked(struct io_kiocb *head)
{
	struct io_kiocb *req;

	io_for_each_link(req, head) {
		if (req->flags & REQ_F_INFLIGHT)
			return true;
	}
	return false;
}

/*
 * As io_match_task() but protected against racing with linked timeouts.
 * User must not hold timeout_lock.
 */
static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			       bool cancel_all)
{
	bool matched;

	if (task && head->task != task)
		return false;
	if (cancel_all)
		return true;

	if (head->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = head->ctx;

		/* protect against races with linked timeouts */
		spin_lock_irq(&ctx->timeout_lock);
		matched = io_match_linked(head);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		matched = io_match_linked(head);
	}
	return matched;
}

static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
	req_set_fail(req);
	io_req_set_res(req, res, 0);
}

static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}

static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

	complete(&ctx->ref_comp);
}

static __cold void io_fallback_req_func(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
						fallback_work.work);
	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
	struct io_kiocb *req, *tmp;
	bool locked = false;

	percpu_ref_get(&ctx->refs);
	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
		req->io_task_work.func(req, &locked);

	if (locked) {
		io_submit_flush_completions(ctx);
		mutex_unlock(&ctx->uring_lock);
	}
	percpu_ref_put(&ctx->refs);
}

static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx;
	int hash_bits;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	xa_init(&ctx->io_bl_xa);

	/*
	 * Use 5 bits less than the max cq entries, that should give us around
	 * 32 entries per hash list if totally full and uniformly spread.
	 */
	hash_bits = ilog2(p->cq_entries);
	hash_bits -= 5;
	if (hash_bits <= 0)
		hash_bits = 1;
	ctx->cancel_hash_bits = hash_bits;
	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
					GFP_KERNEL);
	if (!ctx->cancel_hash)
		goto err;
	__hash_init(ctx->cancel_hash, 1U << hash_bits);

	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
	if (!ctx->dummy_ubuf)
		goto err;
	/* set invalid range, so io_import_fixed() fails meeting it */
	ctx->dummy_ubuf->ubuf = -1UL;

	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		goto err;

	ctx->flags = p->flags;
	init_waitqueue_head(&ctx->sqo_sq_wait);
	INIT_LIST_HEAD(&ctx->sqd_list);
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
	INIT_LIST_HEAD(&ctx->io_buffers_cache);
	INIT_LIST_HEAD(&ctx->apoll_cache);
	init_completion(&ctx->ref_comp);
	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
	mutex_init(&ctx->uring_lock);
	init_waitqueue_head(&ctx->cq_wait);
	spin_lock_init(&ctx->completion_lock);
	spin_lock_init(&ctx->timeout_lock);
	INIT_WQ_LIST(&ctx->iopoll_list);
	INIT_LIST_HEAD(&ctx->io_buffers_pages);
	INIT_LIST_HEAD(&ctx->io_buffers_comp);
	INIT_LIST_HEAD(&ctx->defer_list);
	INIT_LIST_HEAD(&ctx->timeout_list);
	INIT_LIST_HEAD(&ctx->ltimeout_list);
	spin_lock_init(&ctx->rsrc_ref_lock);
	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
	init_llist_head(&ctx->rsrc_put_llist);
	INIT_LIST_HEAD(&ctx->tctx_list);
	ctx->submit_state.free_list.next = NULL;
	INIT_WQ_LIST(&ctx->locked_free_list);
	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
	return ctx;
err:
	kfree(ctx->dummy_ubuf);
	kfree(ctx->cancel_hash);
	kfree(ctx->io_bl);
	xa_destroy(&ctx->io_bl_xa);
	kfree(ctx);
	return NULL;
}

static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
	ctx->cq_extra--;
}

static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
		struct io_ring_ctx *ctx = req->ctx;

		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
	}

	return false;
}

static inline bool io_req_ffs_set(struct io_kiocb *req)
{
	return req->flags & REQ_F_FIXED_FILE;
}

static inline void io_req_track_inflight(struct io_kiocb *req)
{
	if (!(req->flags & REQ_F_INFLIGHT)) {
		req->flags |= REQ_F_INFLIGHT;
		atomic_inc(&req->task->io_uring->inflight_tracked);
	}
}

static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
	if (WARN_ON_ONCE(!req->link))
		return NULL;

	req->flags &= ~REQ_F_ARM_LTIMEOUT;
	req->flags |= REQ_F_LINK_TIMEOUT;

	/* linked timeouts should have two refs once prep'ed */
	io_req_set_refcount(req);
	__io_req_set_refcount(req->link, 2);
	return req->link;
}

static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
		return NULL;
	return __io_prep_linked_timeout(req);
}

static noinline void __io_arm_ltimeout(struct io_kiocb *req)
{
	io_queue_linked_timeout(__io_prep_linked_timeout(req));
}

static inline void io_arm_ltimeout(struct io_kiocb *req)
{
	if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
		__io_arm_ltimeout(req);
}

static void io_prep_async_work(struct io_kiocb *req)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	struct io_ring_ctx *ctx = req->ctx;

	if (!(req->flags & REQ_F_CREDS)) {
		req->flags |= REQ_F_CREDS;
		req->creds = get_current_cred();
	}

	req->work.list.next = NULL;
	req->work.flags = 0;
	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
	if (req->flags & REQ_F_FORCE_ASYNC)
		req->work.flags |= IO_WQ_WORK_CONCURRENT;

	if (req->flags & REQ_F_ISREG) {
		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
			io_wq_hash_work(&req->work, file_inode(req->file));
	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
		if (def->unbound_nonreg_file)
			req->work.flags |= IO_WQ_WORK_UNBOUND;
	}
}

static void io_prep_async_link(struct io_kiocb *req)
{
	struct io_kiocb *cur;

	if (req->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock_irq(&ctx->timeout_lock);
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
	}
}

static inline void io_req_add_compl_list(struct io_kiocb *req)
{
	struct io_submit_state *state = &req->ctx->submit_state;

	if (!(req->flags & REQ_F_CQE_SKIP))
		state->flush_cqes = true;
	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}

static void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
{
	struct io_kiocb *link = io_prep_linked_timeout(req);
	struct io_uring_task *tctx = req->task->io_uring;

	BUG_ON(!tctx);
	BUG_ON(!tctx->io_wq);

	/* init ->work of the whole link before punting */
	io_prep_async_link(req);

	/*
	 * Not expected to happen, but if we do have a bug where this _can_
	 * happen, catch it here and ensure the request is marked as
	 * canceled. That will make io-wq go through the usual work cancel
	 * procedure rather than attempt to run this request (or create a new
	 * worker for it).
	 */
	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
		req->work.flags |= IO_WQ_WORK_CANCEL;

	trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
					req->opcode, req->flags, &req->work,
					io_wq_is_hashed(&req->work));
	io_wq_enqueue(tctx->io_wq, &req->work);
	if (link)
		io_queue_linked_timeout(link);
}

static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
	while (!list_empty(&ctx->defer_list)) {
		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
						struct io_defer_entry, list);

		if (req_need_defer(de->req, de->seq))
			break;
		list_del_init(&de->list);
		io_req_task_queue(de->req);
		kfree(de);
	}
}

static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
	if (ctx->off_timeout_used || ctx->drain_active) {
		spin_lock(&ctx->completion_lock);
		if (ctx->off_timeout_used)
			io_flush_timeouts(ctx);
		if (ctx->drain_active)
			io_queue_deferred(ctx);
		io_commit_cqring(ctx);
		spin_unlock(&ctx->completion_lock);
	}
	if (ctx->has_evfd)
		io_eventfd_signal(ctx);
}

static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}

/*
 * writes to the cq entry need to come after reading head; the
 * control dependency is enough as we're using WRITE_ONCE to
 * fill the cq entry
 */
static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;
	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
	unsigned int shift = 0;
	unsigned int free, queued, len;

	if (ctx->flags & IORING_SETUP_CQE32)
		shift = 1;

	/* userspace may cheat modifying the tail, be safe and do min */
	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
	free = ctx->cq_entries - queued;
	/* we need a contiguous range, limit based on the current array offset */
	len = min(free, ctx->cq_entries - off);
	if (!len)
		return NULL;

	ctx->cached_cq_tail++;
	ctx->cqe_cached = &rings->cqes[off];
	ctx->cqe_sentinel = ctx->cqe_cached + len;
	ctx->cqe_cached++;
	return &rings->cqes[off << shift];
}

static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
{
	if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
		struct io_uring_cqe *cqe = ctx->cqe_cached;

		if (ctx->flags & IORING_SETUP_CQE32) {
			unsigned int off = ctx->cqe_cached - ctx->rings->cqes;

			cqe += off;
		}

		ctx->cached_cq_tail++;
		ctx->cqe_cached++;
		return cqe;
	}

	return __io_get_cqe(ctx);
}

static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	rcu_read_lock();
	/*
	 * rcu_dereference ctx->io_ev_fd once and use it for both checking
	 * and eventfd_signal
	 */
	ev_fd = rcu_dereference(ctx->io_ev_fd);

	/*
	 * Check again if ev_fd exists in case an io_eventfd_unregister call
	 * completed between the NULL check of ctx->io_ev_fd at the start of
	 * the function and rcu_read_lock.
	 */
	if (unlikely(!ev_fd))
		goto out;
	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		goto out;

	if (!ev_fd->eventfd_async || io_wq_current_is_worker())
		eventfd_signal(ev_fd->cq_ev_fd, 1);
out:
	rcu_read_unlock();
}

static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
	/*
	 * wake_up_all() may seem excessive, but io_wake_function() and
	 * io_should_wake() handle the termination of the loop and only
	 * wake as many waiters as we need to.
	 */
	if (wq_has_sleeper(&ctx->cq_wait))
		wake_up_all(&ctx->cq_wait);
}

/*
 * This should only get called when at least one event has been posted.
 * Some applications rely on the eventfd notification count only changing
 * IFF a new CQE has been added to the CQ ring. There's no dependency on
 * 1:1 relationship between how many times this function is called (and
 * hence the eventfd count) and number of CQEs posted to the CQ ring.
 */
void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
	if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
		     ctx->has_evfd))
		__io_commit_cqring_flush(ctx);

	io_cqring_wake(ctx);
}

static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
	if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
		     ctx->has_evfd))
		__io_commit_cqring_flush(ctx);

	if (ctx->flags & IORING_SETUP_SQPOLL)
		io_cqring_wake(ctx);
}

/* Returns true if there are no backlogged entries after the flush */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
	bool all_flushed, posted;
	size_t cqe_size = sizeof(struct io_uring_cqe);

	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
		return false;

	if (ctx->flags & IORING_SETUP_CQE32)
		cqe_size <<= 1;

	posted = false;
	spin_lock(&ctx->completion_lock);
	while (!list_empty(&ctx->cq_overflow_list)) {
		struct io_uring_cqe *cqe = io_get_cqe(ctx);
		struct io_overflow_cqe *ocqe;

		if (!cqe && !force)
			break;
		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
		if (cqe)
			memcpy(cqe, &ocqe->cqe, cqe_size);
		else
			io_account_cq_overflow(ctx);

		posted = true;
		list_del(&ocqe->list);
		kfree(ocqe);
	}

	all_flushed = list_empty(&ctx->cq_overflow_list);
	if (all_flushed) {
		clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
		atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
	}

	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	if (posted)
		io_cqring_ev_posted(ctx);
	return all_flushed;
}

static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
	bool ret = true;

	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
		/* iopoll syncs against uring_lock, not completion_lock */
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_lock(&ctx->uring_lock);
		ret = __io_cqring_overflow_flush(ctx, false);
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_unlock(&ctx->uring_lock);
	}

	return ret;
}

static void __io_put_task(struct task_struct *task, int nr)
{
	struct io_uring_task *tctx = task->io_uring;

	percpu_counter_sub(&tctx->inflight, nr);
	if (unlikely(atomic_read(&tctx->in_idle)))
		wake_up(&tctx->wait);
	put_task_struct_many(task, nr);
}

/* must be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task, int nr)
{
	if (likely(task == current))
		task->io_uring->cached_refs += nr;
	else
		__io_put_task(task, nr);
}

static void io_task_refs_refill(struct io_uring_task *tctx)
{
	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;

	percpu_counter_add(&tctx->inflight, refill);
	refcount_add(refill, &current->usage);
	tctx->cached_refs += refill;
}

static inline void io_get_task_refs(int nr)
{
	struct io_uring_task *tctx = current->io_uring;

	tctx->cached_refs -= nr;
	if (unlikely(tctx->cached_refs < 0))
		io_task_refs_refill(tctx);
}

static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
	struct io_uring_task *tctx = task->io_uring;
	unsigned int refs = tctx->cached_refs;

	if (refs) {
		tctx->cached_refs = 0;
		percpu_counter_sub(&tctx->inflight, refs);
		put_task_struct_many(task, refs);
	}
}

static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
				     s32 res, u32 cflags, u64 extra1,
				     u64 extra2)
{
	struct io_overflow_cqe *ocqe;
	size_t ocq_size = sizeof(struct io_overflow_cqe);
	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);

	if (is_cqe32)
		ocq_size += sizeof(struct io_uring_cqe);

	ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
	trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
	if (!ocqe) {
		/*
		 * If we're in ring overflow flush mode, or in task cancel mode,
		 * or cannot allocate an overflow entry, then we need to drop it
		 * on the floor.
		 */
		io_account_cq_overflow(ctx);
		set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
		return false;
	}
	if (list_empty(&ctx->cq_overflow_list)) {
		set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
		atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);

	}
	ocqe->cqe.user_data = user_data;
	ocqe->cqe.res = res;
	ocqe->cqe.flags = cflags;
	if (is_cqe32) {
		ocqe->cqe.big_cqe[0] = extra1;
		ocqe->cqe.big_cqe[1] = extra2;
	}
	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
	return true;
}

static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
				     struct io_kiocb *req)
{
	struct io_uring_cqe *cqe;

	if (!(ctx->flags & IORING_SETUP_CQE32)) {
		trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
					req->cqe.res, req->cqe.flags, 0, 0);

		/*
		 * If we can't get a cq entry, userspace overflowed the
		 * submission (by quite a lot). Increment the overflow count in
		 * the ring.
		 */
		cqe = io_get_cqe(ctx);
		if (likely(cqe)) {
			memcpy(cqe, &req->cqe, sizeof(*cqe));
			return true;
		}

		return io_cqring_event_overflow(ctx, req->cqe.user_data,
						req->cqe.res, req->cqe.flags,
						0, 0);
	} else {
		u64 extra1 = 0, extra2 = 0;

		if (req->flags & REQ_F_CQE32_INIT) {
			extra1 = req->extra1;
			extra2 = req->extra2;
		}

		trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
					req->cqe.res, req->cqe.flags, extra1, extra2);

		/*
		 * If we can't get a cq entry, userspace overflowed the
		 * submission (by quite a lot). Increment the overflow count in
		 * the ring.
		 */
		cqe = io_get_cqe(ctx);
		if (likely(cqe)) {
			memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe));
			WRITE_ONCE(cqe->big_cqe[0], extra1);
			WRITE_ONCE(cqe->big_cqe[1], extra2);
			return true;
		}

		return io_cqring_event_overflow(ctx, req->cqe.user_data,
				req->cqe.res, req->cqe.flags,
				extra1, extra2);
	}
}

bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
		     u32 cflags)
{
	struct io_uring_cqe *cqe;

	ctx->cq_extra++;
	trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);

	/*
	 * If we can't get a cq entry, userspace overflowed the
	 * submission (by quite a lot). Increment the overflow count in
	 * the ring.
	 */
	cqe = io_get_cqe(ctx);
	if (likely(cqe)) {
		WRITE_ONCE(cqe->user_data, user_data);
		WRITE_ONCE(cqe->res, res);
		WRITE_ONCE(cqe->flags, cflags);

		if (ctx->flags & IORING_SETUP_CQE32) {
			WRITE_ONCE(cqe->big_cqe[0], 0);
			WRITE_ONCE(cqe->big_cqe[1], 0);
		}
		return true;
	}
	return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
}

static void __io_req_complete_put(struct io_kiocb *req)
{
	/*
	 * If we're the last reference to this request, add to our locked
	 * free_list cache.
	 */
	if (req_ref_put_and_test(req)) {
		struct io_ring_ctx *ctx = req->ctx;

		if (req->flags & IO_REQ_LINK_FLAGS) {
			if (req->flags & IO_DISARM_MASK)
				io_disarm_next(req);
			if (req->link) {
				io_req_task_queue(req->link);
				req->link = NULL;
			}
		}
		io_req_put_rsrc(req);
		/*
		 * Selected buffer deallocation in io_clean_op() assumes that
		 * we don't hold ->completion_lock. Clean them here to avoid
		 * deadlocks.
		 */
		io_put_kbuf_comp(req);
		io_dismantle_req(req);
		io_put_task(req->task, 1);
		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
		ctx->locked_free_nr++;
	}
}

void __io_req_complete_post(struct io_kiocb *req)
{
	if (!(req->flags & REQ_F_CQE_SKIP))
		__io_fill_cqe_req(req->ctx, req);
	__io_req_complete_put(req);
}

void io_req_complete_post(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	spin_lock(&ctx->completion_lock);
	__io_req_complete_post(req);
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_ev_posted(ctx);
}

inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags)
{
	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
		req->flags |= REQ_F_COMPLETE_INLINE;
	else
		io_req_complete_post(req);
}

static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
	req_set_fail(req);
	io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
	io_req_complete_post(req);
}

/*
 * Don't initialise the fields below on every allocation, but do that in
 * advance and keep them valid across allocations.
 */
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	req->ctx = ctx;
	req->link = NULL;
	req->async_data = NULL;
	/* not necessary, but safer to zero */
	req->cqe.res = 0;
}

static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
					struct io_submit_state *state)
{
	spin_lock(&ctx->completion_lock);
	wq_list_splice(&ctx->locked_free_list, &state->free_list);
	ctx->locked_free_nr = 0;
	spin_unlock(&ctx->completion_lock);
}

static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
{
	return !ctx->submit_state.free_list.next;
}

/*
 * A request might get retired back into the request caches even before opcode
 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 * Because of that, io_alloc_req() should be called only under ->uring_lock
 * and with extra caution to not get a request that is still worked on.
 */
static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	void *reqs[IO_REQ_ALLOC_BATCH];
	int ret, i;

	/*
	 * If we have more than a batch's worth of requests in our IRQ side
	 * locked cache, grab the lock and move them over to our submission
	 * side cache.
	 */
	if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
		io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
		if (!io_req_cache_empty(ctx))
			return true;
	}

	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);

	/*
	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
	 * retry single alloc to be on the safe side.
	 */
	if (unlikely(ret <= 0)) {
		reqs[0] = kmem_cache_alloc(req_cachep, gfp);
		if (!reqs[0])
			return false;
		ret = 1;
	}

	percpu_ref_get_many(&ctx->refs, ret);
	for (i = 0; i < ret; i++) {
		struct io_kiocb *req = reqs[i];

		io_preinit_req(req, ctx);
		io_req_add_to_cache(req, ctx);
	}
	return true;
}

static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
{
	if (unlikely(io_req_cache_empty(ctx)))
		return __io_alloc_req_refill(ctx);
	return true;
}

static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
{
	struct io_wq_work_node *node;

	node = wq_stack_extract(&ctx->submit_state.free_list);
	return container_of(node, struct io_kiocb, comp_list);
}

static inline void io_dismantle_req(struct io_kiocb *req)
{
	unsigned int flags = req->flags;

	if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
		io_clean_op(req);
	if (!(flags & REQ_F_FIXED_FILE))
		io_put_file(req->file);
}

__cold void io_free_req(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	io_req_put_rsrc(req);
	io_dismantle_req(req);
	io_put_task(req->task, 1);

	spin_lock(&ctx->completion_lock);
	wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
	ctx->locked_free_nr++;
	spin_unlock(&ctx->completion_lock);
}

static void __io_req_find_next_prep(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool posted;

	spin_lock(&ctx->completion_lock);
	posted = io_disarm_next(req);
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	if (posted)
		io_cqring_ev_posted(ctx);
}

static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
	struct io_kiocb *nxt;

	/*
	 * If LINK is set, we have dependent requests in this chain. If we
	 * didn't fail this request, queue the first one up, moving any other
	 * dependencies to the next request. In case of failure, fail the rest
	 * of the chain.
	 */
	if (unlikely(req->flags & IO_DISARM_MASK))
		__io_req_find_next_prep(req);
	nxt = req->link;
	req->link = NULL;
	return nxt;
}

static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
	if (!ctx)
		return;
	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
	if (*locked) {
		io_submit_flush_completions(ctx);
		mutex_unlock(&ctx->uring_lock);
		*locked = false;
	}
	percpu_ref_put(&ctx->refs);
}

static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
{
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_ev_posted(ctx);
}

static void handle_prev_tw_list(struct io_wq_work_node *node,
				struct io_ring_ctx **ctx, bool *uring_locked)
{
	if (*ctx && !*uring_locked)
		spin_lock(&(*ctx)->completion_lock);

	do {
		struct io_wq_work_node *next = node->next;
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);

		prefetch(container_of(next, struct io_kiocb, io_task_work.node));

		if (req->ctx != *ctx) {
			if (unlikely(!*uring_locked && *ctx))
				ctx_commit_and_unlock(*ctx);

			ctx_flush_and_put(*ctx, uring_locked);
			*ctx = req->ctx;
			/* if not contended, grab and improve batching */
			*uring_locked = mutex_trylock(&(*ctx)->uring_lock);
			percpu_ref_get(&(*ctx)->refs);
			if (unlikely(!*uring_locked))
				spin_lock(&(*ctx)->completion_lock);
		}
		if (likely(*uring_locked)) {
			req->io_task_work.func(req, uring_locked);
		} else {
			req->cqe.flags = io_put_kbuf_comp(req);
			__io_req_complete_post(req);
		}
		node = next;
	} while (node);

	if (unlikely(!*uring_locked))
		ctx_commit_and_unlock(*ctx);
}

static void handle_tw_list(struct io_wq_work_node *node,
			   struct io_ring_ctx **ctx, bool *locked)
{
	do {
		struct io_wq_work_node *next = node->next;
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);

		prefetch(container_of(next, struct io_kiocb, io_task_work.node));

		if (req->ctx != *ctx) {
			ctx_flush_and_put(*ctx, locked);
			*ctx = req->ctx;
			/* if not contended, grab and improve batching */
			*locked = mutex_trylock(&(*ctx)->uring_lock);
			percpu_ref_get(&(*ctx)->refs);
		}
		req->io_task_work.func(req, locked);
		node = next;
	} while (node);
}

static void tctx_task_work(struct callback_head *cb)
{
	bool uring_locked = false;
	struct io_ring_ctx *ctx = NULL;
	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
						  task_work);

	while (1) {
		struct io_wq_work_node *node1, *node2;

		spin_lock_irq(&tctx->task_lock);
		node1 = tctx->prio_task_list.first;
		node2 = tctx->task_list.first;
		INIT_WQ_LIST(&tctx->task_list);
		INIT_WQ_LIST(&tctx->prio_task_list);
		if (!node2 && !node1)
			tctx->task_running = false;
		spin_unlock_irq(&tctx->task_lock);
		if (!node2 && !node1)
			break;

		if (node1)
			handle_prev_tw_list(node1, &ctx, &uring_locked);
		if (node2)
			handle_tw_list(node2, &ctx, &uring_locked);
		cond_resched();

		if (data_race(!tctx->task_list.first) &&
		    data_race(!tctx->prio_task_list.first) && uring_locked)
			io_submit_flush_completions(ctx);
	}

	ctx_flush_and_put(ctx, &uring_locked);

	/* relaxed read is enough as only the task itself sets ->in_idle */
	if (unlikely(atomic_read(&tctx->in_idle)))
		io_uring_drop_tctx_refs(current);
}

static void __io_req_task_work_add(struct io_kiocb *req,
				   struct io_uring_task *tctx,
				   struct io_wq_work_list *list)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_wq_work_node *node;
	unsigned long flags;
	bool running;

	spin_lock_irqsave(&tctx->task_lock, flags);
	wq_list_add_tail(&req->io_task_work.node, list);
	running = tctx->task_running;
	if (!running)
		tctx->task_running = true;
	spin_unlock_irqrestore(&tctx->task_lock, flags);

	/* task_work already pending, we're done */
	if (running)
		return;

	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);

	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
		return;

	spin_lock_irqsave(&tctx->task_lock, flags);
	tctx->task_running = false;
	node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
	spin_unlock_irqrestore(&tctx->task_lock, flags);

	while (node) {
		req = container_of(node, struct io_kiocb, io_task_work.node);
		node = node->next;
		if (llist_add(&req->io_task_work.fallback_node,
			      &req->ctx->fallback_llist))
			schedule_delayed_work(&req->ctx->fallback_work, 1);
	}
}

void io_req_task_work_add(struct io_kiocb *req)
{
	struct io_uring_task *tctx = req->task->io_uring;

	__io_req_task_work_add(req, tctx, &tctx->task_list);
}

static void io_req_task_prio_work_add(struct io_kiocb *req)
{
	struct io_uring_task *tctx = req->task->io_uring;

	if (req->ctx->flags & IORING_SETUP_SQPOLL)
		__io_req_task_work_add(req, tctx, &tctx->prio_task_list);
	else
		__io_req_task_work_add(req, tctx, &tctx->task_list);
}

static void io_req_tw_post(struct io_kiocb *req, bool *locked)
{
	io_req_complete_post(req);
}

void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
{
	io_req_set_res(req, res, cflags);
	req->io_task_work.func = io_req_tw_post;
	io_req_task_work_add(req);
}

static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
{
	/* not needed for normal modes, but SQPOLL depends on it */
	io_tw_lock(req->ctx, locked);
	io_req_complete_failed(req, req->cqe.res);
}

static void io_req_task_submit(struct io_kiocb *req, bool *locked)
{
	io_tw_lock(req->ctx, locked);
	/* req->task == current here, checking PF_EXITING is safe */
	if (likely(!(req->task->flags & PF_EXITING)))
		io_queue_sqe(req);
	else
		io_req_complete_failed(req, -EFAULT);
}

void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
	io_req_set_res(req, ret, 0);
	req->io_task_work.func = io_req_task_cancel;
	io_req_task_work_add(req);
}

static void io_req_task_queue(struct io_kiocb *req)
{
	req->io_task_work.func = io_req_task_submit;
	io_req_task_work_add(req);
}

static void io_req_task_queue_reissue(struct io_kiocb *req)
{
	req->io_task_work.func = io_queue_iowq;
	io_req_task_work_add(req);
}

void io_queue_next(struct io_kiocb *req)
1837
{
1838
	struct io_kiocb *nxt = io_req_find_next(req);
1839 1840

	if (nxt)
1841
		io_req_task_queue(nxt);
1842 1843
}

static void io_free_batch_list(struct io_ring_ctx *ctx,
				struct io_wq_work_node *node)
	__must_hold(&ctx->uring_lock)
{
	struct task_struct *task = NULL;
	int task_refs = 0;

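	/*
	 * Requests in a completion batch usually belong to the same task, so
	 * accumulate the task reference drops and release them whenever the
	 * owning task changes instead of one request at a time.
	 */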
	do {
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    comp_list);

		if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
			if (req->flags & REQ_F_REFCOUNT) {
				node = req->comp_list.next;
				if (!req_ref_put_and_test(req))
					continue;
			}
			if ((req->flags & REQ_F_POLLED) && req->apoll) {
				struct async_poll *apoll = req->apoll;

				if (apoll->double_poll)
					kfree(apoll->double_poll);
				list_add(&apoll->poll.wait.entry,
						&ctx->apoll_cache);
				req->flags &= ~REQ_F_POLLED;
			}
			if (req->flags & IO_REQ_LINK_FLAGS)
				io_queue_next(req);
			if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
				io_clean_op(req);
		}
		if (!(req->flags & REQ_F_FIXED_FILE))
			io_put_file(req->file);

		io_req_put_rsrc_locked(req, ctx);

		if (req->task != task) {
			if (task)
				io_put_task(task, task_refs);
			task = req->task;
			task_refs = 0;
		}
		task_refs++;
		node = req->comp_list.next;
		io_req_add_to_cache(req, ctx);
	} while (node);

	if (task)
		io_put_task(task, task_refs);
}

static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct io_wq_work_node *node, *prev;
	struct io_submit_state *state = &ctx->submit_state;

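	/*
	 * Post CQEs for the batched completions first (under the completion
	 * lock), then free the request structures themselves in one batch
	 * via io_free_batch_list() below.
	 */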
	if (state->flush_cqes) {
		spin_lock(&ctx->completion_lock);
		wq_list_for_each(node, prev, &state->compl_reqs) {
			struct io_kiocb *req = container_of(node, struct io_kiocb,
						    comp_list);

			if (!(req->flags & REQ_F_CQE_SKIP))
				__io_fill_cqe_req(ctx, req);
		}

		io_commit_cqring(ctx);
		spin_unlock(&ctx->completion_lock);
		io_cqring_ev_posted(ctx);
		state->flush_cqes = false;
	}

	io_free_batch_list(ctx, state->compl_reqs.first);
	INIT_WQ_LIST(&state->compl_reqs);
}

/*
 * Drop reference to request, return next in chain (if there is one) if this
 * was the last reference to this request.
 */
static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
{
	struct io_kiocb *nxt = NULL;

	if (req_ref_put_and_test(req)) {
		if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
			nxt = io_req_find_next(req);
		io_free_req(req);
	}
	return nxt;
}

static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
	/* See comment at the top of this file */
	smp_rmb();
	return __io_cqring_events(ctx);
}

int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
	struct io_wq_work_node *pos, *start, *prev;
	unsigned int poll_flags = BLK_POLL_NOSLEEP;
	DEFINE_IO_COMP_BATCH(iob);
	int nr_events = 0;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list.
	 */
	if (ctx->poll_multi_queue || force_nonspin)
		poll_flags |= BLK_POLL_ONESHOT;

	wq_list_for_each(pos, start, &ctx->iopoll_list) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
		struct io_rw *rw = io_kiocb_to_cmd(req);
		int ret;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed))
			break;

		ret = rw->kiocb.ki_filp->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
		if (unlikely(ret < 0))
			return ret;
		else if (ret)
			poll_flags |= BLK_POLL_ONESHOT;

		/* iopoll may have completed current req */
		if (!rq_list_empty(iob.req_list) ||
		    READ_ONCE(req->iopoll_completed))
			break;
	}

	if (!rq_list_empty(iob.req_list))
		iob.complete(&iob);
	else if (!pos)
		return 0;

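	/*
	 * Second pass: post CQEs for everything that has completed, stopping
	 * at the first request still in flight. The completed entries are
	 * then cut from ->iopoll_list and freed as a batch.
	 */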
	prev = start;
	wq_list_for_each_resume(pos, prev) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);

		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
		if (!smp_load_acquire(&req->iopoll_completed))
			break;
		nr_events++;
		if (unlikely(req->flags & REQ_F_CQE_SKIP))
			continue;

		req->cqe.flags = io_put_kbuf(req, 0);
		__io_fill_cqe_req(req->ctx, req);
	}

	if (unlikely(!nr_events))
		return 0;

	io_commit_cqring(ctx);
	io_cqring_ev_posted_iopoll(ctx);
	pos = start ? start->next : ctx->iopoll_list.first;
	wq_list_cut(&ctx->iopoll_list, prev, start);
	io_free_batch_list(ctx, pos);
	return nr_events;
}

/*
 * We can't just wait for polled events to come to us, we have to actively
 * find and complete them.
 */
static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_IOPOLL))
		return;

	mutex_lock(&ctx->uring_lock);
	while (!wq_list_empty(&ctx->iopoll_list)) {
		/* let it sleep and repeat later if we can't complete a request */
		if (io_do_iopoll(ctx, true) == 0)
			break;
		/*
		 * Ensure we allow local-to-the-cpu processing to take place,
		 * in this case we need to ensure that we reap all events.
		 * Also let task_work, etc. progress by releasing the mutex.
		 */
		if (need_resched()) {
			mutex_unlock(&ctx->uring_lock);
			cond_resched();
			mutex_lock(&ctx->uring_lock);
		}
	}
	mutex_unlock(&ctx->uring_lock);
}

2042
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
2044
	unsigned int nr_events = 0;
2045
	int ret = 0;
2046
	unsigned long check_cq;
2047

2048 2049 2050 2051 2052
	/*
	 * Don't enter poll loop if we already have events pending.
	 * If we do, we can potentially be spinning for commands that
	 * already triggered a CQE (eg in error).
	 */
2053 2054
	check_cq = READ_ONCE(ctx->check_cq);
	if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
2055 2056
		__io_cqring_overflow_flush(ctx, false);
	if (io_cqring_events(ctx))
2057
		return 0;
2058 2059 2060 2061 2062 2063 2064 2065

	/*
	 * Similarly do not spin if we have not informed the user of any
	 * dropped CQE.
	 */
	if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
		return -EBADR;

	do {
2067 2068 2069 2070 2071 2072 2073 2074 2075 2076
		/*
		 * If a submit got punted to a workqueue, we can have the
		 * application entering polling for a command before it gets
		 * issued. That app will hold the uring_lock for the duration
		 * of the poll right here, so we need to take a breather every
		 * now and then to ensure that the issue has a chance to add
		 * the poll to the issued list. Otherwise we can spin here
		 * forever, while the workqueue is stuck trying to acquire the
		 * very same mutex.
		 */
2077
		if (wq_list_empty(&ctx->iopoll_list)) {
2078 2079
			u32 tail = ctx->cached_cq_tail;

2080
			mutex_unlock(&ctx->uring_lock);
2081
			io_run_task_work();
2082
			mutex_lock(&ctx->uring_lock);

2084 2085
			/* some requests don't go through iopoll_list */
			if (tail != ctx->cached_cq_tail ||
2086
			    wq_list_empty(&ctx->iopoll_list))
2087
				break;
2088
		}
2089 2090 2091 2092 2093 2094
		ret = io_do_iopoll(ctx, !min);
		if (ret < 0)
			break;
		nr_events += ret;
		ret = 0;
	} while (nr_events < min && !need_resched());
2095

	return ret;
}

2099
static void kiocb_end_write(struct io_kiocb *req)
{
2101 2102 2103 2104 2105
	/*
	 * Tell lockdep we inherited freeze protection from submission
	 * thread.
	 */
	if (req->flags & REQ_F_ISREG) {
2106
		struct super_block *sb = file_inode(req->file)->i_sb;

2108 2109
		__sb_writers_acquired(sb, SB_FREEZE_WRITE);
		sb_end_write(sb);
	}
}

2113
#ifdef CONFIG_BLOCK
2114
static bool io_resubmit_prep(struct io_kiocb *req)
2115
{
2116
	struct io_async_rw *io = req->async_data;
2117

2118
	if (!req_has_async_data(req))
2119
		return !io_req_prep_async(req);
2120
	iov_iter_restore(&io->s.iter, &io->s.iter_state);
2121
	return true;
2122 2123
}

2124
static bool io_rw_should_reissue(struct io_kiocb *req)
2125
{
2126
	umode_t mode = file_inode(req->file)->i_mode;
2127
	struct io_ring_ctx *ctx = req->ctx;
2128

2129 2130
	if (!S_ISBLK(mode) && !S_ISREG(mode))
		return false;
2131 2132
	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
	    !(ctx->flags & IORING_SETUP_IOPOLL)))
2133
		return false;
2134 2135 2136 2137 2138
	/*
	 * If ref is dying, we might be running poll reap from the exit work.
	 * Don't attempt to reissue from that path, just let it fail with
	 * -EAGAIN.
	 */
2139 2140
	if (percpu_ref_is_dying(&ctx->refs))
		return false;
2141 2142 2143 2144 2145 2146
	/*
	 * Play it safe and assume not safe to re-import and reissue if we're
	 * not in the original thread group (or in task context).
	 */
	if (!same_thread_group(req->task, current) || !in_task())
		return false;
2147 2148
	return true;
}
2149
#else
2150
static bool io_resubmit_prep(struct io_kiocb *req)
2151 2152 2153 2154
{
	return false;
}
static bool io_rw_should_reissue(struct io_kiocb *req)
2155
{
2156 2157
	return false;
}
2158
#endif
2159

2160
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
2161
{
2162 2163 2164
	struct io_rw *rw = io_kiocb_to_cmd(req);

	if (rw->kiocb.ki_flags & IOCB_WRITE) {
2165
		kiocb_end_write(req);
2166 2167 2168 2169
		fsnotify_modify(req->file);
	} else {
		fsnotify_access(req->file);
	}
2170
	if (unlikely(res != req->cqe.res)) {
2171 2172
		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
		    io_rw_should_reissue(req)) {
2173
			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
2174
			return true;
2175
		}
2176
		req_set_fail(req);
2177
		req->cqe.res = res;
2178
	}
2179 2180 2181
	return false;
}

2182
inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
2183
{
2184
	if (*locked) {
2185
		req->cqe.flags |= io_put_kbuf(req, 0);
2186
		req->flags |= REQ_F_COMPLETE_INLINE;
2187
		io_req_add_compl_list(req);
2188
	} else {
2189 2190
		req->cqe.flags |= io_put_kbuf(req, IO_URING_F_UNLOCKED);
		io_req_complete_post(req);
2191
	}
2192 2193
}

2194
static void __io_complete_rw(struct io_kiocb *req, long res,
2195 2196 2197 2198
			     unsigned int issue_flags)
{
	if (__io_complete_rw_common(req, res))
		return;
2199 2200
	io_req_set_res(req, req->cqe.res, io_put_kbuf(req, issue_flags));
	__io_req_complete(req, issue_flags);
2201 2202
}

2203
static void io_complete_rw(struct kiocb *kiocb, long res)
2204
{
2205 2206
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);
2207

2208 2209
	if (__io_complete_rw_common(req, res))
		return;
2210
	io_req_set_res(req, res, 0);
2211
	req->io_task_work.func = io_req_task_complete;
2212
	io_req_task_prio_work_add(req);
2213 2214
}

2215
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
2216
{
2217 2218
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);
2219

2220 2221
	if (kiocb->ki_flags & IOCB_WRITE)
		kiocb_end_write(req);
2222
	if (unlikely(res != req->cqe.res)) {
2223
		if (res == -EAGAIN && io_rw_should_reissue(req)) {
2224
			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
2225
			return;
2226
		}
2227
		req->cqe.res = res;
2228
	}
2229

2230 2231
	/* order with io_iopoll_complete() checking ->iopoll_completed */
	smp_store_release(&req->iopoll_completed, 1);
2232 2233 2234 2235 2236
}

/*
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from an io_do_iopoll() thread before the issuer is done
 * accessing the kiocb cookie.
 */
2240
static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
2241 2242
{
	struct io_ring_ctx *ctx = req->ctx;
2243
	const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
2244 2245

	/* workqueue context doesn't hold uring_lock, grab it now */
2246
	if (unlikely(needs_lock))
2247
		mutex_lock(&ctx->uring_lock);
2248 2249 2250 2251 2252 2253

	/*
	 * Track whether we have multiple files in our lists. This will impact
	 * how we do polling eventually, not spinning if we're on potentially
	 * different devices.
	 */
2254
	if (wq_list_empty(&ctx->iopoll_list)) {
2255 2256
		ctx->poll_multi_queue = false;
	} else if (!ctx->poll_multi_queue) {
2257 2258
		struct io_kiocb *list_req;

2259 2260
		list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
					comp_list);
2261
		if (list_req->file != req->file)
2262
			ctx->poll_multi_queue = true;
2263 2264 2265 2266 2267 2268
	}

	/*
	 * For fast devices, IO may have already completed. If it has, add
	 * it to the front so we find it first.
	 */
2269
	if (READ_ONCE(req->iopoll_completed))
2270
		wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
2271
	else
2272
		wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
2273

2274
	if (unlikely(needs_lock)) {
		/*
		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
		 * in sq thread task context or in io worker task context. If
		 * current task context is sq thread, we don't need to check
		 * whether we should wake up the sq thread.
		 */
		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
		    wq_has_sleeper(&ctx->sq_data->wait))
			wake_up(&ctx->sq_data->wait);

		mutex_unlock(&ctx->uring_lock);
	}
2287 2288
}

2289 2290
static bool io_bdev_nowait(struct block_device *bdev)
{
2291
	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2292 2293
}

2294 2295 2296 2297 2298
/*
 * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
 * inline.
 */
2299
static bool __io_file_supports_nowait(struct file *file, umode_t mode)
2300
{
2301
	if (S_ISBLK(mode)) {
2302 2303
		if (IS_ENABLED(CONFIG_BLOCK) &&
		    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
2304 2305 2306
			return true;
		return false;
	}
2307
	if (S_ISSOCK(mode))
2308
		return true;
2309
	if (S_ISREG(mode)) {
2310 2311
		if (IS_ENABLED(CONFIG_BLOCK) &&
		    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2312
		    !io_is_uring_fops(file))
2313 2314 2315
			return true;
		return false;
	}
2316

2317 2318 2319
	/* any ->read/write should understand O_NONBLOCK */
	if (file->f_flags & O_NONBLOCK)
		return true;
2320
	return file->f_mode & FMODE_NOWAIT;
2321
}
2322

2323 2324 2325 2326 2327 2328 2329 2330 2331
/*
 * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
 * inline.
 */
static unsigned int io_file_get_flags(struct file *file)
{
	umode_t mode = file_inode(file)->i_mode;
	unsigned int res = 0;
2332

2333 2334 2335 2336
	if (S_ISREG(mode))
		res |= FFS_ISREG;
	if (__io_file_supports_nowait(file, mode))
		res |= FFS_NOWAIT;
2337 2338
	if (io_file_need_scm(file))
		res |= FFS_SCM;
2339
	return res;
2340 2341
}

2342
static inline bool io_file_supports_nowait(struct io_kiocb *req)
2343
{
2344
	return req->flags & REQ_F_SUPPORT_NOWAIT;
2345 2346
}

2347
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2348
{
2349
	struct io_rw *rw = io_kiocb_to_cmd(req);
2350 2351
	unsigned ioprio;
	int ret;
2352

	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
	/* used for fixed read/write too - just read unconditionally */
	req->buf_index = READ_ONCE(sqe->buf_index);

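	/*
	 * For fixed buffers, resolve the registered buffer and take a rsrc
	 * node reference already at prep time.
	 */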
	if (req->opcode == IORING_OP_READ_FIXED ||
	    req->opcode == IORING_OP_WRITE_FIXED) {
		struct io_ring_ctx *ctx = req->ctx;
		u16 index;

		if (unlikely(req->buf_index >= ctx->nr_user_bufs))
			return -EFAULT;
		index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
		req->imu = ctx->user_bufs[index];
		io_req_set_rsrc_node(req, ctx, 0);
	}

2369 2370 2371 2372 2373 2374
	ioprio = READ_ONCE(sqe->ioprio);
	if (ioprio) {
		ret = ioprio_check_cap(ioprio);
		if (ret)
			return ret;

2375
		rw->kiocb.ki_ioprio = ioprio;
2376
	} else {
2377
		rw->kiocb.ki_ioprio = get_current_ioprio();
2378 2379
	}

2380 2381 2382
	rw->addr = READ_ONCE(sqe->addr);
	rw->len = READ_ONCE(sqe->len);
	rw->flags = READ_ONCE(sqe->rw_flags);
2383 2384 2385
	return 0;
}

2386 2387 2388 2389 2390 2391 2392
static void io_readv_writev_cleanup(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;

	kfree(io->free_iovec);
}

2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407
static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
{
	switch (ret) {
	case -EIOCBQUEUED:
		break;
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
	case -ERESTART_RESTARTBLOCK:
		/*
		 * We can't just restart the syscall, since previously
		 * submitted sqes may already be in progress. Just fail this
		 * IO with EINTR.
		 */
		ret = -EINTR;
2408
		fallthrough;
2409
	default:
2410
		kiocb->ki_complete(kiocb, ret);
2411 2412 2413
	}
}

2414
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
2415
{
2416
	struct io_rw *rw = io_kiocb_to_cmd(req);
2417

2418 2419
	if (rw->kiocb.ki_pos != -1)
		return &rw->kiocb.ki_pos;
2420 2421 2422

	if (!(req->file->f_mode & FMODE_STREAM)) {
		req->flags |= REQ_F_CUR_POS;
2423 2424
		rw->kiocb.ki_pos = req->file->f_pos;
		return &rw->kiocb.ki_pos;
2425
	}
2426

2427
	rw->kiocb.ki_pos = 0;
2428
	return NULL;
2429 2430
}

2431
static void kiocb_done(struct io_kiocb *req, ssize_t ret,
2432
		       unsigned int issue_flags)
2433
{
2434
	struct io_async_rw *io = req->async_data;
2435
	struct io_rw *rw = io_kiocb_to_cmd(req);
2436

2437
	/* add previously done IO, if any */
2438
	if (req_has_async_data(req) && io->bytes_done > 0) {
2439
		if (ret < 0)
2440
			ret = io->bytes_done;
2441
		else
2442
			ret += io->bytes_done;
2443 2444
	}

2445
	if (req->flags & REQ_F_CUR_POS)
2446 2447
		req->file->f_pos = rw->kiocb.ki_pos;
	if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw))
2448
		__io_complete_rw(req, ret, issue_flags);
2449
	else
2450
		io_rw_done(&rw->kiocb, ret);
2451

2452
	if (req->flags & REQ_F_REISSUE) {
2453
		req->flags &= ~REQ_F_REISSUE;
2454
		if (io_resubmit_prep(req))
2455
			io_req_task_queue_reissue(req);
2456 2457
		else
			io_req_task_queue_fail(req, ret);
2458
	}
2459 2460
}

2461 2462
static int __io_import_fixed(struct io_kiocb *req, int ddir,
			     struct iov_iter *iter, struct io_mapped_ubuf *imu)
2463
{
2464 2465 2466
	struct io_rw *rw = io_kiocb_to_cmd(req);
	size_t len = rw->len;
	u64 buf_end, buf_addr = rw->addr;
2467 2468
	size_t offset;

2469
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
2470 2471
		return -EFAULT;
	/* not inside the mapped region */
2472
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
2473 2474 2475 2476 2477 2478 2479
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
2480
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
2512
			iter->count -= bvec->bv_len + offset;
2513 2514 2515 2516
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

2517
	return 0;
2518 2519
}

2520 2521
static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
			   unsigned int issue_flags)
2522
{
2523 2524 2525
	if (WARN_ON_ONCE(!req->imu))
		return -EFAULT;
	return __io_import_fixed(req, rw, iter, req->imu);
2526 2527
}

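/*
 * Buffer groups with a low bgid live in the statically sized ctx->io_bl
 * array; higher group IDs are tracked in the ctx->io_bl_xa xarray.
 */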
static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
2530
{
2531
	bl->bgid = bgid;
2532 2533 2534 2535
	if (bgid < BGID_ARRAY)
		return 0;

	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
2536 2537
}

2538
static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
2539
					      struct io_buffer_list *bl)
2540
{
2541 2542
	if (!list_empty(&bl->buf_list)) {
		struct io_buffer *kbuf;
2543

2544 2545
		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&kbuf->list);
2546 2547
		if (*len > kbuf->len)
			*len = kbuf->len;
2548 2549
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
2550
		req->buf_index = kbuf->bid;
2551
		return u64_to_user_ptr(kbuf->addr);
2552
	}
2553
	return NULL;
2554 2555
}

2556 2557 2558 2559 2560 2561
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					  struct io_buffer_list *bl,
					  unsigned int issue_flags)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	struct io_uring_buf *buf;
2562
	__u16 head = bl->head;
2563

2564
	if (unlikely(smp_load_acquire(&br->tail) == head))
2565
		return NULL;

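	/*
	 * The ring size is a power of two, so masking the head yields the
	 * ring index. Entries beyond the first page of the mapping are
	 * addressed by page index plus the offset within that page.
	 */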
	head &= bl->mask;
	if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
		buf = &br->bufs[head];
2570
	} else {
2571
		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
2572
		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
2573 2574
		buf = page_address(bl->buf_pages[index]);
		buf += off;
2575
	}
2576 2577 2578 2579 2580
	if (*len > buf->len)
		*len = buf->len;
	req->flags |= REQ_F_BUFFER_RING;
	req->buf_list = bl;
	req->buf_index = buf->bid;
2581

2582
	if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594
		/*
		 * If we came in unlocked, we have no choice but to consume the
		 * buffer here. This does mean it'll be pinned until the IO
		 * completes. But coming in unlocked means we're in io-wq
		 * context, hence there should be no further retry. For the
		 * locked case, the caller must ensure to call the commit when
		 * the transfer completes (or if we get -EAGAIN and must poll
		 * or retry).
		 */
		req->buf_list = NULL;
		bl->head++;
	}
2595
	return u64_to_user_ptr(buf->addr);
2596 2597
}

2598 2599
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
			      unsigned int issue_flags)
2600
{
2601 2602
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
2603
	void __user *ret = NULL;
2604

2605
	io_ring_submit_lock(req->ctx, issue_flags);
2606

2607
	bl = io_buffer_get_list(ctx, req->buf_index);
2608 2609 2610 2611 2612
	if (likely(bl)) {
		if (bl->buf_nr_pages)
			ret = io_ring_buffer_select(req, len, bl, issue_flags);
		else
			ret = io_provided_buffer_select(req, len, bl);
2613
	}
2614 2615
	io_ring_submit_unlock(req->ctx, issue_flags);
	return ret;
2616 2617 2618 2619
}

#ifdef CONFIG_COMPAT
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
2620
				unsigned int issue_flags)
2621
{
2622
	struct io_rw *rw = io_kiocb_to_cmd(req);
2623 2624 2625
	struct compat_iovec __user *uiov;
	compat_ssize_t clen;
	void __user *buf;
2626
	size_t len;
2627

2628
	uiov = u64_to_user_ptr(rw->addr);
2629 2630 2631 2632 2633 2634 2635 2636
	if (!access_ok(uiov, sizeof(*uiov)))
		return -EFAULT;
	if (__get_user(clen, &uiov->iov_len))
		return -EFAULT;
	if (clen < 0)
		return -EINVAL;

	len = clen;
2637
	buf = io_buffer_select(req, &len, issue_flags);
2638 2639
	if (!buf)
		return -ENOBUFS;
2640
	rw->addr = (unsigned long) buf;
2641
	iov[0].iov_base = buf;
2642
	rw->len = iov[0].iov_len = (compat_size_t) len;
2643 2644 2645 2646 2647
	return 0;
}
#endif

static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2648
				      unsigned int issue_flags)
2649
{
2650 2651
	struct io_rw *rw = io_kiocb_to_cmd(req);
	struct iovec __user *uiov = u64_to_user_ptr(rw->addr);
2652 2653 2654 2655 2656 2657 2658 2659 2660
	void __user *buf;
	ssize_t len;

	if (copy_from_user(iov, uiov, sizeof(*uiov)))
		return -EFAULT;

	len = iov[0].iov_len;
	if (len < 0)
		return -EINVAL;
2661
	buf = io_buffer_select(req, &len, issue_flags);
2662 2663
	if (!buf)
		return -ENOBUFS;
2664
	rw->addr = (unsigned long) buf;
2665
	iov[0].iov_base = buf;
2666
	rw->len = iov[0].iov_len = len;
2667 2668 2669 2670
	return 0;
}

static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2671
				    unsigned int issue_flags)
2672
{
2673 2674
	struct io_rw *rw = io_kiocb_to_cmd(req);

2675
	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
2676 2677
		iov[0].iov_base = u64_to_user_ptr(rw->addr);
		iov[0].iov_len = rw->len;
2678
		return 0;
2679
	}
2680
	if (rw->len != 1)
2681 2682 2683 2684
		return -EINVAL;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
2685
		return io_compat_import(req, iov, issue_flags);
2686 2687
#endif

2688
	return __io_iov_buffer_select(req, iov, issue_flags);
2689 2690
}

2691
static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
2692 2693
				       struct io_rw_state *s,
				       unsigned int issue_flags)
2694
{
2695
	struct io_rw *rw = io_kiocb_to_cmd(req);
2696
	struct iov_iter *iter = &s->iter;
2697
	u8 opcode = req->opcode;
2698
	struct iovec *iovec;
2699 2700
	void __user *buf;
	size_t sqe_len;
2701
	ssize_t ret;
2702

2703
	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
2704
		ret = io_import_fixed(req, ddir, iter, issue_flags);
2705 2706 2707 2708
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}
2709

2710 2711
	buf = u64_to_user_ptr(rw->addr);
	sqe_len = rw->len;
2712

2713
	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2714
		if (io_do_buffer_select(req)) {
2715
			buf = io_buffer_select(req, &sqe_len, issue_flags);
2716 2717
			if (!buf)
				return ERR_PTR(-ENOBUFS);
2718 2719
			rw->addr = (unsigned long) buf;
			rw->len = sqe_len;
2720 2721
		}

2722
		ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter);
2723 2724 2725
		if (ret)
			return ERR_PTR(ret);
		return NULL;
2726 2727
	}

2728
	iovec = s->fast_iov;
2729
	if (req->flags & REQ_F_BUFFER_SELECT) {
2730
		ret = io_iov_buffer_select(req, iovec, issue_flags);
2731 2732
		if (ret)
			return ERR_PTR(ret);
2733
		iov_iter_init(iter, ddir, iovec, 1, iovec->iov_len);
2734
		return NULL;
2735 2736
	}

2737
	ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
2738
			      req->ctx->compat);
2739 2740 2741
	if (unlikely(ret < 0))
		return ERR_PTR(ret);
	return iovec;
2742 2743
}

2744 2745 2746 2747
static inline int io_import_iovec(int rw, struct io_kiocb *req,
				  struct iovec **iovec, struct io_rw_state *s,
				  unsigned int issue_flags)
{
2748 2749 2750
	*iovec = __io_import_iovec(rw, req, s, issue_flags);
	if (unlikely(IS_ERR(*iovec)))
		return PTR_ERR(*iovec);
2751 2752

	iov_iter_save_state(&s->iter, &s->iter_state);
2753
	return 0;
2754 2755
}

2756 2757
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
2758
	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
2759 2760
}

2761
/*
2762 2763
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
2764
 */
2765
static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
2766
{
2767 2768
	struct kiocb *kiocb = &rw->kiocb;
	struct file *file = kiocb->ki_filp;
2769
	ssize_t ret = 0;
2770
	loff_t *ppos;
2771 2772 2773 2774 2775 2776 2777 2778

	/*
	 * Don't support polled IO through this interface, and we can't
	 * support non-blocking either. For the latter, this just causes
	 * the kiocb to be handled from an async context.
	 */
	if (kiocb->ki_flags & IOCB_HIPRI)
		return -EOPNOTSUPP;
2779 2780
	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
2781 2782
		return -EAGAIN;

2783 2784
	ppos = io_kiocb_ppos(kiocb);

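	/*
	 * Issue one segment at a time through the regular ->read()/->write()
	 * file methods, advancing the iterator manually until it is drained
	 * or we hit a short or failed transfer.
	 */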
	while (iov_iter_count(iter)) {
2786
		struct iovec iovec;
2787 2788
		ssize_t nr;

2789 2790 2791
		if (!iov_iter_is_bvec(iter)) {
			iovec = iov_iter_iovec(iter);
		} else {
2792 2793
			iovec.iov_base = u64_to_user_ptr(rw->addr);
			iovec.iov_len = rw->len;
2794 2795
		}

2796
		if (ddir == READ) {
2797
			nr = file->f_op->read(file, iovec.iov_base,
2798
					      iovec.iov_len, ppos);
2799 2800
		} else {
			nr = file->f_op->write(file, iovec.iov_base,
2801
					       iovec.iov_len, ppos);
2802 2803 2804 2805 2806 2807 2808
		}

		if (nr < 0) {
			if (!ret)
				ret = nr;
			break;
		}
2809
		ret += nr;
2810 2811 2812
		if (!iov_iter_is_bvec(iter)) {
			iov_iter_advance(iter, nr);
		} else {
2813 2814 2815
			rw->addr += nr;
			rw->len -= nr;
			if (!rw->len)
2816
				break;
2817
		}
2818 2819 2820 2821 2822 2823 2824
		if (nr != iovec.iov_len)
			break;
	}

	return ret;
}

2825 2826
static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
			  const struct iovec *fast_iov, struct iov_iter *iter)
2827
{
2828
	struct io_async_rw *io = req->async_data;
2829

2830 2831 2832
	memcpy(&io->s.iter, iter, sizeof(*iter));
	io->free_iovec = iovec;
	io->bytes_done = 0;
2833
	/* can only be fixed buffers, no need to do anything */
2834
	if (iov_iter_is_bvec(iter))
2835
		return;
2836
	if (!iovec) {
2837 2838
		unsigned iov_off = 0;

2839
		io->s.iter.iov = io->s.fast_iov;
2840 2841
		if (iter->iov != fast_iov) {
			iov_off = iter->iov - fast_iov;
2842
			io->s.iter.iov += iov_off;
2843
		}
2844 2845
		if (io->s.fast_iov != fast_iov)
			memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off,
2846
			       sizeof(struct iovec) * iter->nr_segs);
2847 2848
	} else {
		req->flags |= REQ_F_NEED_CLEANUP;
2849 2850 2851
	}
}

2852
bool io_alloc_async_data(struct io_kiocb *req)
2853
{
2854 2855
	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
2856 2857 2858 2859 2860
	if (req->async_data) {
		req->flags |= REQ_F_ASYNC_DATA;
		return false;
	}
	return true;
2861 2862
}

2863
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
2864
			     struct io_rw_state *s, bool force)
2865
{
2866
	if (!force && !io_op_defs[req->opcode].prep_async)
2867
		return 0;
2868
	if (!req_has_async_data(req)) {
2869 2870
		struct io_async_rw *iorw;

2871
		if (io_alloc_async_data(req)) {
2872
			kfree(iovec);
2873
			return -ENOMEM;
2874
		}
2875

2876
		io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
2877 2878
		iorw = req->async_data;
		/* we've copied and mapped the iter, ensure state is saved */
2879
		iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
2880
	}
2881
	return 0;
2882 2883
}

2884
static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
2885
{
2886
	struct io_async_rw *iorw = req->async_data;
2887
	struct iovec *iov;
2888
	int ret;
2889

2890
	/* submission path, ->uring_lock should already be taken */
2891
	ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
2892 2893 2894
	if (unlikely(ret < 0))
		return ret;

2895 2896 2897 2898
	iorw->bytes_done = 0;
	iorw->free_iovec = iov;
	if (iov)
		req->flags |= REQ_F_NEED_CLEANUP;
2899 2900 2901
	return 0;
}

2902 2903 2904 2905 2906 2907 2908 2909 2910 2911
static int io_readv_prep_async(struct io_kiocb *req)
{
	return io_rw_prep_async(req, READ);
}

static int io_writev_prep_async(struct io_kiocb *req)
{
	return io_rw_prep_async(req, WRITE);
}

/*
 * This is our waitqueue callback handler, registered through __folio_lock_async()
 * when we initially tried to do the IO with the iocb and armed our waitqueue.
 * This gets called when the page is unlocked, and we generally expect that to
 * happen when the page IO is completed and the page is now uptodate. This will
 * queue a task_work based retry of the operation, attempting to copy the data
 * again. If the latter fails because the page was NOT uptodate, then we will
 * do a thread based blocking retry of the operation. That's the unexpected
 * slow path.
 */
2922 2923 2924 2925 2926
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
			     int sync, void *arg)
{
	struct wait_page_queue *wpq;
	struct io_kiocb *req = wait->private;
2927
	struct io_rw *rw = io_kiocb_to_cmd(req);
2928 2929 2930 2931
	struct wait_page_key *key = arg;

	wpq = container_of(wait, struct wait_page_queue, wait);

2932 2933 2934
	if (!wake_page_match(wpq, key))
		return 0;

2935
	rw->kiocb.ki_flags &= ~IOCB_WAITQ;
2936
	list_del_init(&wait->entry);
2937
	io_req_task_queue(req);
2938 2939 2940
	return 1;
}

2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952
/*
 * This controls whether a given IO request should be armed for async page
 * based retry. If we return false here, the request is handed to the async
 * worker threads for retry. If we're doing buffered reads on a regular file,
 * we prepare a private wait_page_queue entry and retry the operation. This
 * will either succeed because the page is now uptodate and unlocked, or it
 * will register a callback when the page is unlocked at IO completion. Through
 * that callback, io_uring uses task_work to setup a retry of the operation.
 * That retry will attempt the buffered read again. The retry will generally
 * succeed, or in rare cases where it fails, we then fall back to using the
 * async worker threads for a blocking retry.
 */
2953
static bool io_rw_should_retry(struct io_kiocb *req)
2954
{
2955 2956 2957 2958
	struct io_async_rw *io = req->async_data;
	struct wait_page_queue *wait = &io->wpq;
	struct io_rw *rw = io_kiocb_to_cmd(req);
	struct kiocb *kiocb = &rw->kiocb;
2959

2960 2961 2962
	/* never retry for NOWAIT, we just complete with -EAGAIN */
	if (req->flags & REQ_F_NOWAIT)
		return false;
2963

2964
	/* Only for buffered IO */
2965
	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
2966
		return false;
2967

2968 2969 2970 2971 2972 2973
	/*
	 * just use poll if we can, and don't attempt if the fs doesn't
	 * support callback based unlocks
	 */
	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
		return false;
2974

2975 2976 2977 2978 2979
	wait->wait.func = io_async_buf_func;
	wait->wait.private = req;
	wait->wait.flags = 0;
	INIT_LIST_HEAD(&wait->wait.entry);
	kiocb->ki_flags |= IOCB_WAITQ;
2980
	kiocb->ki_flags &= ~IOCB_NOWAIT;
2981 2982
	kiocb->ki_waitq = wait;
	return true;
2983 2984
}

2985
static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter)
2986
{
2987 2988 2989 2990 2991 2992
	struct file *file = rw->kiocb.ki_filp;

	if (likely(file->f_op->read_iter))
		return call_read_iter(file, &rw->kiocb, iter);
	else if (file->f_op->read)
		return loop_rw_iter(READ, rw, iter);
2993 2994
	else
		return -EINVAL;
2995 2996
}

2997 2998 2999 3000 3001 3002
static bool need_read_all(struct io_kiocb *req)
{
	return req->flags & REQ_F_ISREG ||
		S_ISBLK(file_inode(req->file)->i_mode);
}

3003 3004
static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
{
3005 3006
	struct io_rw *rw = io_kiocb_to_cmd(req);
	struct kiocb *kiocb = &rw->kiocb;
3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = req->file;
	int ret;

	if (unlikely(!file || !(file->f_mode & mode)))
		return -EBADF;

	if (!io_req_ffs_set(req))
		req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;

	kiocb->ki_flags = iocb_flags(file);
3018
	ret = kiocb_set_rw_flags(kiocb, rw->flags);
3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034
	if (unlikely(ret))
		return ret;

	/*
	 * If the file is marked O_NONBLOCK, still allow retry for it if it
	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
	 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
	 */
	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
		req->flags |= REQ_F_NOWAIT;

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
			return -EOPNOTSUPP;

3035
		kiocb->private = NULL;
3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047
		kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
		kiocb->ki_complete = io_complete_rw_iopoll;
		req->iopoll_completed = 0;
	} else {
		if (kiocb->ki_flags & IOCB_HIPRI)
			return -EINVAL;
		kiocb->ki_complete = io_complete_rw;
	}

	return 0;
}

3048
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
3049
{
3050
	struct io_rw *rw = io_kiocb_to_cmd(req);
3051
	struct io_rw_state __s, *s = &__s;
3052
	struct iovec *iovec;
3053
	struct kiocb *kiocb = &rw->kiocb;
3054
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3055
	struct io_async_rw *io;
3056
	ssize_t ret, ret2;
3057
	loff_t *ppos;
3058

3059 3060 3061 3062 3063
	if (!req_has_async_data(req)) {
		ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	} else {
3064 3065
		io = req->async_data;
		s = &io->s;
3066

3067 3068 3069 3070
		/*
		 * Safe and required to re-import if we're using provided
		 * buffers, as we dropped the selected one before retry.
		 */
3071
		if (io_do_buffer_select(req)) {
3072 3073 3074 3075 3076
			ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
			if (unlikely(ret < 0))
				return ret;
		}

3077 3078 3079 3080 3081
		/*
		 * We come here from an earlier attempt, restore our state to
		 * match in case it doesn't. It's cheap enough that we don't
		 * need to make this conditional.
		 */
3082
		iov_iter_restore(&s->iter, &s->iter_state);
3083 3084
		iovec = NULL;
	}
3085
	ret = io_rw_init_file(req, FMODE_READ);
3086 3087
	if (unlikely(ret)) {
		kfree(iovec);
3088
		return ret;
3089
	}
3090
	req->cqe.res = iov_iter_count(&s->iter);
3091

3092 3093
	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
3094
		if (unlikely(!io_file_supports_nowait(req))) {
3095 3096 3097
			ret = io_setup_async_rw(req, iovec, s, true);
			return ret ?: -EAGAIN;
		}
3098
		kiocb->ki_flags |= IOCB_NOWAIT;
3099 3100 3101
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
3102
	}
3103

3104
	ppos = io_kiocb_update_pos(req);
3105

3106
	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
3107 3108 3109 3110
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}
3111

3112
	ret = io_iter_do_read(rw, &s->iter);
3113

3114
	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
3115
		req->flags &= ~REQ_F_REISSUE;
3116 3117 3118
		/* if we can poll, just do that */
		if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
			return -EAGAIN;
3119 3120
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3121
			goto done;
3122 3123
		/* no retry on NONBLOCK nor RWF_NOWAIT */
		if (req->flags & REQ_F_NOWAIT)
3124
			goto done;
3125
		ret = 0;
3126 3127
	} else if (ret == -EIOCBQUEUED) {
		goto out_free;
3128
	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
3129
		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
3130
		/* read all, failed, already did sync or don't want to retry */
3131
		goto done;
3132 3133
	}

3134 3135 3136 3137 3138
	/*
	 * Don't depend on the iter state matching what was consumed, or being
	 * untouched in case of error. Restore it and we'll advance it
	 * manually if we need to.
	 */
3139
	iov_iter_restore(&s->iter, &s->iter_state);
3140

3141
	ret2 = io_setup_async_rw(req, iovec, s, true);
3142 3143 3144
	if (ret2)
		return ret2;

3145
	iovec = NULL;
3146 3147
	io = req->async_data;
	s = &io->s;
3148 3149 3150 3151
	/*
	 * Now use our persistent iterator and state, if we aren't already.
	 * We've restored and mapped the iter to match.
	 */
3152

3153
	do {
3154 3155 3156 3157 3158
		/*
		 * We end up here because of a partial read, either from
		 * above or inside this loop. Advance the iter by the bytes
		 * that were consumed.
		 */
3159 3160
		iov_iter_advance(&s->iter, ret);
		if (!iov_iter_count(&s->iter))
3161
			break;
3162
		io->bytes_done += ret;
3163
		iov_iter_save_state(&s->iter, &s->iter_state);
3164

3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176
		/* if we can retry, do so with the callbacks armed */
		if (!io_rw_should_retry(req)) {
			kiocb->ki_flags &= ~IOCB_WAITQ;
			return -EAGAIN;
		}

		/*
		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
		 * we get -EIOCBQUEUED, then we'll get a notification when the
		 * desired page gets unlocked. We can also get a partial read
		 * here, and if we do, then just retry at the new offset.
		 */
3177
		ret = io_iter_do_read(rw, &s->iter);
3178
		if (ret == -EIOCBQUEUED)
3179
			return IOU_ISSUE_SKIP_COMPLETE;
3180
		/* we got some bytes, but not all. retry. */
3181
		kiocb->ki_flags &= ~IOCB_WAITQ;
3182
		iov_iter_restore(&s->iter, &s->iter_state);
3183
	} while (ret > 0);
3184
done:
3185
	kiocb_done(req, ret, issue_flags);
3186 3187 3188 3189
out_free:
	/* it's faster to check here than to delegate to kfree */
	if (iovec)
		kfree(iovec);
3190
	return IOU_ISSUE_SKIP_COMPLETE;
3191 3192
}

3193
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
3194
{
3195
	struct io_rw *rw = io_kiocb_to_cmd(req);
3196
	struct io_rw_state __s, *s = &__s;
3197
	struct iovec *iovec;
3198
	struct kiocb *kiocb = &rw->kiocb;
3199
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3200
	ssize_t ret, ret2;
3201
	loff_t *ppos;
3202

3203
	if (!req_has_async_data(req)) {
3204 3205
		ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
3206
			return ret;
3207
	} else {
3208
		struct io_async_rw *io = req->async_data;
3209

3210
		s = &io->s;
3211
		iov_iter_restore(&s->iter, &s->iter_state);
3212 3213
		iovec = NULL;
	}
3214
	ret = io_rw_init_file(req, FMODE_WRITE);
3215 3216
	if (unlikely(ret)) {
		kfree(iovec);
3217
		return ret;
3218
	}
3219
	req->cqe.res = iov_iter_count(&s->iter);
3220

3221 3222
	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
3223
		if (unlikely(!io_file_supports_nowait(req)))
3224
			goto copy_iov;
3225

3226 3227 3228 3229
		/* file path doesn't support NOWAIT for non-direct_IO */
		if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
		    (req->flags & REQ_F_ISREG))
			goto copy_iov;
3230

3231 3232 3233 3234 3235
		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}
3236

3237
	ppos = io_kiocb_update_pos(req);
3238

3239
	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
3240 3241
	if (unlikely(ret))
		goto out_free;
3242

3243 3244 3245 3246 3247 3248 3249 3250
	/*
	 * Open-code file_start_write here to grab freeze protection,
	 * which will be released by another thread in
	 * io_complete_rw().  Fool lockdep by telling it the lock got
	 * released so that it doesn't complain about the held lock when
	 * we return to userspace.
	 */
	if (req->flags & REQ_F_ISREG) {
3251
		sb_start_write(file_inode(req->file)->i_sb);
3252 3253 3254 3255
		__sb_writers_release(file_inode(req->file)->i_sb,
					SB_FREEZE_WRITE);
	}
	kiocb->ki_flags |= IOCB_WRITE;
3256

3257
	if (likely(req->file->f_op->write_iter))
3258
		ret2 = call_write_iter(req->file, kiocb, &s->iter);
3259
	else if (req->file->f_op->write)
3260
		ret2 = loop_rw_iter(WRITE, rw, &s->iter);
3261 3262
	else
		ret2 = -EINVAL;
3263

3264 3265
	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
3266
		ret2 = -EAGAIN;
3267
	}
3268

3269 3270 3271 3272 3273 3274
	/*
	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
	 * retry them without IOCB_NOWAIT.
	 */
	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
		ret2 = -EAGAIN;
3275 3276
	/* no retry on NONBLOCK nor RWF_NOWAIT */
	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
3277
		goto done;
3278
	if (!force_nonblock || ret2 != -EAGAIN) {
3279
		/* IOPOLL retry should happen for io-wq threads */
3280
		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
3281
			goto copy_iov;
3282
done:
3283
		kiocb_done(req, ret2, issue_flags);
3284
		ret = IOU_ISSUE_SKIP_COMPLETE;
3285
	} else {
3286
copy_iov:
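		/*
		 * Couldn't finish the write without blocking: stash the iovec
		 * and iterator state in async data so a later retry (e.g. via
		 * io-wq) resumes from the same state.
		 */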
		iov_iter_restore(&s->iter, &s->iter_state);
		ret = io_setup_async_rw(req, iovec, s, false);
3289
		return ret ?: -EAGAIN;
3290
	}
3291
out_free:
3292
	/* it's reportedly faster than delegating the null check to kfree() */
3293
	if (iovec)
3294
		kfree(iovec);
3295 3296 3297
	return ret;
}

3298 3299 3300 3301
/*
 * Note when io_fixed_fd_install() returns error value, it will ensure
 * fput() is called correspondingly.
 */
3302 3303
int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
			struct file *file, unsigned int file_slot)
3304 3305 3306 3307 3308
{
	bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC;
	struct io_ring_ctx *ctx = req->ctx;
	int ret;

3309 3310
	io_ring_submit_lock(ctx, issue_flags);

3311 3312
	if (alloc_slot) {
		ret = io_file_bitmap_get(ctx);
3313 3314
		if (unlikely(ret < 0))
			goto err;
3315 3316 3317 3318 3319 3320
		file_slot = ret;
	} else {
		file_slot--;
	}

	ret = io_install_fixed_file(req, file, issue_flags, file_slot);
3321 3322 3323 3324 3325 3326
	if (!ret && alloc_slot)
		ret = file_slot;
err:
	io_ring_submit_unlock(ctx, issue_flags);
	if (unlikely(ret < 0))
		fput(file);
3327 3328 3329
	return ret;
}

3330 3331 3332
static int io_remove_buffers_prep(struct io_kiocb *req,
				  const struct io_uring_sqe *sqe)
{
3333
	struct io_provide_buf *p = io_kiocb_to_cmd(req);
3334 3335
	u64 tmp;

3336
	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
3337
	    sqe->splice_fd_in)
3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}

3350 3351
static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
3352 3353 3354 3355 3356 3357 3358
{
	unsigned i = 0;

	/* shouldn't happen */
	if (!nbufs)
		return 0;

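	/*
	 * A mapped buffer ring is torn down by unpinning its pages; classic
	 * provided buffers are instead walked and freed one at a time below.
	 */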
	if (bl->buf_nr_pages) {
		int j;

		i = bl->buf_ring->tail - bl->head;
		for (j = 0; j < bl->buf_nr_pages; j++)
			unpin_user_page(bl->buf_pages[j]);
		kvfree(bl->buf_pages);
		bl->buf_pages = NULL;
		bl->buf_nr_pages = 0;
3368 3369
		/* make sure it's seen as empty */
		INIT_LIST_HEAD(&bl->buf_list);
3370 3371 3372
		return i;
	}

3373
	/* the head kbuf is the list itself */
3374
	while (!list_empty(&bl->buf_list)) {
3375 3376
		struct io_buffer *nxt;

3377
		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
3378 3379 3380
		list_del(&nxt->list);
		if (++i == nbufs)
			return i;
3381
		cond_resched();
3382 3383 3384 3385 3386 3387
	}
	i++;

	return i;
}

3388
static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
3389
{
3390
	struct io_provide_buf *p = io_kiocb_to_cmd(req);
3391
	struct io_ring_ctx *ctx = req->ctx;
3392
	struct io_buffer_list *bl;
3393 3394
	int ret = 0;

3395
	io_ring_submit_lock(ctx, issue_flags);
3396 3397

	ret = -ENOENT;
3398
	bl = io_buffer_get_list(ctx, p->bgid);
3399 3400 3401 3402 3403 3404
	if (bl) {
		ret = -EINVAL;
		/* can't use provide/remove buffers command on mapped buffers */
		if (!bl->buf_nr_pages)
			ret = __io_remove_buffers(ctx, bl, p->nbufs);
	}
3405
	if (ret < 0)
3406
		req_set_fail(req);
3407

3408
	/* complete before unlock, IOPOLL may need the lock */
3409 3410
	io_req_set_res(req, ret, 0);
	__io_req_complete(req, issue_flags);
3411
	io_ring_submit_unlock(ctx, issue_flags);
3412
	return IOU_ISSUE_SKIP_COMPLETE;
3413 3414
}

3415 3416 3417
static int io_provide_buffers_prep(struct io_kiocb *req,
				   const struct io_uring_sqe *sqe)
{
3418
	unsigned long size, tmp_check;
3419
	struct io_provide_buf *p = io_kiocb_to_cmd(req);
3420 3421
	u64 tmp;

3422
	if (sqe->rw_flags || sqe->splice_fd_in)
3423 3424 3425 3426 3427 3428 3429 3430 3431
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
	p->len = READ_ONCE(sqe->len);

3432 3433 3434 3435 3436 3437
	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
				&size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
		return -EOVERFLOW;

3438 3439
	size = (unsigned long)p->len * p->nbufs;
	if (!access_ok(u64_to_user_ptr(p->addr), size))
3440 3441 3442 3443 3444 3445 3446 3447 3448 3449
		return -EFAULT;

	p->bgid = READ_ONCE(sqe->buf_group);
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
	p->bid = tmp;
	return 0;
}

3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
	struct io_buffer *buf;
	struct page *page;
	int bufs_in_page;

	/*
	 * Completions that don't happen inline (eg not under uring_lock) will
	 * add to ->io_buffers_comp. If we don't have any free buffers, check
	 * the completion list and splice those entries first.
	 */
	if (!list_empty_careful(&ctx->io_buffers_comp)) {
		spin_lock(&ctx->completion_lock);
		if (!list_empty(&ctx->io_buffers_comp)) {
			list_splice_init(&ctx->io_buffers_comp,
						&ctx->io_buffers_cache);
			spin_unlock(&ctx->completion_lock);
			return 0;
		}
		spin_unlock(&ctx->completion_lock);
	}

	/*
	 * No free buffers and no completion entries either. Allocate a new
	 * page worth of buffer entries and add those to our freelist.
	 */
	page = alloc_page(GFP_KERNEL_ACCOUNT);
	if (!page)
		return -ENOMEM;

	list_add(&page->lru, &ctx->io_buffers_pages);

	buf = page_address(page);
	bufs_in_page = PAGE_SIZE / sizeof(*buf);
	while (bufs_in_page) {
		list_add_tail(&buf->list, &ctx->io_buffers_cache);
		buf++;
		bufs_in_page--;
	}

	return 0;
}

static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
3494
			  struct io_buffer_list *bl)
3495 3496 3497 3498 3499 3500
{
	struct io_buffer *buf;
	u64 addr = pbuf->addr;
	int i, bid = pbuf->bid;

	for (i = 0; i < pbuf->nbufs; i++) {
3501 3502
		if (list_empty(&ctx->io_buffers_cache) &&
		    io_refill_buffer_cache(ctx))
3503
			break;
3504 3505
		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
					list);
3506
		list_move_tail(&buf->list, &bl->buf_list);
3507
		buf->addr = addr;
3508
		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
3509
		buf->bid = bid;
3510
		buf->bgid = pbuf->bgid;
3511 3512
		addr += pbuf->len;
		bid++;
3513
		cond_resched();
3514 3515
	}

3516
	return i ? 0 : -ENOMEM;
3517 3518
}

3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535
static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
	int i;

	ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
				GFP_KERNEL);
	if (!ctx->io_bl)
		return -ENOMEM;

	for (i = 0; i < BGID_ARRAY; i++) {
		INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
		ctx->io_bl[i].bgid = i;
	}

	return 0;
}

3536
static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
3537
{
3538
	struct io_provide_buf *p = io_kiocb_to_cmd(req);
3539
	struct io_ring_ctx *ctx = req->ctx;
3540
	struct io_buffer_list *bl;
3541 3542
	int ret = 0;

3543
	io_ring_submit_lock(ctx, issue_flags);
3544

3545 3546 3547 3548 3549
	if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
		ret = io_init_bl_list(ctx);
		if (ret)
			goto err;
	}
3550

3551 3552
	bl = io_buffer_get_list(ctx, p->bgid);
	if (unlikely(!bl)) {
3553
		bl = kzalloc(sizeof(*bl), GFP_KERNEL);
3554 3555 3556 3557
		if (!bl) {
			ret = -ENOMEM;
			goto err;
		}
3558
		INIT_LIST_HEAD(&bl->buf_list);
3559 3560 3561 3562 3563
		ret = io_buffer_add_list(ctx, bl, p->bgid);
		if (ret) {
			kfree(bl);
			goto err;
		}
3564
	}
3565 3566 3567 3568
	/* can't add buffers via this command for a mapped buffer ring */
	if (bl->buf_nr_pages) {
		ret = -EINVAL;
		goto err;
3569
	}
3570 3571 3572

	ret = io_add_buffers(ctx, p, bl);
err:
3573
	if (ret < 0)
3574
		req_set_fail(req);
3575
	/* complete before unlock, IOPOLL may need the lock */
3576 3577
	io_req_set_res(req, ret, 0);
	__io_req_complete(req, issue_flags);
3578
	io_ring_submit_unlock(ctx, issue_flags);
3579
	return IOU_ISSUE_SKIP_COMPLETE;
3580 3581
}

3582 3583 3584 3585 3586 3587
static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb,
					     const struct io_uring_sqe *sqe)
{
	return -EOPNOTSUPP;
}

3588 3589 3590
struct io_poll_table {
	struct poll_table_struct pt;
	struct io_kiocb *req;
3591
	int nr_entries;
3592 3593
	int error;
};
3594

3595
#define IO_POLL_CANCEL_FLAG	BIT(31)
3596
#define IO_POLL_REF_MASK	GENMASK(30, 0)
3597

3598 3599 3600 3601 3602 3603 3604 3605 3606
/*
 * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
 * bump it and acquire ownership. It's disallowed to modify requests while not
 * owning it, that prevents from races for enqueueing task_work's and b/w
 * arming poll and wakeups.
 */
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
3607 3608
}

3609
static void io_poll_mark_cancelled(struct io_kiocb *req)
3610
{
3611
	atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
3612 3613
}

3614
static struct io_poll *io_poll_get_double(struct io_kiocb *req)
3615
{
3616
	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
3617
	if (req->opcode == IORING_OP_POLL_ADD)
3618
		return req->async_data;
3619 3620 3621
	return req->apoll->double_poll;
}

3622
static struct io_poll *io_poll_get_single(struct io_kiocb *req)
3623 3624
{
	if (req->opcode == IORING_OP_POLL_ADD)
3625
		return io_kiocb_to_cmd(req);
3626 3627 3628
	return &req->apoll->poll;
}

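/* Hash the request into the ctx cancellation table, keyed by its user_data. */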
static void io_poll_req_insert(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct hlist_head *list;

	list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)];
	hlist_add_head(&req->hash_node, list);
}

static void io_init_poll_iocb(struct io_poll *poll, __poll_t events,
			      wait_queue_func_t wake_func)
{
	poll->head = NULL;
#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
	/* mask in events that we always want/need */
	poll->events = events | IO_POLL_UNMASK;
	INIT_LIST_HEAD(&poll->wait.entry);
	init_waitqueue_func_entry(&poll->wait, wake_func);
}

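/* Unhook one poll entry from its waitqueue, if it is still queued. */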
static inline void io_poll_remove_entry(struct io_poll *poll)
{
	struct wait_queue_head *head = smp_load_acquire(&poll->head);

	if (head) {
		spin_lock_irq(&head->lock);
		list_del_init(&poll->wait.entry);
		poll->head = NULL;
		spin_unlock_irq(&head->lock);
	}
}

static void io_poll_remove_entries(struct io_kiocb *req)
{
	/*
	 * Nothing to do if neither of those flags are set. Avoid dipping
	 * into the poll/apoll/double cachelines if we can.
	 */
	if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
		return;

	/*
	 * While we hold the waitqueue lock and the waitqueue is nonempty,
	 * wake_up_pollfree() will wait for us.  However, taking the waitqueue
	 * lock in the first place can race with the waitqueue being freed.
	 *
	 * We solve this as eventpoll does: by taking advantage of the fact that
	 * all users of wake_up_pollfree() will RCU-delay the actual free.  If
	 * we enter rcu_read_lock() and see that the pointer to the queue is
	 * non-NULL, we can then lock it without the memory being freed out from
	 * under us.
	 *
	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
	 * case the caller deletes the entry from the queue, leaving it empty.
	 * In that case, only RCU prevents the queue memory from being freed.
	 */
	rcu_read_lock();
	if (req->flags & REQ_F_SINGLE_POLL)
		io_poll_remove_entry(io_poll_get_single(req));
	if (req->flags & REQ_F_DOUBLE_POLL)
		io_poll_remove_entry(io_poll_get_double(req));
	rcu_read_unlock();
}

static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags);

/*
 * All poll tw should go through this. Checks for poll events, manages
 * references, does rewait, etc.
 *
 * Returns a negative error on failure. >0 when no action is required, which
 * is either a spurious wakeup or a multishot CQE was served. 0 when it's done
 * with the request, then the mask is stored in req->cqe.res.
 */
static int io_poll_check_events(struct io_kiocb *req, bool *locked)
{
	struct io_ring_ctx *ctx = req->ctx;
	int v, ret;

	/* req->task == current here, checking PF_EXITING is safe */
	if (unlikely(req->task->flags & PF_EXITING))
		return -ECANCELED;

	do {
		v = atomic_read(&req->poll_refs);

		/* tw handler should be the owner, and so have some references */
		if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
			return 0;
		if (v & IO_POLL_CANCEL_FLAG)
			return -ECANCELED;

		if (!req->cqe.res) {
			struct poll_table_struct pt = { ._key = req->apoll_events };

			req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
		}

		if (unlikely(!req->cqe.res))
			continue;
		if (req->apoll_events & EPOLLONESHOT)
			return 0;

		/* multishot, just fill a CQE and proceed */
		if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
			__poll_t mask = mangle_poll(req->cqe.res &
						    req->apoll_events);
			bool filled;

			spin_lock(&ctx->completion_lock);
			filled = io_fill_cqe_aux(ctx, req->cqe.user_data,
						 mask, IORING_CQE_F_MORE);
			io_commit_cqring(ctx);
			spin_unlock(&ctx->completion_lock);
			if (filled) {
				io_cqring_ev_posted(ctx);
				continue;
			}
			return -ECANCELED;
		}

		io_tw_lock(req->ctx, locked);
		if (unlikely(req->task->flags & PF_EXITING))
			return -EFAULT;
		ret = io_issue_sqe(req,
				   IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
		if (ret)
			return ret;

		/*
		 * Release all references, retry if someone tried to restart
		 * task_work while we were executing it.
		 */
	} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));

	return 1;
}

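/* task_work completion path for IORING_OP_POLL_ADD requests. */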
static void io_poll_task_func(struct io_kiocb *req, bool *locked)
{
	struct io_ring_ctx *ctx = req->ctx;
	int ret;

	ret = io_poll_check_events(req, locked);
	if (ret > 0)
		return;

	if (!ret) {
		struct io_poll *poll = io_kiocb_to_cmd(req);

		req->cqe.res = mangle_poll(req->cqe.res & poll->events);
	} else {
		req->cqe.res = ret;
		req_set_fail(req);
	}

	io_poll_remove_entries(req);
	spin_lock(&ctx->completion_lock);
	hash_del(&req->hash_node);
	req->cqe.flags = 0;
	__io_req_complete_post(req);
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_ev_posted(ctx);
}

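/* task_work path for async armed poll: resubmit the request or fail it. */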
static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
{
	struct io_ring_ctx *ctx = req->ctx;
	int ret;

	ret = io_poll_check_events(req, locked);
	if (ret > 0)
		return;

	io_poll_remove_entries(req);
	spin_lock(&ctx->completion_lock);
	hash_del(&req->hash_node);
	spin_unlock(&ctx->completion_lock);

	if (!ret)
		io_req_task_submit(req, locked);
	else
		io_req_complete_failed(req, ret);
}

static void __io_poll_execute(struct io_kiocb *req, int mask,
			      __poll_t __maybe_unused events)
{
	io_req_set_res(req, mask, 0);
	/*
	 * This is useful for poll that is armed on behalf of another
	 * request, and where the wakeup path could be on a different
	 * CPU. We want to avoid pulling in req->apoll->events for that
	 * case.
	 */
	if (req->opcode == IORING_OP_POLL_ADD)
		req->io_task_work.func = io_poll_task_func;
	else
		req->io_task_work.func = io_apoll_task_func;

	trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask);
	io_req_task_work_add(req);
}

static inline void io_poll_execute(struct io_kiocb *req, int res,
		__poll_t events)
{
	if (io_poll_get_ownership(req))
		__io_poll_execute(req, res, events);
}

static void io_poll_cancel_req(struct io_kiocb *req)
{
	io_poll_mark_cancelled(req);
	/* kick tw, which should complete the request */
	io_poll_execute(req, 0, 0);
}

#define wqe_to_req(wait)	((void *)((unsigned long) (wait)->private & ~1))
#define wqe_is_double(wait)	((unsigned long) (wait)->private & 1)
#define IO_ASYNC_POLL_COMMON	(EPOLLONESHOT | EPOLLPRI)

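/* Waitqueue wake callback shared by poll and async-poll entries. */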
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
			void *key)
{
	struct io_kiocb *req = wqe_to_req(wait);
	struct io_poll *poll = container_of(wait, struct io_poll, wait);
	__poll_t mask = key_to_poll(key);

	if (unlikely(mask & POLLFREE)) {
		io_poll_mark_cancelled(req);
		/* we have to kick tw in case it's not already */
		io_poll_execute(req, 0, poll->events);

		/*
		 * If the waitqueue is being freed early but someone already
		 * holds ownership over it, we have to tear down the request as
		 * best we can. That means immediately removing the request from
		 * its waitqueue and preventing all further accesses to the
		 * waitqueue via the request.
		 */
		list_del_init(&poll->wait.entry);

		/*
		 * Careful: this *must* be the last step, since as soon
		 * as req->head is NULL'ed out, the request can be
		 * completed and freed, since aio_poll_complete_work()
		 * will no longer need to take the waitqueue lock.
		 */
		smp_store_release(&poll->head, NULL);
		return 1;
	}

	/* for instances that support it check for an event match first */
	if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON)))
		return 0;

	if (io_poll_get_ownership(req)) {
		/* optional, saves extra locking for removal in tw handler */
		if (mask && poll->events & EPOLLONESHOT) {
			list_del_init(&poll->wait.entry);
			poll->head = NULL;
			if (wqe_is_double(wait))
				req->flags &= ~REQ_F_DOUBLE_POLL;
			else
				req->flags &= ~REQ_F_SINGLE_POLL;
		}
		__io_poll_execute(req, mask, poll->events);
	}
	return 1;
}

3900
static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
3901
			    struct wait_queue_head *head,
3902
			    struct io_poll **poll_ptr)
3903 3904
{
	struct io_kiocb *req = pt->req;
3905
	unsigned long wqe_private = (unsigned long) req;
3906 3907

	/*
3908
	 * The file being polled uses multiple waitqueues for poll handling
3909
	 * (e.g. one for read, one for write). Setup a separate io_poll
3910
	 * if this happens.
3911
	 */
3912
	if (unlikely(pt->nr_entries)) {
3913
		struct io_poll *first = poll;
3914

3915
		/* double add on the same waitqueue head, ignore */
		if (first->head == head)
3917
			return;
3918
		/* already have a 2nd entry, fail a third attempt */
3919
		if (*poll_ptr) {
3920 3921
			if ((*poll_ptr)->head == head)
				return;
3922 3923 3924
			pt->error = -EINVAL;
			return;
		}

3926 3927 3928 3929 3930
		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
		if (!poll) {
			pt->error = -ENOMEM;
			return;
		}
3931 3932
		/* mark as double wq entry */
		wqe_private |= 1;
3933
		req->flags |= REQ_F_DOUBLE_POLL;
		io_init_poll_iocb(poll, first->events, first->wait.func);
3935
		*poll_ptr = poll;
3936 3937
		if (req->opcode == IORING_OP_POLL_ADD)
			req->flags |= REQ_F_ASYNC_DATA;
3938 3939
	}

3940
	req->flags |= REQ_F_SINGLE_POLL;
3941
	pt->nr_entries++;
3942
	poll->head = head;
3943
	poll->wait.private = (void *) wqe_private;
3944 3945 3946 3947 3948

	if (poll->events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(head, &poll->wait);
	else
		add_wait_queue(head, &poll->wait);
3949 3950
}

static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
3952 3953 3954
			       struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
3955
	struct io_poll *poll = io_kiocb_to_cmd(pt->req);
3956

3957 3958
	__io_queue_proc(poll, pt, head,
			(struct io_poll **) &pt->req->async_data);
3959 3960
}

static int __io_arm_poll_handler(struct io_kiocb *req,
3962
				 struct io_poll *poll,
				 struct io_poll_table *ipt, __poll_t mask)
3964 3965
{
	struct io_ring_ctx *ctx = req->ctx;
	int v;
3967

3968
	INIT_HLIST_NODE(&req->hash_node);
3969
	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
	io_init_poll_iocb(poll, mask, io_poll_wake);
3971
	poll->file = req->file;
3972

	req->apoll_events = poll->events;

3975 3976
	ipt->pt._key = mask;
	ipt->req = req;
3977 3978
	ipt->error = 0;
	ipt->nr_entries = 0;
3979

	/*
	 * Take the ownership to delay any tw execution up until we're done
	 * with poll arming. see io_poll_get_ownership().
	 */
	atomic_set(&req->poll_refs, 1);
3985
	mask = vfs_poll(req->file, &ipt->pt) & poll->events;

	if (mask && (poll->events & EPOLLONESHOT)) {
		io_poll_remove_entries(req);
		/* no one else has access to the req, forget about the ref */
		return mask;
	}
	if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
		io_poll_remove_entries(req);
		if (!ipt->error)
			ipt->error = -EINVAL;
		return 0;
	}
3998

3999
	spin_lock(&ctx->completion_lock);
	io_poll_req_insert(req);
	spin_unlock(&ctx->completion_lock);

	if (mask) {
		/* can't multishot if failed, just queue the event we've got */
		if (unlikely(ipt->error || !ipt->nr_entries)) {
			poll->events |= EPOLLONESHOT;
			req->apoll_events |= EPOLLONESHOT;
4008
			ipt->error = 0;
		}
4010
		__io_poll_execute(req, mask, poll->events);
		return 0;
4012 4013
	}

	/*
	 * Release ownership. If someone tried to queue a tw while it was
	 * locked, kick it off for them.
	 */
	v = atomic_dec_return(&req->poll_refs);
	if (unlikely(v & IO_POLL_REF_MASK))
4020
		__io_poll_execute(req, 0, poll->events);
	return 0;
}

static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
			       struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
	struct async_poll *apoll = pt->req->apoll;

	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
4031 4032
}

4033 4034 4035 4036 4037 4038
enum {
	IO_APOLL_OK,
	IO_APOLL_ABORTED,
	IO_APOLL_READY
};

4039
static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
4040 4041 4042 4043 4044
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	struct io_ring_ctx *ctx = req->ctx;
	struct async_poll *apoll;
	struct io_poll_table ipt;
4045
	__poll_t mask = POLLPRI | POLLERR;
	int ret;
4047

4048 4049
	if (!def->pollin && !def->pollout)
		return IO_APOLL_ABORTED;
4050
	if (!file_can_poll(req->file))
4051
		return IO_APOLL_ABORTED;
4052
	if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
4053
		return IO_APOLL_ABORTED;
4054 4055
	if (!(req->flags & REQ_F_APOLL_MULTISHOT))
		mask |= EPOLLONESHOT;
4056 4057

	if (def->pollin) {
4058
		mask |= EPOLLIN | EPOLLRDNORM;
4059 4060

		/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
4061
		if (req->flags & REQ_F_CLEAR_POLLIN)
4062
			mask &= ~EPOLLIN;
4063
	} else {
4064
		mask |= EPOLLOUT | EPOLLWRNORM;
4065
	}
4066 4067
	if (def->poll_exclusive)
		mask |= EPOLLEXCLUSIVE;
4068 4069
	if (req->flags & REQ_F_POLLED) {
		apoll = req->apoll;
4070
		kfree(apoll->double_poll);
4071 4072
	} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
		   !list_empty(&ctx->apoll_cache)) {
4073 4074 4075 4076 4077 4078 4079 4080
		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
						poll.wait.entry);
		list_del_init(&apoll->poll.wait.entry);
	} else {
		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
		if (unlikely(!apoll))
			return IO_APOLL_ABORTED;
	}
4081
	apoll->double_poll = NULL;
4082
	req->apoll = apoll;
4083
	req->flags |= REQ_F_POLLED;
4084 4085
	ipt.pt._qproc = io_async_queue_proc;

4086
	io_kbuf_recycle(req, issue_flags);
4087

	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
4089 4090 4091
	if (ret || ipt.error)
		return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;

4092
	trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode,
4093
				mask, apoll->poll.events);
4094
	return IO_APOLL_OK;
4095 4096
}

4097 4098 4099
/*
 * Returns true if we found and killed one or more poll requests
 */
static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
				      struct task_struct *tsk, bool cancel_all)
4102
{
4103
	struct hlist_node *tmp;
4104
	struct io_kiocb *req;
	bool found = false;
	int i;
4107

4108
	spin_lock(&ctx->completion_lock);
4109 4110 4111 4112
	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
		struct hlist_head *list;

		list = &ctx->cancel_hash[i];
4113
		hlist_for_each_entry_safe(req, tmp, list, hash_node) {
4114
			if (io_match_task_safe(req, tsk, cancel_all)) {
4115
				hlist_del_init(&req->hash_node);
				io_poll_cancel_req(req);
				found = true;
			}
4119
		}
4120
	}
4121
	spin_unlock(&ctx->completion_lock);
	return found;
4123 4124
}

4125 4126
static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
				     struct io_cancel_data *cd)
4127
	__must_hold(&ctx->completion_lock)
4128
{
4129
	struct hlist_head *list;
4130 4131
	struct io_kiocb *req;

4132
	list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)];
4133
	hlist_for_each_entry(req, list, hash_node) {
4134
		if (cd->data != req->cqe.user_data)
4135
			continue;
4136 4137
		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
			continue;
4138 4139 4140 4141 4142
		if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
			if (cd->seq == req->work.cancel_seq)
				continue;
			req->work.cancel_seq = cd->seq;
		}
4143
		return req;
4144
	}
4145 4146 4147
	return NULL;
}

4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159
static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
					  struct io_cancel_data *cd)
	__must_hold(&ctx->completion_lock)
{
	struct io_kiocb *req;
	int i;

	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
		struct hlist_head *list;

		list = &ctx->cancel_hash[i];
		hlist_for_each_entry(req, list, hash_node) {
4160 4161
			if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
			    req->file != cd->file)
4162 4163 4164 4165 4166 4167 4168 4169 4170 4171
				continue;
			if (cd->seq == req->work.cancel_seq)
				continue;
			req->work.cancel_seq = cd->seq;
			return req;
		}
	}
	return NULL;
}

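/* Take ownership of a hashed poll request and detach it; the caller completes it. */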
static bool io_poll_disarm(struct io_kiocb *req)
	__must_hold(&ctx->completion_lock)
{
	if (!io_poll_get_ownership(req))
		return false;
	io_poll_remove_entries(req);
	hash_del(&req->hash_node);
	return true;
}

4182
static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
4183
	__must_hold(&ctx->completion_lock)
4184
{
4185
	struct io_kiocb *req;
4186

4187
	if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY))
4188 4189 4190
		req = io_poll_file_find(ctx, cd);
	else
		req = io_poll_find(ctx, false, cd);
4191 4192
	if (!req)
		return -ENOENT;
	io_poll_cancel_req(req);
	return 0;
4195 4196
}

4197 4198 4199 4200
static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
				     unsigned int flags)
{
	u32 events;
4201

4202 4203 4204 4205 4206 4207 4208
	events = READ_ONCE(sqe->poll32_events);
#ifdef __BIG_ENDIAN
	events = swahw32(events);
#endif
	if (!(flags & IORING_POLL_ADD_MULTI))
		events |= EPOLLONESHOT;
	return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
4209 4210
}

4211
static int io_poll_remove_prep(struct io_kiocb *req,
4212
			       const struct io_uring_sqe *sqe)
4213
{
4214
	struct io_poll_update *upd = io_kiocb_to_cmd(req);
4215 4216
	u32 flags;

4217
	if (sqe->buf_index || sqe->splice_fd_in)
4218 4219 4220 4221 4222 4223 4224
		return -EINVAL;
	flags = READ_ONCE(sqe->len);
	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
		      IORING_POLL_ADD_MULTI))
		return -EINVAL;
	/* meaningless without update */
	if (flags == IORING_POLL_ADD_MULTI)
4225 4226
		return -EINVAL;

4227 4228 4229
	upd->old_user_data = READ_ONCE(sqe->addr);
	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
4230

4231 4232 4233 4234 4235 4236 4237
	upd->new_user_data = READ_ONCE(sqe->off);
	if (!upd->update_user_data && upd->new_user_data)
		return -EINVAL;
	if (upd->update_events)
		upd->events = io_poll_parse_events(sqe, flags);
	else if (sqe->poll32_events)
		return -EINVAL;
4238 4239 4240 4241

	return 0;
}

4242
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4243
{
4244
	struct io_poll *poll = io_kiocb_to_cmd(req);
4245
	u32 flags;
4246

4247
	if (sqe->buf_index || sqe->off || sqe->addr)
4248 4249
		return -EINVAL;
	flags = READ_ONCE(sqe->len);
4250
	if (flags & ~IORING_POLL_ADD_MULTI)
4251
		return -EINVAL;
4252 4253
	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
		return -EINVAL;
4254

4255
	io_req_set_refcount(req);
	poll->events = io_poll_parse_events(sqe, flags);
4257 4258 4259
	return 0;
}

4260
static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
4261
{
4262
	struct io_poll *poll = io_kiocb_to_cmd(req);
4263
	struct io_poll_table ipt;
	int ret;
4265

4266
	ipt.pt._qproc = io_poll_queue_proc;
4267

4268
	ret = __io_arm_poll_handler(req, poll, &ipt, poll->events);
4269 4270 4271 4272 4273
	if (ret) {
		io_req_set_res(req, ret, 0);
		return IOU_OK;
	}
	if (ipt.error) {
4274
		req_set_fail(req);
4275 4276 4277 4278
		return ipt.error;
	}

	return IOU_ISSUE_SKIP_COMPLETE;
4279 4280
}

4281
static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
4282
{
4283 4284
	struct io_poll_update *poll_update = io_kiocb_to_cmd(req);
	struct io_cancel_data cd = { .data = poll_update->old_user_data, };
4285 4286
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *preq;
	int ret2, ret = 0;
4288
	bool locked;
4289

4290
	spin_lock(&ctx->completion_lock);
4291
	preq = io_poll_find(ctx, true, &cd);
	if (!preq || !io_poll_disarm(preq)) {
4293
		spin_unlock(&ctx->completion_lock);
		ret = preq ? -EALREADY : -ENOENT;
		goto out;
4296
	}
4297
	spin_unlock(&ctx->completion_lock);
4298

4299
	if (poll_update->update_events || poll_update->update_user_data) {
		/* only mask one event flags, keep behavior flags */
4301
		if (poll_update->update_events) {
4302 4303 4304
			struct io_poll *poll = io_kiocb_to_cmd(preq);

			poll->events &= ~0xffff;
4305
			poll->events |= poll_update->events & 0xffff;
4306
			poll->events |= IO_POLL_UNMASK;
4307
		}
4308 4309
		if (poll_update->update_user_data)
			preq->cqe.user_data = poll_update->new_user_data;
4310

		ret2 = io_poll_add(preq, issue_flags);
		/* successfully updated, don't complete poll request */
4313
		if (!ret2 || ret2 == -EIOCBQUEUED)
			goto out;
4315
	}
4316

	req_set_fail(preq);
4318
	io_req_set_res(preq, -ECANCELED, 0);
4319 4320
	locked = !(issue_flags & IO_URING_F_UNLOCKED);
	io_req_task_complete(preq, &locked);
out:
4322
	if (ret < 0) {
4323
		req_set_fail(req);
4324 4325
		return ret;
	}
	/* complete update request, we're done with it */
4327 4328
	io_req_set_res(req, ret, 0);
	return IOU_OK;
4329 4330
}

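/* io-wq cancel callback: match a queued work item against io_cancel_data. */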
static bool io_cancel_cb(struct io_wq_work *work, void *data)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4334
	struct io_cancel_data *cd = data;
4335

4336 4337
	if (req->ctx != cd->ctx)
		return false;
4338 4339 4340
	if (cd->flags & IORING_ASYNC_CANCEL_ANY) {
		;
	} else if (cd->flags & IORING_ASYNC_CANCEL_FD) {
4341 4342 4343 4344 4345 4346
		if (req->file != cd->file)
			return false;
	} else {
		if (req->cqe.user_data != cd->data)
			return false;
	}
4347
	if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
4348 4349 4350 4351 4352
		if (cd->seq == req->work.cancel_seq)
			return false;
		req->work.cancel_seq = cd->seq;
	}
	return true;
4353 4354
}

4355 4356
static int io_async_cancel_one(struct io_uring_task *tctx,
			       struct io_cancel_data *cd)
4357 4358 4359
{
	enum io_wq_cancel cancel_ret;
	int ret = 0;
4360
	bool all;
4361

4362
	if (!tctx || !tctx->io_wq)
4363 4364
		return -ENOENT;

4365 4366
	all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all);
4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378
	switch (cancel_ret) {
	case IO_WQ_CANCEL_OK:
		ret = 0;
		break;
	case IO_WQ_CANCEL_RUNNING:
		ret = -EALREADY;
		break;
	case IO_WQ_CANCEL_NOTFOUND:
		ret = -ENOENT;
		break;
	}

4379 4380 4381
	return ret;
}

4382
int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd)
4383
{
4384
	struct io_ring_ctx *ctx = req->ctx;
4385 4386
	int ret;

4387
	WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
4388

4389
	ret = io_async_cancel_one(req->task->io_uring, cd);
4390 4391 4392 4393 4394 4395
	/*
	 * Fall-through even for -EALREADY, as we may have poll armed
	 * that need unarming.
	 */
	if (!ret)
		return 0;
4396 4397

	spin_lock(&ctx->completion_lock);
4398
	ret = io_poll_cancel(ctx, cd);
4399 4400
	if (ret != -ENOENT)
		goto out;
4401 4402
	if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
		ret = io_timeout_cancel(ctx, cd);
4403 4404 4405
out:
	spin_unlock(&ctx->completion_lock);
	return ret;
4406 4407
}

4408 4409 4410
#define CANCEL_FLAGS	(IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
			 IORING_ASYNC_CANCEL_ANY)

4411 4412
static int io_async_cancel_prep(struct io_kiocb *req,
				const struct io_uring_sqe *sqe)
4413
{
4414 4415
	struct io_cancel *cancel = io_kiocb_to_cmd(req);

4416
	if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
4417
		return -EINVAL;
4418
	if (sqe->off || sqe->len || sqe->splice_fd_in)
4419 4420
		return -EINVAL;

4421 4422 4423
	cancel->addr = READ_ONCE(sqe->addr);
	cancel->flags = READ_ONCE(sqe->cancel_flags);
	if (cancel->flags & ~CANCEL_FLAGS)
4424
		return -EINVAL;
4425 4426
	if (cancel->flags & IORING_ASYNC_CANCEL_FD) {
		if (cancel->flags & IORING_ASYNC_CANCEL_ANY)
4427
			return -EINVAL;
4428
		cancel->fd = READ_ONCE(sqe->fd);
4429
	}
4430

4431 4432 4433
	return 0;
}

4434 4435
static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req,
			     unsigned int issue_flags)
4436
{
4437
	bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
4438
	struct io_ring_ctx *ctx = cd->ctx;
4439
	struct io_tctx_node *node;
4440
	int ret, nr = 0;
4441

4442 4443 4444 4445
	do {
		ret = io_try_cancel(req, cd);
		if (ret == -ENOENT)
			break;
4446
		if (!all)
4447 4448 4449
			return ret;
		nr++;
	} while (1);
4450 4451

	/* slow path, try all io-wq's */
4452
	io_ring_submit_lock(ctx, issue_flags);
4453 4454 4455
	ret = -ENOENT;
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;
4456

4457 4458
		ret = io_async_cancel_one(tctx, cd);
		if (ret != -ENOENT) {
4459
			if (!all)
4460 4461 4462
				break;
			nr++;
		}
4463
	}
4464
	io_ring_submit_unlock(ctx, issue_flags);
4465
	return all ? nr : ret;
4466 4467 4468 4469
}

static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
4470
	struct io_cancel *cancel = io_kiocb_to_cmd(req);
4471 4472
	struct io_cancel_data cd = {
		.ctx	= req->ctx,
4473 4474
		.data	= cancel->addr,
		.flags	= cancel->flags,
4475 4476 4477 4478
		.seq	= atomic_inc_return(&req->ctx->cancel_seq),
	};
	int ret;

4479 4480
	if (cd.flags & IORING_ASYNC_CANCEL_FD) {
		if (req->flags & REQ_F_FIXED_FILE)
4481
			req->file = io_file_get_fixed(req, cancel->fd,
4482 4483
							issue_flags);
		else
4484
			req->file = io_file_get_normal(req, cancel->fd);
4485 4486 4487 4488 4489
		if (!req->file) {
			ret = -EBADF;
			goto done;
		}
		cd.file = req->file;
4490
	}
4491

4492
	ret = __io_async_cancel(&cd, req, issue_flags);
4493 4494
done:
	if (ret < 0)
4495
		req_set_fail(req);
4496 4497
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

4500
static int io_files_update_prep(struct io_kiocb *req,
4501 4502
				const struct io_uring_sqe *sqe)
{
4503 4504
	struct io_rsrc_update *up = io_kiocb_to_cmd(req);

4505 4506
	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
4507
	if (sqe->rw_flags || sqe->splice_fd_in)
4508 4509
		return -EINVAL;

4510 4511 4512
	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
4513
		return -EINVAL;
4514
	up->arg = READ_ONCE(sqe->addr);
4515 4516 4517
	return 0;
}

4518 4519 4520
static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
4521 4522
	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
4523 4524 4525 4526
	unsigned int done;
	struct file *file;
	int ret, fd;

4527 4528 4529
	if (!req->ctx->file_data)
		return -ENXIO;

4530
	for (done = 0; done < up->nr_args; done++) {
4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req, issue_flags, ret);
4547
			ret = -EFAULT;
4548 4549 4550 4551 4552 4553 4554 4555 4556
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

4557
static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
4558
{
4559
	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
4560
	struct io_ring_ctx *ctx = req->ctx;
4561
	struct io_uring_rsrc_update2 up2;
4562
	int ret;
4563

4564 4565 4566 4567 4568 4569
	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;
4570

4571
	if (up->offset == IORING_FILE_INDEX_ALLOC) {
4572 4573 4574 4575
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
4576
						&up2, up->nr_args);
4577 4578
		io_ring_submit_unlock(ctx, issue_flags);
	}
4579 4580

	if (ret < 0)
4581
		req_set_fail(req);
4582 4583
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

4586
static int io_req_prep_async(struct io_kiocb *req)
4587
{
4588 4589 4590 4591 4592
	const struct io_op_def *def = &io_op_defs[req->opcode];

	/* assign early for deferred execution for non-fixed file */
	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
		req->file = io_file_get_normal(req, req->cqe.fd);
4593
	if (!def->prep_async)
4594 4595 4596 4597 4598 4599
		return 0;
	if (WARN_ON_ONCE(req_has_async_data(req)))
		return -EFAULT;
	if (io_alloc_async_data(req))
		return -EAGAIN;

4600
	return def->prep_async(req);
4601 4602
}

4603 4604
static u32 io_get_sequence(struct io_kiocb *req)
{
4605
	u32 seq = req->ctx->cached_sq_head;
4606
	struct io_kiocb *cur;
4607

4608
	/* need original cached_sq_head, but it was increased for each req */
4609
	io_for_each_link(cur, req)
4610 4611
		seq--;
	return seq;
4612 4613
}

static __cold void io_drain_req(struct io_kiocb *req)
4615
{
4616
	struct io_ring_ctx *ctx = req->ctx;
4617
	struct io_defer_entry *de;
4618
	int ret;
4619
	u32 seq = io_get_sequence(req);
4620

	/* Still need defer if there is pending req in defer list. */
4622
	spin_lock(&ctx->completion_lock);
4623
	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
4624
		spin_unlock(&ctx->completion_lock);
4625
queue:
4626
		ctx->drain_active = false;
4627 4628
		io_req_task_queue(req);
		return;
4629
	}
4630
	spin_unlock(&ctx->completion_lock);
4631

4632
	ret = io_req_prep_async(req);
4633 4634 4635 4636 4637
	if (ret) {
fail:
		io_req_complete_failed(req, ret);
		return;
	}
4638
	io_prep_async_link(req);
4639
	de = kmalloc(sizeof(*de), GFP_KERNEL);
4640
	if (!de) {
		ret = -ENOMEM;
4642
		goto fail;
4643
	}
4644

4645
	spin_lock(&ctx->completion_lock);
4646
	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
4647
		spin_unlock(&ctx->completion_lock);
4648
		kfree(de);
4649
		goto queue;
4650 4651
	}

4652
	trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
4653
	de->req = req;
4654
	de->seq = seq;
4655
	list_add_tail(&de->list, &ctx->defer_list);
4656
	spin_unlock(&ctx->completion_lock);
4657 4658
}

4659
static void io_clean_op(struct io_kiocb *req)
{
4661 4662
	if (req->flags & REQ_F_BUFFER_SELECTED) {
		spin_lock(&req->ctx->completion_lock);
4663
		io_put_kbuf_comp(req);
4664 4665
		spin_unlock(&req->ctx->completion_lock);
	}

4667
	if (req->flags & REQ_F_NEED_CLEANUP) {
4668
		const struct io_op_def *def = &io_op_defs[req->opcode];
4669

4670 4671
		if (def->cleanup)
			def->cleanup(req);
	}
4673 4674 4675 4676 4677
	if ((req->flags & REQ_F_POLLED) && req->apoll) {
		kfree(req->apoll->double_poll);
		kfree(req->apoll);
		req->apoll = NULL;
	}
4678 4679 4680 4681 4682
	if (req->flags & REQ_F_INFLIGHT) {
		struct io_uring_task *tctx = req->task->io_uring;

		atomic_dec(&tctx->inflight_tracked);
	}
4683
	if (req->flags & REQ_F_CREDS)
4684
		put_cred(req->creds);
4685 4686 4687 4688
	if (req->flags & REQ_F_ASYNC_DATA) {
		kfree(req->async_data);
		req->async_data = NULL;
	}
4689
	req->flags &= ~IO_REQ_CLEAN_FLAGS;
}

static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
{
	if (req->file || !io_op_defs[req->opcode].needs_file)
		return true;

	if (req->flags & REQ_F_FIXED_FILE)
4698
		req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
	else
4700
		req->file = io_file_get_normal(req, req->cqe.fd);

4702
	return !!req->file;
}

4705
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
4707
	const struct io_op_def *def = &io_op_defs[req->opcode];
4708
	const struct cred *creds = NULL;
4709
	int ret;

4711 4712 4713
	if (unlikely(!io_assign_file(req, issue_flags)))
		return -EBADF;

4714
	if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
4715
		creds = override_creds(req->creds);
4716

4717
	if (!def->audit_skip)
4718 4719
		audit_uring_entry(req->opcode);

4720
	ret = def->issue(req, issue_flags);

4722
	if (!def->audit_skip)
4723 4724
		audit_uring_exit(!ret, ret);

4725 4726
	if (creds)
		revert_creds(creds);
4727 4728 4729 4730

	if (ret == IOU_OK)
		__io_req_complete(req, issue_flags);
	else if (ret != IOU_ISSUE_SKIP_COMPLETE)
		return ret;
4732

4733
	/* If the op doesn't have a file, we're not polling for it */
4734
	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
4735
		io_iopoll_req_issued(req, issue_flags);

	return 0;
}

static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	req = io_put_req_find_next(req);
	return req ? &req->work : NULL;
}

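/* io-wq worker entry: issue the request, arming poll or retrying as needed. */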
static void io_wq_submit_work(struct io_wq_work *work)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
	const struct io_op_def *def = &io_op_defs[req->opcode];
4752 4753
	unsigned int issue_flags = IO_URING_F_UNLOCKED;
	bool needs_poll = false;
	int ret = 0, err = -ECANCELED;

4756 4757 4758 4759 4760
	/* one will be dropped by ->io_free_work() after returning to io-wq */
	if (!(req->flags & REQ_F_REFCOUNT))
		__io_req_set_refcount(req, 2);
	else
		req_ref_get(req);
4761

4762
	io_arm_ltimeout(req);

4764
	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
4765
	if (work->flags & IO_WQ_WORK_CANCEL) {
4766
fail:
		io_req_task_queue_fail(req, err);
4768 4769
		return;
	}
4770 4771 4772 4773 4774
	if (!io_assign_file(req, issue_flags)) {
		err = -EBADF;
		work->flags |= IO_WQ_WORK_CANCEL;
		goto fail;
	}
4775

4776
	if (req->flags & REQ_F_FORCE_ASYNC) {
4777 4778 4779 4780
		bool opcode_poll = def->pollin || def->pollout;

		if (opcode_poll && file_can_poll(req->file)) {
			needs_poll = true;
4781
			issue_flags |= IO_URING_F_NONBLOCK;
4782
		}
4783
	}
4784

4785 4786 4787 4788 4789 4790 4791 4792 4793 4794
	do {
		ret = io_issue_sqe(req, issue_flags);
		if (ret != -EAGAIN)
			break;
		/*
		 * We can get EAGAIN for iopolled IO even though we're
		 * forcing a sync submission from here, since we can't
		 * wait for request slots on the block side.
		 */
		if (!needs_poll) {
4795 4796
			if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
				break;
4797 4798
			cond_resched();
			continue;
4799 4800
		}

4801
		if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
4802 4803 4804 4805 4806
			return;
		/* aborted or ready, in either case retry blocking */
		needs_poll = false;
		issue_flags &= ~IO_URING_F_NONBLOCK;
	} while (1);
4807

4808
	/* avoid locking problems by failing it from a clean context */
4809
	if (ret < 0)
4810
		io_req_task_queue_fail(req, ret);
}

4813 4814 4815
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
					      int index)
{
4816
	struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
4817

4818
	return (struct file *) (slot->file_ptr & FFS_MASK);
4819 4820
}

4821
static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
4822 4823 4824
{
	unsigned long file_ptr = (unsigned long) file;

4825
	file_ptr |= io_file_get_flags(file);
4826
	file_slot->file_ptr = file_ptr;
4827 4828
}

4829 4830
inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
				      unsigned int issue_flags)
{
4832 4833
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = NULL;
4834
	unsigned long file_ptr;

4836
	io_ring_submit_lock(ctx, issue_flags);
4837

4838
	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
4839
		goto out;
4840 4841 4842 4843 4844
	fd = array_index_nospec(fd, ctx->nr_user_files);
	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
	file = (struct file *) (file_ptr & FFS_MASK);
	file_ptr &= ~FFS_MASK;
	/* mask in overlapping REQ_F and FFS bits */
4845
	req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
4846
	io_req_set_rsrc_node(req, ctx, 0);
4847
	WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap));
4848
out:
4849
	io_ring_submit_unlock(ctx, issue_flags);
4850 4851
	return file;
}
4852

4853
struct file *io_file_get_normal(struct io_kiocb *req, int fd)
4854
{
4855
	struct file *file = fget(fd);
4856

4857
	trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);

4859
	/* we don't allow fixed io_uring files */
4860
	if (file && io_is_uring_fops(file))
4861
		io_req_track_inflight(req);
	return file;
}

4865
static void io_queue_async(struct io_kiocb *req, int ret)
4866 4867
	__must_hold(&req->ctx->uring_lock)
{
4868 4869 4870 4871 4872 4873 4874 4875
	struct io_kiocb *linked_timeout;

	if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
		io_req_complete_failed(req, ret);
		return;
	}

	linked_timeout = io_prep_linked_timeout(req);
4876

4877
	switch (io_arm_poll_handler(req, 0)) {
4878 4879 4880 4881 4882 4883 4884 4885
	case IO_APOLL_READY:
		io_req_task_queue(req);
		break;
	case IO_APOLL_ABORTED:
		/*
		 * Queued up for async execution, worker will release
		 * submit reference when the iocb is actually submitted.
		 */
4886
		io_kbuf_recycle(req, 0);
4887
		io_queue_iowq(req, NULL);
4888
		break;
4889 4890
	case IO_APOLL_OK:
		break;
4891 4892 4893 4894 4895 4896
	}

	if (linked_timeout)
		io_queue_linked_timeout(linked_timeout);
}

static inline void io_queue_sqe(struct io_kiocb *req)
4898
	__must_hold(&req->ctx->uring_lock)
{
4900
	int ret;

4902
	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
4903

4904 4905
	if (req->flags & REQ_F_COMPLETE_INLINE) {
		io_req_add_compl_list(req);
4906
		return;
4907
	}
4908 4909 4910 4911
	/*
	 * We async punt it if the file wasn't marked NOWAIT, or if the file
	 * doesn't support non-blocking read/write attempts
	 */
4912
	if (likely(!ret))
4913
		io_arm_ltimeout(req);
4914 4915
	else
		io_queue_async(req, ret);
}

4918
static void io_queue_sqe_fallback(struct io_kiocb *req)
4919
	__must_hold(&req->ctx->uring_lock)
4920
{
4921 4922 4923 4924 4925 4926 4927 4928
	if (unlikely(req->flags & REQ_F_FAIL)) {
		/*
		 * We don't submit, fail them all, for that replace hardlinks
		 * with normal links. Extra REQ_F_LINK is tolerated.
		 */
		req->flags &= ~REQ_F_HARDLINK;
		req->flags |= REQ_F_LINK;
		io_req_complete_failed(req, req->cqe.res);
4929 4930
	} else if (unlikely(req->ctx->drain_active)) {
		io_drain_req(req);
4931 4932 4933 4934 4935 4936
	} else {
		int ret = io_req_prep_async(req);

		if (unlikely(ret))
			io_req_complete_failed(req, ret);
		else
4937
			io_queue_iowq(req, NULL);
	}
4939 4940
}

4941 4942 4943 4944 4945 4946 4947 4948
/*
 * Check SQE restrictions (opcode and flags).
 *
 * Returns 'true' if SQE is allowed, 'false' otherwise.
 */
static inline bool io_check_restriction(struct io_ring_ctx *ctx,
					struct io_kiocb *req,
					unsigned int sqe_flags)
4949
{
4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961
	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
		return false;

	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
	    ctx->restrictions.sqe_flags_required)
		return false;

	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
			  ctx->restrictions.sqe_flags_required))
		return false;

	return true;
4962 4963
}

4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974
static void io_init_req_drain(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *head = ctx->submit_state.link.head;

	ctx->drain_active = true;
	if (head) {
		/*
		 * If we need to drain a request in the middle of a link, drain
		 * the head request and the next request/link after the current
		 * link. Considering sequential execution of links,
4975
		 * REQ_F_IO_DRAIN will be maintained for every request of our
4976 4977
		 * link.
		 */
4978
		head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
4979 4980 4981 4982
		ctx->drain_next = true;
	}
}

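/* Initialise a request from its SQE: opcode, flags, creds and per-op prep. */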
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
		       const struct io_uring_sqe *sqe)
4985
	__must_hold(&ctx->uring_lock)
4986
{
4987
	const struct io_op_def *def;
4988
	unsigned int sqe_flags;
4989
	int personality;
4990
	u8 opcode;
4991

	/* req is partially pre-initialised, see io_preinit_req() */
4993
	req->opcode = opcode = READ_ONCE(sqe->opcode);
4994 4995
	/* same numerical values with corresponding REQ_F_*, safe to copy */
	req->flags = sqe_flags = READ_ONCE(sqe->flags);
4996
	req->cqe.user_data = READ_ONCE(sqe->user_data);
4997
	req->file = NULL;
4998
	req->rsrc_node = NULL;
4999 5000
	req->task = current;

5001 5002
	if (unlikely(opcode >= IORING_OP_LAST)) {
		req->opcode = 0;
5003
		return -EINVAL;
5004
	}
5005
	def = &io_op_defs[opcode];
5006 5007 5008 5009
	if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
		/* enforce forwards compatibility on users */
		if (sqe_flags & ~SQE_VALID_FLAGS)
			return -EINVAL;
5010
		if (sqe_flags & IOSQE_BUFFER_SELECT) {
5011
			if (!def->buffer_select)
5012 5013 5014
				return -EOPNOTSUPP;
			req->buf_index = READ_ONCE(sqe->buf_group);
		}
5015 5016 5017 5018 5019
		if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
			ctx->drain_disabled = true;
		if (sqe_flags & IOSQE_IO_DRAIN) {
			if (ctx->drain_disabled)
				return -EOPNOTSUPP;
5020
			io_init_req_drain(req);
5021
		}
5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032
	}
	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
		if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
			return -EACCES;
		/* knock it to the slow queue path, will be drained there */
		if (ctx->drain_active)
			req->flags |= REQ_F_FORCE_ASYNC;
		/* if there is no link, we're at "next" request and need to drain */
		if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
			ctx->drain_next = false;
			ctx->drain_active = true;
5033
			req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
5034
		}
5035
	}
5036

5037
	if (!def->ioprio && sqe->ioprio)
5038
		return -EINVAL;
5039
	if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
5040 5041
		return -EINVAL;

5042
	if (def->needs_file) {
		struct io_submit_state *state = &ctx->submit_state;

5045
		req->cqe.fd = READ_ONCE(sqe->fd);

		/*
		 * Plug now if we have more than 2 IO left after this, and the
		 * target is potentially a read/write to block based storage.
		 */
5051
		if (state->need_plug && def->plug) {
			state->plug_started = true;
			state->need_plug = false;
5054
			blk_start_plug_nr_ios(&state->plug, state->submit_nr);
		}
5056
	}
5057

5058 5059
	personality = READ_ONCE(sqe->personality);
	if (personality) {
5060 5061
		int ret;

5062 5063
		req->creds = xa_load(&ctx->personalities, personality);
		if (!req->creds)
5064
			return -EINVAL;
5065
		get_cred(req->creds);
5066 5067 5068 5069 5070
		ret = security_uring_override_creds(req->creds);
		if (ret) {
			put_cred(req->creds);
			return ret;
		}
5071
		req->flags |= REQ_F_CREDS;
5072
	}
5073

5074
	return def->prep(req, sqe);
5075 5076
}

5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114
static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
				      struct io_kiocb *req, int ret)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_submit_link *link = &ctx->submit_state.link;
	struct io_kiocb *head = link->head;

	trace_io_uring_req_failed(sqe, ctx, req, ret);

	/*
	 * Avoid breaking links in the middle as it renders links with SQPOLL
	 * unusable. Instead of failing eagerly, continue assembling the link if
	 * applicable and mark the head with REQ_F_FAIL. The link flushing code
	 * should find the flag and handle the rest.
	 */
	req_fail_link_node(req, ret);
	if (head && !(head->flags & REQ_F_FAIL))
		req_fail_link_node(head, -ECANCELED);

	if (!(req->flags & IO_REQ_LINK_FLAGS)) {
		if (head) {
			link->last->link = req;
			link->head = NULL;
			req = head;
		}
		io_queue_sqe_fallback(req);
		return ret;
	}

	if (head)
		link->last->link = req;
	else
		link->head = req;
	link->last = req;
	return 0;
}

static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
5115
			 const struct io_uring_sqe *sqe)
5116
	__must_hold(&ctx->uring_lock)
{
5118
	struct io_submit_link *link = &ctx->submit_state.link;
5119
	int ret;

5121
	ret = io_init_req(ctx, req, sqe);
5122 5123
	if (unlikely(ret))
		return io_submit_fail_init(sqe, req, ret);
5124

5125
	/* don't need @sqe from now on */
5126
	trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
5127 5128
				  req->flags, true,
				  ctx->flags & IORING_SETUP_SQPOLL);
5129

	/*
	 * If we already have a head request, queue this one for async
	 * submittal once the head completes. If we don't have a head but
	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
	 * submitted sync once the chain is complete. If none of those
	 * conditions are true (normal request), then just queue it.
	 */
5137
	if (unlikely(link->head)) {
5138 5139 5140 5141 5142
		ret = io_req_prep_async(req);
		if (unlikely(ret))
			return io_submit_fail_init(sqe, req, ret);

		trace_io_uring_link(ctx, req, link->head);
5143
		link->last->link = req;
5144
		link->last = req;
5145

5146
		if (req->flags & IO_REQ_LINK_FLAGS)
5147
			return 0;
5148 5149
		/* last request of the link, flush it */
		req = link->head;
5150
		link->head = NULL;
5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162
		if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
			goto fallback;

	} else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
					  REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
		if (req->flags & IO_REQ_LINK_FLAGS) {
			link->head = req;
			link->last = req;
		} else {
fallback:
			io_queue_sqe_fallback(req);
		}
5163
		return 0;
	}
5165

5166
	io_queue_sqe(req);
5167
	return 0;
}

5170 5171 5172
/*
 * Batched submission is done, ensure local IO is flushed out.
 */
5173
static void io_submit_state_end(struct io_ring_ctx *ctx)
5174
{
5175 5176
	struct io_submit_state *state = &ctx->submit_state;

5177 5178
	if (unlikely(state->link.head))
		io_queue_sqe_fallback(state->link.head);
5179
	/* flush only after queuing links as they can generate completions */
5180
	io_submit_flush_completions(ctx);
	if (state->plug_started)
		blk_finish_plug(&state->plug);
5183 5184 5185 5186 5187 5188
}

/*
 * Start submission side cache.
 */
static void io_submit_state_start(struct io_submit_state *state,
5189
				  unsigned int max_ios)
5190
{
	state->plug_started = false;
	state->need_plug = max_ios > 2;
5193
	state->submit_nr = max_ios;
5194 5195
	/* set only head, no need to init link_last in advance */
	state->link.head = NULL;
5196 5197
}

static void io_commit_sqring(struct io_ring_ctx *ctx)
{
5200
	struct io_rings *rings = ctx->rings;

5202 5203 5204 5205 5206 5207
	/*
	 * Ensure any loads from the SQEs are done at this point,
	 * since once we write the new head, the application could
	 * write new data to them.
	 */
	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}

/*
 * Fetch an sqe, if one is available. Note this returns a pointer to memory
 * that is mapped by userspace. This means that care needs to be taken to
 * ensure that reads are stable, as we cannot rely on userspace always
 * being a good citizen. If members of the sqe are validated and then later
 * used, it's important that those reads are done through READ_ONCE() to
 * prevent a re-load down the line.
 */
5218
static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
5220
	unsigned head, mask = ctx->sq_entries - 1;
	unsigned sq_idx = ctx->cached_sq_head++ & mask;

	/*
	 * The cached sq head (or cq tail) serves two purposes:
	 *
	 * 1) allows us to batch the cost of updating the user visible
	 *    head updates.
	 * 2) allows the kernel side to track the head on its own, even
	 *    though the application is the one updating it.
	 */
	head = READ_ONCE(ctx->sq_array[sq_idx]);
5232 5233 5234 5235
	if (likely(head < ctx->sq_entries)) {
		/* double index for 128-byte SQEs, twice as long */
		if (ctx->flags & IORING_SETUP_SQE128)
			head <<= 1;
5236
		return &ctx->sq_sqes[head];
5237
	}

	/* drop invalid entries */
5240 5241 5242
	ctx->cq_extra--;
	WRITE_ONCE(ctx->rings->sq_dropped,
		   READ_ONCE(ctx->rings->sq_dropped) + 1);
5243 5244 5245
	return NULL;
}

5246
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
5247
	__must_hold(&ctx->uring_lock)
{
5249
	unsigned int entries = io_sqring_entries(ctx);
5250 5251
	unsigned int left;
	int ret;

5253
	if (unlikely(!entries))
5254
		return 0;
5255
	/* make sure SQ entry isn't read before tail */
5256 5257 5258
	ret = left = min3(nr, ctx->sq_entries, entries);
	io_get_task_refs(left);
	io_submit_state_start(&ctx->submit_state, left);

5260
	do {
5261
		const struct io_uring_sqe *sqe;
5262
		struct io_kiocb *req;
5263

5264
		if (unlikely(!io_alloc_req_refill(ctx)))
5265
			break;
5266
		req = io_alloc_req(ctx);
5267 5268
		sqe = io_get_sqe(ctx);
		if (unlikely(!sqe)) {
5269
			io_req_add_to_cache(req, ctx);
5270 5271
			break;
		}

5273 5274 5275 5276 5277 5278 5279 5280
		/*
		 * Continue submitting even for sqe failure if the
		 * ring was setup with IORING_SETUP_SUBMIT_ALL
		 */
		if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
			left--;
			break;
5281
		}
5282
	} while (--left);
5283

5284 5285 5286 5287 5288 5289
	if (unlikely(left)) {
		ret -= left;
		/* try again if it submitted nothing and can't allocate a req */
		if (!ret && io_req_cache_empty(ctx))
			ret = -EAGAIN;
		current->io_uring->cached_refs += left;
5290
	}

5292
	io_submit_state_end(ctx);
5293 5294
	 /* Commit SQ ring head once we've consumed and submitted all SQEs */
	io_commit_sqring(ctx);
5295
	return ret;
}

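/* Wait state for io_cqring_wait(): wake once enough CQEs or a timeout arrive. */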
struct io_wait_queue {
	struct wait_queue_entry wq;
	struct io_ring_ctx *ctx;
5301
	unsigned cq_tail;
5302 5303 5304
	unsigned nr_timeouts;
};

5305
static inline bool io_should_wake(struct io_wait_queue *iowq)
5306 5307
{
	struct io_ring_ctx *ctx = iowq->ctx;
5308
	int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
5309 5310

	/*
5311
	 * Wake up if we have enough events, or if a timeout occurred since we
5312 5313 5314
	 * started waiting. For timeouts, we always want to return to userspace,
	 * regardless of event count.
	 */
5315
	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
5316 5317 5318 5319 5320 5321 5322 5323
}

static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
			    int wake_flags, void *key)
{
	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
							wq);

5324 5325 5326 5327
	/*
	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
	 * the task, and the next invocation will do it.
	 */
5328 5329
	if (io_should_wake(iowq) ||
	    test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq))
5330 5331
		return autoremove_wake_function(curr, mode, wake_flags, key);
	return -1;
5332 5333
}

5334 5335 5336 5337
static int io_run_task_work_sig(void)
{
	if (io_run_task_work())
		return 1;
5338
	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
5339
		return -ERESTARTSYS;
5340 5341 5342
	if (task_sigpending(current))
		return -EINTR;
	return 0;
5343 5344
}

5345 5346 5347
/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
					  struct io_wait_queue *iowq,
5348
					  ktime_t timeout)
5349 5350
{
	int ret;
5351
	unsigned long check_cq;
5352 5353 5354 5355 5356

	/* make sure we run task_work before checking for signals */
	ret = io_run_task_work_sig();
	if (ret || io_should_wake(iowq))
		return ret;
5357
	check_cq = READ_ONCE(ctx->check_cq);
5358
	/* let the caller flush overflows, retry */
5359
	if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
5360
		return 1;
5361 5362
	if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
		return -EBADR;
5363 5364 5365
	if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
		return -ETIME;
	return 1;
5366 5367
}

/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
			  const sigset_t __user *sig, size_t sigsz,
			  struct __kernel_timespec __user *uts)
{
	struct io_wait_queue iowq;
	struct io_rings *rings = ctx->rings;
	ktime_t timeout = KTIME_MAX;
	int ret;

	do {
		io_cqring_overflow_flush(ctx);
		if (io_cqring_events(ctx) >= min_events)
			return 0;
		if (!io_run_task_work())
			break;
	} while (1);

	if (sig) {
#ifdef CONFIG_COMPAT
		if (in_compat_syscall())
			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
						      sigsz);
		else
#endif
			ret = set_user_sigmask(sig, sigsz);

		if (ret)
			return ret;
	}

	if (uts) {
		struct timespec64 ts;

		if (get_timespec64(&ts, uts))
			return -EFAULT;
		timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
	}

	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
	iowq.wq.private = current;
	INIT_LIST_HEAD(&iowq.wq.entry);
	iowq.ctx = ctx;
	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;

	trace_io_uring_cqring_wait(ctx, min_events);
	do {
		/* if we can't even flush overflow, don't wait for more */
		if (!io_cqring_overflow_flush(ctx)) {
			ret = -EBUSY;
			break;
		}
		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
						TASK_INTERRUPTIBLE);
		ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
		cond_resched();
	} while (ret > 0);

	finish_wait(&ctx->cq_wait, &iowq.wq);
	restore_saved_sigmask_unless(ret == -EINTR);

	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
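
/*
 * Userspace counterpart as a hedged sketch, using liburing helpers
 * (liburing names, assumed from that library rather than this file):
 *
 *	struct io_uring_cqe *cqe;
 *	struct __kernel_timespec ts = { .tv_sec = 1 };
 *	int ret;
 *
 *	// waits for at least one CQE, a timeout or a signal, mirroring
 *	// the min_events/uts/sig arguments handled above
 *	ret = io_uring_wait_cqes(&ring, &cqe, 1, &ts, NULL);
 *	if (!ret)
 *		io_uring_cqe_seen(&ring, cqe);
 */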

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}
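
/*
 * Example of the chunking above (sketch, made-up size, 4K pages): a
 * 10000-byte table becomes DIV_ROUND_UP(10000, 4096) == 3 allocations
 * of 4096, 4096 and 1808 bytes, so no single kzalloc() exceeds
 * PAGE_SIZE.
 */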

static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
	percpu_ref_exit(&ref_node->refs);
	kfree(ref_node);
}

static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
	unsigned long flags;
	bool first_add = false;
	unsigned long delay = HZ;

	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
	node->done = true;

	/* if we are mid-quiesce then do not delay */
	if (node->rsrc_data->quiesce)
		delay = 0;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					    struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (!node->done)
			break;
		list_del(&node->node);
		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
	}
	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);

	if (first_add)
		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}

static struct io_rsrc_node *io_rsrc_node_alloc(void)
{
	struct io_rsrc_node *ref_node;

	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
	if (!ref_node)
		return NULL;

	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
			    0, GFP_KERNEL)) {
		kfree(ref_node);
		return NULL;
	}
	INIT_LIST_HEAD(&ref_node->node);
	INIT_LIST_HEAD(&ref_node->rsrc_list);
	ref_node->done = false;
	return ref_node;
}

5523 5524
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
			 struct io_rsrc_data *data_to_kill)
5525
	__must_hold(&ctx->uring_lock)
J
Jens Axboe 已提交
5526
{
P
Pavel Begunkov 已提交
5527 5528
	WARN_ON_ONCE(!ctx->rsrc_backup_node);
	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
J
Jens Axboe 已提交
5529

5530 5531
	io_rsrc_refs_drop(ctx);

P
Pavel Begunkov 已提交
5532 5533
	if (data_to_kill) {
		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
5534

P
Pavel Begunkov 已提交
5535
		rsrc_node->rsrc_data = data_to_kill;
5536
		spin_lock_irq(&ctx->rsrc_ref_lock);
P
Pavel Begunkov 已提交
5537
		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
5538
		spin_unlock_irq(&ctx->rsrc_ref_lock);
5539

5540
		atomic_inc(&data_to_kill->refs);
P
Pavel Begunkov 已提交
5541 5542 5543
		percpu_ref_kill(&rsrc_node->refs);
		ctx->rsrc_node = NULL;
	}
J
Jens Axboe 已提交
5544

P
Pavel Begunkov 已提交
5545 5546 5547 5548
	if (!ctx->rsrc_node) {
		ctx->rsrc_node = ctx->rsrc_backup_node;
		ctx->rsrc_backup_node = NULL;
	}
5549 5550
}

5551
int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
5552 5553 5554
{
	if (ctx->rsrc_backup_node)
		return 0;
5555
	ctx->rsrc_backup_node = io_rsrc_node_alloc();
5556
	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
5557 5558
}

P
Pavel Begunkov 已提交
5559 5560
static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
5561 5562
{
	int ret;
5563

5564
	/* As we may drop ->uring_lock, other task may have started quiesce */
5565 5566
	if (data->quiesce)
		return -ENXIO;
5567

5568
	data->quiesce = true;
5569
	do {
P
Pavel Begunkov 已提交
5570
		ret = io_rsrc_node_switch_start(ctx);
5571
		if (ret)
5572
			break;
P
Pavel Begunkov 已提交
5573
		io_rsrc_node_switch(ctx, data);
5574

5575 5576 5577
		/* kill initial ref, already quiesced if zero */
		if (atomic_dec_and_test(&data->refs))
			break;
5578
		mutex_unlock(&ctx->uring_lock);
5579
		flush_delayed_work(&ctx->rsrc_put_work);
5580
		ret = wait_for_completion_interruptible(&data->done);
5581 5582
		if (!ret) {
			mutex_lock(&ctx->uring_lock);
5583 5584 5585 5586 5587 5588 5589 5590 5591
			if (atomic_read(&data->refs) > 0) {
				/*
				 * it has been revived by another thread while
				 * we were unlocked
				 */
				mutex_unlock(&ctx->uring_lock);
			} else {
				break;
			}
5592
		}
5593

5594 5595 5596
		atomic_inc(&data->refs);
		/* wait for all works potentially completing data->done */
		flush_delayed_work(&ctx->rsrc_put_work);
5597
		reinit_completion(&data->done);
5598

5599
		ret = io_run_task_work_sig();
5600
		mutex_lock(&ctx->uring_lock);
5601
	} while (ret >= 0);
5602
	data->quiesce = false;
5603

5604
	return ret;
5605 5606
}

5607 5608 5609 5610 5611 5612 5613 5614
static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
{
	unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
	unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;

	return &data->tags[table_idx][off];
}
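
/*
 * Worked example of the index split above, assuming
 * IO_RSRC_TAG_TABLE_SHIFT == 9 (one 4K page worth of u64 tags per
 * table entry): idx == 1000 gives table_idx == 1000 >> 9 == 1 and
 * off == 1000 & 511 == 488, i.e. the tag lives in tags[1][488].
 */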

5615
static void io_rsrc_data_free(struct io_rsrc_data *data)
5616
{
5617 5618 5619 5620
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
5621 5622 5623
	kfree(data);
}

P
Pavel Begunkov 已提交
5624 5625 5626
static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
				     u64 __user *utags, unsigned nr,
				     struct io_rsrc_data **pdata)
5627
{
5628
	struct io_rsrc_data *data;
5629
	int ret = -ENOMEM;
5630
	unsigned i;
5631 5632 5633

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
5634
		return -ENOMEM;
5635
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
5636
	if (!data->tags) {
5637
		kfree(data);
5638 5639
		return -ENOMEM;
	}
5640 5641 5642 5643

	data->nr = nr;
	data->ctx = ctx;
	data->do_put = do_put;
5644
	if (utags) {
5645
		ret = -EFAULT;
5646
		for (i = 0; i < nr; i++) {
5647 5648 5649 5650
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
5651
				goto fail;
5652
		}
5653
	}
5654

5655
	atomic_set(&data->refs, 1);
5656
	init_completion(&data->done);
5657 5658
	*pdata = data;
	return 0;
5659 5660 5661
fail:
	io_rsrc_data_free(data);
	return ret;
5662 5663
}

5664
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
5665
{
5666
#if !defined(IO_URING_SCM_ALL)
5667 5668 5669 5670 5671
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(ctx, i);

5672 5673 5674
		if (!file)
			continue;
		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
5675
			continue;
5676
		io_file_bitmap_clear(&ctx->file_table, i);
5677 5678
		fput(file);
	}
5679
#endif
5680

5681 5682 5683 5684 5685 5686 5687 5688 5689
#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
5690
	io_free_file_tables(&ctx->file_table);
5691
	io_rsrc_data_free(ctx->file_data);
5692 5693
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
5694 5695
}

5696 5697
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
5698
	unsigned nr = ctx->nr_user_files;
5699 5700
	int ret;

5701
	if (!ctx->file_data)
5702
		return -ENXIO;
5703 5704 5705 5706 5707 5708

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests from using the table.
	 */
	ctx->nr_user_files = 0;
5709
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
5710
	ctx->nr_user_files = nr;
5711 5712 5713
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
J
Jens Axboe 已提交
5714 5715 5716 5717 5718
}

/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
5719 5720 5721
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
J
Jens Axboe 已提交
5722
 */
5723
static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
J
Jens Axboe 已提交
5724
{
5725
#if defined(CONFIG_UNIX)
J
Jens Axboe 已提交
5726
	struct sock *sk = ctx->ring_sock->sk;
5727
	struct sk_buff_head *head = &sk->sk_receive_queue;
J
Jens Axboe 已提交
5728 5729 5730
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

5731 5732
	if (likely(!io_file_need_scm(file)))
		return 0;
J
Jens Axboe 已提交
5733

5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745
	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);
J
Jens Axboe 已提交
5746 5747

	if (!skb) {
5748 5749 5750
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;
J
Jens Axboe 已提交
5751

5752 5753 5754 5755 5756
		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}
J
Jens Axboe 已提交
5757

5758 5759 5760
		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;
5761

5762 5763 5764 5765
		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
J
Jens Axboe 已提交
5766 5767
	}

5768 5769 5770 5771
	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
5772
	fput(file);
5773
#endif
J
Jens Axboe 已提交
5774 5775 5776
	return 0;
}

5777
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
5778
{
5779
	struct file *file = prsrc->file;
5780 5781 5782 5783 5784 5785
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

5786 5787 5788 5789 5790
	if (!io_file_need_scm(file)) {
		fput(file);
		return;
	}

5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844
	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
						left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}

5845
static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
5846
{
5847
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
5848 5849
	struct io_ring_ctx *ctx = rsrc_data->ctx;
	struct io_rsrc_put *prsrc, *tmp;
5850

5851 5852
	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
		list_del(&prsrc->list);
5853 5854

		if (prsrc->tag) {
5855 5856
			if (ctx->flags & IORING_SETUP_IOPOLL)
				mutex_lock(&ctx->uring_lock);
5857

5858
			spin_lock(&ctx->completion_lock);
5859
			io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
5860
			io_commit_cqring(ctx);
5861
			spin_unlock(&ctx->completion_lock);
5862
			io_cqring_ev_posted(ctx);
5863 5864 5865

			if (ctx->flags & IORING_SETUP_IOPOLL)
				mutex_unlock(&ctx->uring_lock);
5866 5867
		}

5868
		rsrc_data->do_put(ctx, prsrc);
5869
		kfree(prsrc);
5870
	}
5871

5872
	io_rsrc_node_destroy(ref_node);
5873 5874
	if (atomic_dec_and_test(&rsrc_data->refs))
		complete(&rsrc_data->done);
5875
}
5876

5877
static void io_rsrc_put_work(struct work_struct *work)
5878 5879 5880 5881
{
	struct io_ring_ctx *ctx;
	struct llist_node *node;

5882 5883
	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
	node = llist_del_all(&ctx->rsrc_put_llist);
5884 5885

	while (node) {
5886
		struct io_rsrc_node *ref_node;
5887 5888
		struct llist_node *next = node->next;

5889
		ref_node = llist_entry(node, struct io_rsrc_node, llist);
5890
		__io_rsrc_put_work(ref_node);
5891 5892 5893 5894
		node = next;
	}
}

J
Jens Axboe 已提交
5895
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
5896
				 unsigned nr_args, u64 __user *tags)
J
Jens Axboe 已提交
5897 5898
{
	__s32 __user *fds = (__s32 __user *) arg;
5899
	struct file *file;
5900
	int fd, ret;
5901
	unsigned i;
J
Jens Axboe 已提交
5902

5903
	if (ctx->file_data)
J
Jens Axboe 已提交
5904 5905 5906 5907 5908
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
5909 5910
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
P
Pavel Begunkov 已提交
5911
	ret = io_rsrc_node_switch_start(ctx);
5912 5913
	if (ret)
		return ret;
5914 5915 5916 5917
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;
J
Jens Axboe 已提交
5918

5919 5920 5921 5922 5923
	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}
5924

5925
	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
5926 5927
		struct io_fixed_file *file_slot;

5928
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
5929
			ret = -EFAULT;
5930
			goto fail;
5931
		}
5932
		/* allow sparse sets */
5933
		if (!fds || fd == -1) {
5934
			ret = -EINVAL;
5935
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
5936
				goto fail;
5937
			continue;
5938
		}
J
Jens Axboe 已提交
5939

5940
		file = fget(fd);
J
Jens Axboe 已提交
5941
		ret = -EBADF;
5942
		if (unlikely(!file))
5943
			goto fail;
5944

J
Jens Axboe 已提交
5945 5946 5947 5948 5949 5950 5951
		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
5952
		if (io_is_uring_fops(file)) {
5953
			fput(file);
5954
			goto fail;
J
Jens Axboe 已提交
5955
		}
5956
		ret = io_scm_file_account(ctx, file);
5957
		if (ret) {
5958
			fput(file);
5959
			goto fail;
5960
		}
5961 5962
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
5963
		io_file_bitmap_set(&ctx->file_table, i);
5964 5965
	}

P
Pavel Begunkov 已提交
5966
	io_rsrc_node_switch(ctx, NULL);
5967
	return 0;
5968 5969
fail:
	__io_sqe_files_unregister(ctx);
J
Jens Axboe 已提交
5970
	return ret;
5971 5972
}
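
/*
 * Hedged userspace sketch (liburing names, not kernel code): registering
 * a sparse fixed file set, where -1 entries match the "allow sparse
 * sets" branch above and can be populated later:
 *
 *	int fds[4] = { open("data", O_RDONLY), -1, -1, -1 };
 *
 *	io_uring_register_files(&ring, fds, 4);
 *	// slot 0 is now usable via IOSQE_FIXED_FILE
 */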

5973 5974
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
			  struct io_rsrc_node *node, void *rsrc)
5975
{
5976
	u64 *tag_slot = io_get_tag_slot(data, idx);
5977 5978 5979 5980 5981 5982
	struct io_rsrc_put *prsrc;

	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
	if (!prsrc)
		return -ENOMEM;

5983 5984
	prsrc->tag = *tag_slot;
	*tag_slot = 0;
5985 5986 5987 5988 5989
	prsrc->rsrc = rsrc;
	list_add(&prsrc->list, &node->rsrc_list);
	return 0;
}

5990 5991
int io_install_fixed_file(struct io_kiocb *req, struct file *file,
			  unsigned int issue_flags, u32 slot_index)
5992
	__must_hold(&req->ctx->uring_lock)
5993 5994
{
	struct io_ring_ctx *ctx = req->ctx;
5995
	bool needs_switch = false;
5996
	struct io_fixed_file *file_slot;
5997
	int ret;
5998

5999
	if (io_is_uring_fops(file))
6000
		return -EBADF;
6001
	if (!ctx->file_data)
6002
		return -ENXIO;
6003
	if (slot_index >= ctx->nr_user_files)
6004
		return -EINVAL;
6005 6006 6007

	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
	file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021

	if (file_slot->file_ptr) {
		struct file *old_file;

		ret = io_rsrc_node_switch_start(ctx);
		if (ret)
			goto err;

		old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
		ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
					    ctx->rsrc_node, old_file);
		if (ret)
			goto err;
		file_slot->file_ptr = 0;
6022
		io_file_bitmap_clear(&ctx->file_table, slot_index);
6023 6024
		needs_switch = true;
	}
6025

6026
	ret = io_scm_file_account(ctx, file);
6027 6028 6029
	if (!ret) {
		*io_get_tag_slot(ctx->file_data, slot_index) = 0;
		io_fixed_file_set(file_slot, file);
6030
		io_file_bitmap_set(&ctx->file_table, slot_index);
6031 6032
	}
err:
6033 6034
	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->file_data);
6035 6036 6037 6038 6039
	if (ret)
		fput(file);
	return ret;
}

6040
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
6041
				 struct io_uring_rsrc_update2 *up,
6042 6043
				 unsigned nr_args)
{
6044
	u64 __user *tags = u64_to_user_ptr(up->tags);
6045
	__s32 __user *fds = u64_to_user_ptr(up->data);
6046
	struct io_rsrc_data *data = ctx->file_data;
6047 6048
	struct io_fixed_file *file_slot;
	struct file *file;
6049 6050
	int fd, i, err = 0;
	unsigned int done;
6051
	bool needs_switch = false;
6052

6053 6054 6055
	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
6056 6057
		return -EINVAL;

6058
	for (done = 0; done < nr_args; done++) {
6059 6060 6061 6062
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
6063 6064 6065
			err = -EFAULT;
			break;
		}
6066 6067 6068 6069
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
6070 6071 6072
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

6073
		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
6074
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
6075

6076 6077
		if (file_slot->file_ptr) {
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
6078
			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
6079 6080
			if (err)
				break;
6081
			file_slot->file_ptr = 0;
6082
			io_file_bitmap_clear(&ctx->file_table, i);
6083
			needs_switch = true;
6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
6099
			if (io_is_uring_fops(file)) {
6100 6101 6102 6103
				fput(file);
				err = -EBADF;
				break;
			}
6104
			err = io_scm_file_account(ctx, file);
6105 6106
			if (err) {
				fput(file);
6107
				break;
6108
			}
6109 6110
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
6111
			io_file_bitmap_set(&ctx->file_table, i);
6112
		}
6113 6114
	}

P
Pavel Begunkov 已提交
6115 6116
	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
6117 6118
	return done ? done : err;
}
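
/*
 * Hedged sketch of driving this update path from userspace (liburing
 * helper, assumed from that library rather than this file):
 *
 *	int new_fd = open("replacement", O_RDONLY);
 *
 *	// replace slot 2; passing -1 clears a slot, and
 *	// IORING_REGISTER_FILES_SKIP leaves it untouched
 *	io_uring_register_files_update(&ring, 2, &new_fd, 1);
 */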
6119

6120 6121
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
					struct task_struct *task)
6122
{
6123
	struct io_wq_hash *hash;
6124 6125 6126
	struct io_wq_data data;
	unsigned int concurrency;

6127
	mutex_lock(&ctx->uring_lock);
6128 6129 6130
	hash = ctx->hash_map;
	if (!hash) {
		hash = kzalloc(sizeof(*hash), GFP_KERNEL);
6131 6132
		if (!hash) {
			mutex_unlock(&ctx->uring_lock);
6133
			return ERR_PTR(-ENOMEM);
6134
		}
6135 6136 6137
		refcount_set(&hash->refs, 1);
		init_waitqueue_head(&hash->wait);
		ctx->hash_map = hash;
6138
	}
6139
	mutex_unlock(&ctx->uring_lock);
6140

6141
	data.hash = hash;
6142
	data.task = task;
P
Pavel Begunkov 已提交
6143
	data.free_work = io_wq_free_work;
6144
	data.do_work = io_wq_submit_work;
6145

J
Jens Axboe 已提交
6146 6147
	/* Do QD, or 4 * CPUS, whatever is smallest */
	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
6148

6149
	return io_wq_create(concurrency, &data);
6150 6151
}

6152 6153
__cold int io_uring_alloc_task_context(struct task_struct *task,
				       struct io_ring_ctx *ctx)
6154 6155
{
	struct io_uring_task *tctx;
6156
	int ret;
6157

6158
	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
6159 6160 6161
	if (unlikely(!tctx))
		return -ENOMEM;

6162 6163 6164 6165 6166 6167 6168
	tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
					 sizeof(struct file *), GFP_KERNEL);
	if (unlikely(!tctx->registered_rings)) {
		kfree(tctx);
		return -ENOMEM;
	}

6169 6170
	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
	if (unlikely(ret)) {
6171
		kfree(tctx->registered_rings);
6172 6173 6174 6175
		kfree(tctx);
		return ret;
	}

6176
	tctx->io_wq = io_init_wq_offload(ctx, task);
6177 6178 6179
	if (IS_ERR(tctx->io_wq)) {
		ret = PTR_ERR(tctx->io_wq);
		percpu_counter_destroy(&tctx->inflight);
6180
		kfree(tctx->registered_rings);
6181 6182 6183 6184
		kfree(tctx);
		return ret;
	}

6185 6186
	xa_init(&tctx->xa);
	init_waitqueue_head(&tctx->wait);
6187
	atomic_set(&tctx->in_idle, 0);
6188
	atomic_set(&tctx->inflight_tracked, 0);
6189
	task->io_uring = tctx;
6190 6191
	spin_lock_init(&tctx->task_lock);
	INIT_WQ_LIST(&tctx->task_list);
6192
	INIT_WQ_LIST(&tctx->prio_task_list);
6193
	init_task_work(&tctx->task_work, tctx_task_work);
6194 6195 6196 6197 6198 6199 6200 6201
	return 0;
}

void __io_uring_free(struct task_struct *tsk)
{
	struct io_uring_task *tctx = tsk->io_uring;

	WARN_ON_ONCE(!xa_empty(&tctx->xa));
6202
	WARN_ON_ONCE(tctx->io_wq);
6203
	WARN_ON_ONCE(tctx->cached_refs);
6204

6205
	kfree(tctx->registered_rings);
6206
	percpu_counter_destroy(&tctx->inflight);
6207 6208 6209 6210
	kfree(tctx);
	tsk->io_uring = NULL;
}

6211 6212
static inline void __io_unaccount_mem(struct user_struct *user,
				      unsigned long nr_pages)
J
Jens Axboe 已提交
6213 6214 6215 6216
{
	atomic_long_sub(nr_pages, &user->locked_vm);
}

6217 6218
static inline int __io_account_mem(struct user_struct *user,
				   unsigned long nr_pages)
J
Jens Axboe 已提交
6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235
{
	unsigned long page_limit, cur_pages, new_pages;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	do {
		cur_pages = atomic_long_read(&user->locked_vm);
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
					new_pages) != cur_pages);

	return 0;
}
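
/*
 * Worked example (made-up numbers): with RLIMIT_MEMLOCK at 64K and 4K
 * pages, page_limit is 16.  If locked_vm already holds 10 pages,
 * accounting 8 more fails with -ENOMEM (18 > 16), while accounting 6
 * more succeeds and moves locked_vm to exactly the limit.
 */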

6236
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
6237
{
J
Jens Axboe 已提交
6238
	if (ctx->user)
6239
		__io_unaccount_mem(ctx->user, nr_pages);
6240

6241 6242
	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
6243 6244
}

6245
static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
6246
{
6247 6248
	int ret;

J
Jens Axboe 已提交
6249
	if (ctx->user) {
6250 6251 6252 6253 6254
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

6255 6256
	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
6257 6258 6259 6260

	return 0;
}

J
Jens Axboe 已提交
6261 6262
static void io_mem_free(void *ptr)
{
6263 6264 6265 6266
	struct page *page;

	if (!ptr)
		return;
J
Jens Axboe 已提交
6267

6268
	page = virt_to_head_page(ptr);
J
Jens Axboe 已提交
6269 6270 6271 6272 6273 6274
	if (put_page_testzero(page))
		free_compound_page(page);
}

static void *io_mem_alloc(size_t size)
{
6275
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
J
Jens Axboe 已提交
6276

6277
	return (void *) __get_free_pages(gfp, get_order(size));
J
Jens Axboe 已提交
6278 6279
}

6280 6281
static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
				unsigned int cq_entries, size_t *sq_offset)
6282 6283 6284 6285 6286 6287 6288
{
	struct io_rings *rings;
	size_t off, sq_array_size;

	off = struct_size(rings, cqes, cq_entries);
	if (off == SIZE_MAX)
		return SIZE_MAX;
6289 6290 6291 6292
	if (ctx->flags & IORING_SETUP_CQE32) {
		if (check_shl_overflow(off, 1, &off))
			return SIZE_MAX;
	}
6293 6294 6295 6296 6297 6298 6299

#ifdef CONFIG_SMP
	off = ALIGN(off, SMP_CACHE_BYTES);
	if (off == 0)
		return SIZE_MAX;
#endif

6300 6301 6302
	if (sq_offset)
		*sq_offset = off;

6303 6304 6305 6306 6307 6308 6309 6310 6311 6312
	sq_array_size = array_size(sizeof(u32), sq_entries);
	if (sq_array_size == SIZE_MAX)
		return SIZE_MAX;

	if (check_add_overflow(off, sq_array_size, &off))
		return SIZE_MAX;

	return off;
}
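
/*
 * Worked example (sketch, no IORING_SETUP_CQE32): for cq_entries == 16
 * and sq_entries == 8, off starts as struct_size(rings, cqes, 16), is
 * rounded up to SMP_CACHE_BYTES on SMP, and *sq_offset records where
 * the 8 * sizeof(u32) SQ index array begins; the sum of both parts is
 * the total allocation size returned.
 */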

6313
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
6314
{
6315
	struct io_mapped_ubuf *imu = *slot;
6316 6317
	unsigned int i;

6318 6319 6320 6321 6322 6323 6324
	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
6325
	*slot = NULL;
6326 6327
}

6328
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
6329
{
6330 6331
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
6332
}
6333

6334 6335 6336
static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;
6337

6338 6339
	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
6340
	kfree(ctx->user_bufs);
6341
	io_rsrc_data_free(ctx->buf_data);
6342
	ctx->user_bufs = NULL;
6343
	ctx->buf_data = NULL;
6344
	ctx->nr_user_bufs = 0;
6345 6346
}

6347
static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
6348
{
6349
	unsigned nr = ctx->nr_user_bufs;
6350
	int ret;
6351

6352
	if (!ctx->buf_data)
6353 6354
		return -ENXIO;

6355 6356 6357 6358 6359
	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests from using the table.
	 */
	ctx->nr_user_bufs = 0;
6360
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
6361
	ctx->nr_user_bufs = nr;
6362 6363 6364
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380
}

static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

6381
		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
6382 6383 6384 6385 6386 6387 6388 6389 6390 6391
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415
/*
 * Not super efficient, but this is only done at registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
6416
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

6435
	imu->acct_pages = 0;
6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

6455
	ret = io_account_mem(ctx, imu->acct_pages);
6456 6457 6458 6459 6460
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
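
/*
 * Example of the accounting rule above (sketch): pinning 512 4K chunks
 * that all sit in one 2M huge page accounts the full huge page once,
 * via page_size(hpage) >> PAGE_SHIFT == 512, and
 * headpage_already_acct() prevents a second buffer in the same huge
 * page from being charged again.
 */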

J
Jens Axboe 已提交
6461 6462
static struct page **io_pin_pages(unsigned long ubuf, unsigned long len,
				  int *npages)
6463
{
J
Jens Axboe 已提交
6464
	unsigned long start, end, nr_pages;
6465 6466
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
J
Jens Axboe 已提交
6467
	int i, pret, ret = -ENOMEM;
6468

J
Jens Axboe 已提交
6469
	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;
6481

6482 6483 6484 6485 6486 6487 6488 6489 6490
	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			struct vm_area_struct *vma = vmas[i];

6491 6492
			if (vma_is_shmem(vma))
				continue;
6493 6494 6495 6496 6497 6498
			if (vma->vm_file &&
			    !is_file_hugepages(vma->vm_file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
J
Jens Axboe 已提交
6499
		*npages = nr_pages;
6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
J
Jens Axboe 已提交
6513 6514 6515 6516 6517 6518 6519 6520 6521
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}
6522

J
Jens Axboe 已提交
6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;

	if (!iov->iov_base) {
		*pimu = ctx->dummy_ubuf;
		return 0;
	}

	*pimu = NULL;
	ret = -ENOMEM;

	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
				&nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;
6552

J
Jens Axboe 已提交
6553
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
6554
	if (ret) {
J
Jens Axboe 已提交
6555
		unpin_user_pages(pages, nr_pages);
6556 6557 6558
		goto done;
	}

J
Jens Axboe 已提交
6559
	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571
	size = iov->iov_len;
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		imu->bvec[i].bv_page = pages[i];
		imu->bvec[i].bv_len = vec_len;
		imu->bvec[i].bv_offset = off;
		off = 0;
		size -= vec_len;
	}
	/* store original address for later verification */
J
Jens Axboe 已提交
6572 6573
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
6574
	imu->nr_bvecs = nr_pages;
6575
	*pimu = imu;
6576 6577
	ret = 0;
done:
6578 6579
	if (ret)
		kvfree(imu);
6580 6581 6582 6583
	kvfree(pages);
	return ret;
}

6584
static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
6585
{
6586 6587
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
6588
}
6589

6590 6591
static int io_buffer_validate(struct iovec *iov)
{
6592 6593
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

6594 6595 6596 6597 6598
	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
6599 6600 6601
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
6602
		return -EFAULT;
6603

6604 6605 6606
	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;
6607

6608 6609 6610
	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

6611 6612
	return 0;
}
6613

6614
static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
6615
				   unsigned int nr_args, u64 __user *tags)
6616
{
6617 6618
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
6619 6620
	int i, ret;
	struct iovec iov;
6621

6622 6623
	if (ctx->user_bufs)
		return -EBUSY;
6624
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
6625
		return -EINVAL;
6626
	ret = io_rsrc_node_switch_start(ctx);
6627 6628
	if (ret)
		return ret;
6629 6630 6631
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
6632 6633
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
6634
		io_rsrc_data_free(data);
6635 6636
		return ret;
	}
6637

6638
	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

6650
		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
6651 6652 6653
			ret = -EINVAL;
			break;
		}
6654

6655 6656
		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
6657 6658
		if (ret)
			break;
6659
	}
6660

6661
	WARN_ON_ONCE(ctx->buf_data);
6662

6663 6664 6665 6666 6667
	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	else
		io_rsrc_node_switch(ctx, NULL);
6668 6669 6670
	return ret;
}
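
/*
 * Hedged userspace sketch (liburing names): registering one fixed
 * buffer that fixed read/write requests can later reference by index:
 *
 *	static char buf[4096];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *
 *	io_uring_register_buffers(&ring, &iov, 1);
 *	// later: io_uring_prep_read_fixed(sqe, fd, buf, sizeof(buf), 0, 0);
 */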

6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
P
Pavel Begunkov 已提交
6688 6689
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
6690 6691 6692 6693 6694 6695 6696 6697 6698
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
P
Pavel Begunkov 已提交
6699 6700 6701
		err = io_buffer_validate(&iov);
		if (err)
			break;
6702 6703 6704 6705
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
P
Pavel Begunkov 已提交
6706 6707 6708
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;
6709

P
Pavel Begunkov 已提交
6710
		i = array_index_nospec(offset, ctx->nr_user_bufs);
6711
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
6712
			err = io_queue_rsrc_removal(ctx->buf_data, i,
P
Pavel Begunkov 已提交
6713 6714 6715
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
6716
				break;
P
Pavel Begunkov 已提交
6717
			}
6718 6719 6720 6721
			ctx->user_bufs[i] = NULL;
			needs_switch = true;
		}

P
Pavel Begunkov 已提交
6722
		ctx->user_bufs[i] = imu;
6723
		*io_get_tag_slot(ctx->buf_data, offset) = tag;
6724 6725 6726 6727 6728 6729 6730
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}

6731 6732
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
6733
{
6734
	struct io_ev_fd *ev_fd;
6735
	__s32 __user *fds = arg;
6736
	int fd;
6737

6738 6739 6740
	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
6741 6742 6743 6744 6745
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

6746 6747 6748
	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;
6749

6750 6751
	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
6752
		int ret = PTR_ERR(ev_fd->cq_ev_fd);
6753
		kfree(ev_fd);
6754 6755
		return ret;
	}
6756
	ev_fd->eventfd_async = eventfd_async;
6757
	ctx->has_evfd = true;
6758
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
6759
	return 0;
6760 6761 6762 6763 6764 6765 6766 6767
}
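
/*
 * Hedged sketch of the userspace side (liburing helpers assumed):
 *
 *	int efd = eventfd(0, EFD_CLOEXEC);
 *
 *	io_uring_register_eventfd(&ring, efd);
 *	// or io_uring_register_eventfd_async(&ring, efd), which sets the
 *	// eventfd_async flag stored above so that only completions
 *	// generated from async context trigger a notification
 */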

static void io_eventfd_put(struct rcu_head *rcu)
{
	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);

	eventfd_ctx_put(ev_fd->cq_ev_fd);
	kfree(ev_fd);
6768 6769 6770 6771
}

static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
6772 6773 6774 6775 6776
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
6777
		ctx->has_evfd = false;
6778 6779
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		call_rcu(&ev_fd->rcu, io_eventfd_put);
6780 6781 6782 6783 6784 6785
		return 0;
	}

	return -ENXIO;
}

6786 6787
static void io_destroy_buffers(struct io_ring_ctx *ctx)
{
6788 6789
	struct io_buffer_list *bl;
	unsigned long index;
6790 6791
	int i;

6792 6793 6794 6795 6796
	for (i = 0; i < BGID_ARRAY; i++) {
		if (!ctx->io_bl)
			break;
		__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
	}
6797

6798 6799 6800
	xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		__io_remove_buffers(ctx, bl, -1U);
6801
		kfree(bl);
6802
	}
6803 6804 6805 6806 6807 6808 6809 6810

	while (!list_empty(&ctx->io_buffers_pages)) {
		struct page *page;

		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
		list_del_init(&page->lru);
		__free_page(page);
	}
6811 6812
}

6813
static void io_req_caches_free(struct io_ring_ctx *ctx)
J
Jens Axboe 已提交
6814
{
6815
	struct io_submit_state *state = &ctx->submit_state;
6816
	int nr = 0;
P
Pavel Begunkov 已提交
6817

6818
	mutex_lock(&ctx->uring_lock);
6819
	io_flush_cached_locked_reqs(ctx, state);
6820

6821
	while (!io_req_cache_empty(ctx)) {
6822 6823
		struct io_wq_work_node *node;
		struct io_kiocb *req;
6824

6825 6826 6827
		node = wq_stack_extract(&state->free_list);
		req = container_of(node, struct io_kiocb, comp_list);
		kmem_cache_free(req_cachep, req);
6828
		nr++;
6829
	}
6830 6831
	if (nr)
		percpu_ref_put_many(&ctx->refs, nr);
6832 6833 6834
	mutex_unlock(&ctx->uring_lock);
}

6835
static void io_wait_rsrc_data(struct io_rsrc_data *data)
J
Jens Axboe 已提交
6836
{
6837
	if (data && !atomic_dec_and_test(&data->refs))
6838 6839
		wait_for_completion(&data->done);
}
6840

6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852
static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
{
	struct async_poll *apoll;

	while (!list_empty(&ctx->apoll_cache)) {
		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
						poll.wait.entry);
		list_del(&apoll->poll.wait.entry);
		kfree(apoll);
	}
}

P
Pavel Begunkov 已提交
6853
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
J
Jens Axboe 已提交
6854
{
6855
	io_sq_thread_finish(ctx);
6856

6857
	if (ctx->mm_account) {
6858 6859
		mmdrop(ctx->mm_account);
		ctx->mm_account = NULL;
6860
	}
J
Jens Axboe 已提交
6861

6862
	io_rsrc_refs_drop(ctx);
6863 6864 6865 6866
	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
	io_wait_rsrc_data(ctx->buf_data);
	io_wait_rsrc_data(ctx->file_data);

6867
	mutex_lock(&ctx->uring_lock);
6868
	if (ctx->buf_data)
6869
		__io_sqe_buffers_unregister(ctx);
6870
	if (ctx->file_data)
6871
		__io_sqe_files_unregister(ctx);
6872 6873
	if (ctx->rings)
		__io_cqring_overflow_flush(ctx, true);
6874
	io_eventfd_unregister(ctx);
6875
	io_flush_apoll_cache(ctx);
6876
	mutex_unlock(&ctx->uring_lock);
6877
	io_destroy_buffers(ctx);
6878 6879
	if (ctx->sq_creds)
		put_cred(ctx->sq_creds);
J
Jens Axboe 已提交
6880

P
Pavel Begunkov 已提交
6881 6882 6883
	/* there are no registered resources left, nobody uses it */
	if (ctx->rsrc_node)
		io_rsrc_node_destroy(ctx->rsrc_node);
6884
	if (ctx->rsrc_backup_node)
6885
		io_rsrc_node_destroy(ctx->rsrc_backup_node);
P
Pavel Begunkov 已提交
6886
	flush_delayed_work(&ctx->rsrc_put_work);
6887
	flush_delayed_work(&ctx->fallback_work);
P
Pavel Begunkov 已提交
6888 6889 6890

	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
	WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
J
Jens Axboe 已提交
6891

J
Jens Axboe 已提交
6892
#if defined(CONFIG_UNIX)
6893 6894
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
J
Jens Axboe 已提交
6895
		sock_release(ctx->ring_sock);
6896
	}
J
Jens Axboe 已提交
6897
#endif
6898
	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
J
Jens Axboe 已提交
6899

6900
	io_mem_free(ctx->rings);
J
Jens Axboe 已提交
6901 6902 6903 6904
	io_mem_free(ctx->sq_sqes);

	percpu_ref_exit(&ctx->refs);
	free_uid(ctx->user);
6905
	io_req_caches_free(ctx);
6906 6907
	if (ctx->hash_map)
		io_wq_put_hash(ctx->hash_map);
6908
	kfree(ctx->cancel_hash);
6909
	kfree(ctx->dummy_ubuf);
6910 6911
	kfree(ctx->io_bl);
	xa_destroy(&ctx->io_bl_xa);
J
Jens Axboe 已提交
6912 6913 6914 6915 6916 6917 6918 6919
	kfree(ctx);
}

static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
	struct io_ring_ctx *ctx = file->private_data;
	__poll_t mask = 0;

6920
	poll_wait(file, &ctx->cq_wait, wait);
6921 6922 6923 6924
	/*
	 * synchronizes with barrier from wq_has_sleeper call in
	 * io_commit_cqring
	 */
J
Jens Axboe 已提交
6925
	smp_rmb();
6926
	if (!io_sqring_full(ctx))
J
Jens Axboe 已提交
6927
		mask |= EPOLLOUT | EPOLLWRNORM;
6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941

	/*
	 * Don't flush cqring overflow list here, just do a simple check.
	 * Otherwise there could possibly be an ABBA deadlock:
	 *      CPU0                    CPU1
	 *      ----                    ----
	 * lock(&ctx->uring_lock);
	 *                              lock(&ep->mtx);
	 *                              lock(&ctx->uring_lock);
	 * lock(&ep->mtx);
	 *
	 * Users may get EPOLLIN while seeing nothing in the cqring, which
	 * pushes them to do the flush.
	 */
6942 6943
	if (io_cqring_events(ctx) ||
	    test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
J
Jens Axboe 已提交
6944 6945 6946 6947 6948
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}

6949
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
6950
{
J
Jens Axboe 已提交
6951
	const struct cred *creds;
6952

6953
	creds = xa_erase(&ctx->personalities, id);
J
Jens Axboe 已提交
6954 6955
	if (creds) {
		put_cred(creds);
6956
		return 0;
J
Jens Axboe 已提交
6957
	}
6958 6959 6960 6961

	return -EINVAL;
}

6962 6963 6964
struct io_tctx_exit {
	struct callback_head		task_work;
	struct completion		completion;
6965
	struct io_ring_ctx		*ctx;
6966 6967
};

P
Pavel Begunkov 已提交
6968
static __cold void io_tctx_exit_cb(struct callback_head *cb)
6969 6970 6971 6972 6973 6974 6975 6976 6977 6978
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_exit *work;

	work = container_of(cb, struct io_tctx_exit, task_work);
	/*
	 * When @in_idle, we're in cancellation and it's racy to remove the
	 * node. It'll be removed by the end of cancellation, just ignore it.
	 */
	if (!atomic_read(&tctx->in_idle))
6979
		io_uring_del_tctx_node((unsigned long)work->ctx);
6980 6981 6982
	complete(&work->completion);
}

P
Pavel Begunkov 已提交
6983
static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
6984 6985 6986 6987 6988 6989
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	return req->ctx == data;
}

P
Pavel Begunkov 已提交
6990
static __cold void io_ring_exit_work(struct work_struct *work)
6991
{
6992
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
6993
	unsigned long timeout = jiffies + HZ * 60 * 5;
6994
	unsigned long interval = HZ / 20;
6995 6996 6997
	struct io_tctx_exit exit;
	struct io_tctx_node *node;
	int ret;
6998

6999 7000 7001 7002 7003 7004
	/*
	 * If we're doing polled IO and end up having requests being
	 * submitted async (out-of-line), then completions can come in while
	 * we're waiting for refs to drop. We need to reap these manually,
	 * as nobody else will be looking for them.
	 */
7005
	do {
7006
		io_uring_try_cancel_requests(ctx, NULL, true);
7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017
		if (ctx->sq_data) {
			struct io_sq_data *sqd = ctx->sq_data;
			struct task_struct *tsk;

			io_sq_thread_park(sqd);
			tsk = sqd->thread;
			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
				io_wq_cancel_cb(tsk->io_uring->io_wq,
						io_cancel_ctx_cb, ctx, true);
			io_sq_thread_unpark(sqd);
		}
7018

7019 7020
		io_req_caches_free(ctx);

7021 7022 7023 7024 7025
		if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
			/* there is little hope left, don't run it too often */
			interval = HZ * 60;
		}
	} while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
7026

7027 7028 7029
	init_completion(&exit.completion);
	init_task_work(&exit.task_work, io_tctx_exit_cb);
	exit.ctx = ctx;
7030 7031 7032
	/*
	 * Some may use context even when all refs and requests have been put,
	 * and they are free to do so while still holding uring_lock or
7033
	 * completion_lock, see io_req_task_submit(). Apart from other work,
7034 7035
	 * this lock/unlock section also waits them to finish.
	 */
7036 7037
	mutex_lock(&ctx->uring_lock);
	while (!list_empty(&ctx->tctx_list)) {
7038 7039
		WARN_ON_ONCE(time_after(jiffies, timeout));

7040 7041
		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
					ctx_node);
7042 7043
		/* don't spin on a single task if cancellation failed */
		list_rotate_left(&ctx->tctx_list);
7044 7045 7046 7047 7048 7049 7050 7051 7052
		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
		if (WARN_ON_ONCE(ret))
			continue;

		mutex_unlock(&ctx->uring_lock);
		wait_for_completion(&exit.completion);
		mutex_lock(&ctx->uring_lock);
	}
	mutex_unlock(&ctx->uring_lock);
7053 7054
	spin_lock(&ctx->completion_lock);
	spin_unlock(&ctx->completion_lock);
7055

7056 7057 7058
	io_ring_ctx_free(ctx);
}

P
Pavel Begunkov 已提交
7059
static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
J
Jens Axboe 已提交
7060
{
7061 7062 7063
	unsigned long index;
	struct creds *creds;

J
Jens Axboe 已提交
7064 7065
	mutex_lock(&ctx->uring_lock);
	percpu_ref_kill(&ctx->refs);
7066
	if (ctx->rings)
7067
		__io_cqring_overflow_flush(ctx, true);
7068 7069
	xa_for_each(&ctx->personalities, index, creds)
		io_unregister_personality(ctx, index);
J
Jens Axboe 已提交
7070 7071
	mutex_unlock(&ctx->uring_lock);

7072 7073 7074 7075 7076 7077 7078
	/* failed during ring init, it couldn't have issued any requests */
	if (ctx->rings) {
		io_kill_timeouts(ctx, NULL, true);
		io_poll_remove_all(ctx, NULL, true);
		/* if we failed setting up the ctx, we might not have any rings */
		io_iopoll_try_reap_events(ctx);
	}
7079

7080
	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
7081 7082 7083 7084 7085 7086 7087
	/*
	 * Use system_unbound_wq to avoid spawning tons of event kworkers
	 * if we're exiting a ton of rings at the same time. It just adds
	 * noise and overhead, there's no discernable change in runtime
	 * over using system_wq.
	 */
	queue_work(system_unbound_wq, &ctx->exit_work);
J
Jens Axboe 已提交
7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098
}

static int io_uring_release(struct inode *inode, struct file *file)
{
	struct io_ring_ctx *ctx = file->private_data;

	file->private_data = NULL;
	io_ring_ctx_wait_and_kill(ctx);
	return 0;
}

7099 7100
struct io_task_cancel {
	struct task_struct *task;
7101
	bool all;
7102
};
7103

7104
static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
7105
{
7106
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7107
	struct io_task_cancel *cancel = data;
7108

7109
	return io_match_task_safe(req, cancel->task, cancel->all);
7110 7111
}

P
Pavel Begunkov 已提交
7112 7113 7114
static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all)
7115
{
7116
	struct io_defer_entry *de;
7117 7118
	LIST_HEAD(list);

7119
	spin_lock(&ctx->completion_lock);
7120
	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
7121
		if (io_match_task_safe(de->req, task, cancel_all)) {
7122 7123 7124 7125
			list_cut_position(&list, &ctx->defer_list, &de->list);
			break;
		}
	}
7126
	spin_unlock(&ctx->completion_lock);
7127 7128
	if (list_empty(&list))
		return false;
7129 7130 7131 7132

	while (!list_empty(&list)) {
		de = list_first_entry(&list, struct io_defer_entry, list);
		list_del_init(&de->list);
7133
		io_req_complete_failed(de->req, -ECANCELED);
7134 7135
		kfree(de);
	}
7136
	return true;
7137 7138
}

P
Pavel Begunkov 已提交
7139
static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162
{
	struct io_tctx_node *node;
	enum io_wq_cancel cret;
	bool ret = false;

	mutex_lock(&ctx->uring_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		/*
		 * io_wq will stay alive while we hold uring_lock, because it's
		 * killed after ctx nodes, which requires to take the lock.
		 */
		if (!tctx || !tctx->io_wq)
			continue;
		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
	}
	mutex_unlock(&ctx->uring_lock);

	return ret;
}

P
Pavel Begunkov 已提交
7163 7164 7165
static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
						struct task_struct *task,
						bool cancel_all)
7166
{
7167
	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
7168
	struct io_uring_task *tctx = task ? task->io_uring : NULL;
7169

7170 7171 7172 7173
	/* failed during ring init, it couldn't have issued any requests */
	if (!ctx->rings)
		return;

7174 7175 7176 7177
	while (1) {
		enum io_wq_cancel cret;
		bool ret = false;

7178 7179 7180 7181 7182 7183 7184
		if (!task) {
			ret |= io_uring_try_cancel_iowq(ctx);
		} else if (tctx && tctx->io_wq) {
			/*
			 * Cancels requests of all rings, not only @ctx, but
			 * it's fine as the task is in exit/exec.
			 */
7185
			cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
7186 7187 7188 7189 7190
					       &cancel, true);
			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
		}

		/* SQPOLL thread does its own polling */
7191
		if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
7192
		    (ctx->sq_data && ctx->sq_data->thread == current)) {
7193
			while (!wq_list_empty(&ctx->iopoll_list)) {
7194 7195 7196 7197 7198
				io_iopoll_try_reap_events(ctx);
				ret = true;
			}
		}

7199 7200 7201
		ret |= io_cancel_defer_files(ctx, task, cancel_all);
		ret |= io_poll_remove_all(ctx, task, cancel_all);
		ret |= io_kill_timeouts(ctx, task, cancel_all);
7202 7203
		if (task)
			ret |= io_run_task_work();
7204 7205 7206 7207 7208 7209
		if (!ret)
			break;
		cond_resched();
	}
}

7210
static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
7211
{
7212
	struct io_uring_task *tctx = current->io_uring;
7213
	struct io_tctx_node *node;
7214
	int ret;
7215 7216

	if (unlikely(!tctx)) {
7217
		ret = io_uring_alloc_task_context(current, ctx);
7218 7219
		if (unlikely(ret))
			return ret;
7220

7221
		tctx = current->io_uring;
7222 7223 7224 7225 7226 7227 7228 7229
		if (ctx->iowq_limits_set) {
			unsigned int limits[2] = { ctx->iowq_limits[0],
						   ctx->iowq_limits[1], };

			ret = io_wq_max_workers(tctx->io_wq, limits);
			if (ret)
				return ret;
		}
7230
	}
7231 7232 7233 7234 7235 7236
	if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
		node = kmalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;
		node->ctx = ctx;
		node->task = current;
7237

7238 7239 7240 7241 7242
		ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
					node, GFP_KERNEL));
		if (ret) {
			kfree(node);
			return ret;
7243
		}
7244 7245 7246 7247

		mutex_lock(&ctx->uring_lock);
		list_add(&node->ctx_node, &ctx->tctx_list);
		mutex_unlock(&ctx->uring_lock);
7248
	}
7249
	tctx->last = ctx;
7250 7251 7252
	return 0;
}

7253 7254 7255
/*
 * Note that this task has used io_uring. We use it for cancelation purposes.
 */
7256
static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
7257 7258 7259 7260 7261
{
	struct io_uring_task *tctx = current->io_uring;

	if (likely(tctx && tctx->last == ctx))
		return 0;
7262
	return __io_uring_add_tctx_node(ctx);
7263 7264
}

7265 7266 7267
/*
 * Remove this io_uring_file -> task mapping.
 */
P
Pavel Begunkov 已提交
7268
static __cold void io_uring_del_tctx_node(unsigned long index)
7269 7270
{
	struct io_uring_task *tctx = current->io_uring;
7271
	struct io_tctx_node *node;
7272

7273 7274
	if (!tctx)
		return;
7275 7276
	node = xa_erase(&tctx->xa, index);
	if (!node)
7277
		return;
7278

7279 7280 7281 7282 7283 7284 7285
	WARN_ON_ONCE(current != node->task);
	WARN_ON_ONCE(list_empty(&node->ctx_node));

	mutex_lock(&node->ctx->uring_lock);
	list_del(&node->ctx_node);
	mutex_unlock(&node->ctx->uring_lock);

7286
	if (tctx->last == node->ctx)
7287
		tctx->last = NULL;
7288
	kfree(node);
7289 7290
}

P
Pavel Begunkov 已提交
7291
static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
7292
{
7293
	struct io_wq *wq = tctx->io_wq;
7294
	struct io_tctx_node *node;
7295 7296
	unsigned long index;

7297
	xa_for_each(&tctx->xa, index, node) {
7298
		io_uring_del_tctx_node(index);
7299 7300
		cond_resched();
	}
7301 7302
	if (wq) {
		/*
7303
		 * Must be after io_uring_del_tctx_node() (removes nodes under
7304 7305
		 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
		 */
7306
		io_wq_put_and_exit(wq);
7307
		tctx->io_wq = NULL;
7308
	}
7309 7310
}

7311
static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
7312
{
7313
	if (tracked)
7314
		return atomic_read(&tctx->inflight_tracked);
7315 7316 7317
	return percpu_counter_sum(&tctx->inflight);
}

7318 7319
/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
7320
 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
7321
 */
7322
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
7323
{
7324
	struct io_uring_task *tctx = current->io_uring;
7325
	struct io_ring_ctx *ctx;
7326 7327
	s64 inflight;
	DEFINE_WAIT(wait);
7328

7329 7330
	WARN_ON_ONCE(sqd && sqd->thread != current);

7331 7332
	if (!current->io_uring)
		return;
7333 7334 7335
	if (tctx->io_wq)
		io_wq_exit_start(tctx->io_wq);

7336 7337
	atomic_inc(&tctx->in_idle);
	do {
7338
		io_uring_drop_tctx_refs(current);
7339
		/* read completions before cancelations */
7340
		inflight = tctx_inflight(tctx, !cancel_all);
7341 7342
		if (!inflight)
			break;
7343

7344 7345 7346
		if (!sqd) {
			struct io_tctx_node *node;
			unsigned long index;
7347

7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359
			xa_for_each(&tctx->xa, index, node) {
				/* sqpoll task will cancel all its requests */
				if (node->ctx->sq_data)
					continue;
				io_uring_try_cancel_requests(node->ctx, current,
							     cancel_all);
			}
		} else {
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_uring_try_cancel_requests(ctx, current,
							     cancel_all);
		}
7360

7361 7362
		prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
		io_run_task_work();
7363
		io_uring_drop_tctx_refs(current);
7364

7365
		/*
7366 7367 7368
		 * If we've seen completions, retry without waiting. This
		 * avoids a race where a completion comes in before we did
		 * prepare_to_wait().
7369
		 */
7370
		if (inflight == tctx_inflight(tctx, !cancel_all))
7371
			schedule();
7372
		finish_wait(&tctx->wait, &wait);
7373
	} while (1);
7374

P
Pavel Begunkov 已提交
7375
	io_uring_clean_tctx(tctx);
7376
	if (cancel_all) {
7377 7378 7379 7380 7381
		/*
		 * We shouldn't run task_works after cancel, so just leave
		 * ->in_idle set for normal exit.
		 */
		atomic_dec(&tctx->in_idle);
7382 7383 7384
		/* for exec all current's requests should be gone, kill tctx */
		__io_uring_free(current);
	}
7385 7386
}

7387
void __io_uring_cancel(bool cancel_all)
7388
{
7389
	io_uring_cancel_generic(cancel_all, NULL);
7390 7391
}

7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418
void io_uring_unreg_ringfd(void)
{
	struct io_uring_task *tctx = current->io_uring;
	int i;

	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
		if (tctx->registered_rings[i]) {
			fput(tctx->registered_rings[i]);
			tctx->registered_rings[i] = NULL;
		}
	}
}

static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
				     int start, int end)
{
	struct file *file;
	int offset;

	for (offset = start; offset < end; offset++) {
		offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
		if (tctx->registered_rings[offset])
			continue;

		file = fget(fd);
		if (!file) {
			return -EBADF;
7419
		} else if (!io_is_uring_fops(file)) {
7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463
			fput(file);
			return -EOPNOTSUPP;
		}
		tctx->registered_rings[offset] = file;
		return offset;
	}

	return -EBUSY;
}

/*
 * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
 * invocation. User passes in an array of struct io_uring_rsrc_update
 * with ->data set to the ring_fd, and ->offset given for the desired
 * index. If no index is desired, application may set ->offset == -1U
 * and we'll find an available index. Returns number of entries
 * successfully processed, or < 0 on error if none were processed.
 */
static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
			      unsigned nr_args)
{
	struct io_uring_rsrc_update __user *arg = __arg;
	struct io_uring_rsrc_update reg;
	struct io_uring_task *tctx;
	int ret, i;

	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
		return -EINVAL;

	mutex_unlock(&ctx->uring_lock);
	ret = io_uring_add_tctx_node(ctx);
	mutex_lock(&ctx->uring_lock);
	if (ret)
		return ret;

	tctx = current->io_uring;
	for (i = 0; i < nr_args; i++) {
		int start, end;

		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
			ret = -EFAULT;
			break;
		}

7464 7465 7466 7467 7468
		if (reg.resv) {
			ret = -EINVAL;
			break;
		}

7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514
		if (reg.offset == -1U) {
			start = 0;
			end = IO_RINGFD_REG_MAX;
		} else {
			if (reg.offset >= IO_RINGFD_REG_MAX) {
				ret = -EINVAL;
				break;
			}
			start = reg.offset;
			end = start + 1;
		}

		ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
		if (ret < 0)
			break;

		reg.offset = ret;
		if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
			fput(tctx->registered_rings[reg.offset]);
			tctx->registered_rings[reg.offset] = NULL;
			ret = -EFAULT;
			break;
		}
	}

	return i ? i : ret;
}

static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
				unsigned nr_args)
{
	struct io_uring_rsrc_update __user *arg = __arg;
	struct io_uring_task *tctx = current->io_uring;
	struct io_uring_rsrc_update reg;
	int ret = 0, i;

	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
		return -EINVAL;
	if (!tctx)
		return 0;

	for (i = 0; i < nr_args; i++) {
		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
			ret = -EFAULT;
			break;
		}
7515
		if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529
			ret = -EINVAL;
			break;
		}

		reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
		if (tctx->registered_rings[reg.offset]) {
			fput(tctx->registered_rings[reg.offset]);
			tctx->registered_rings[reg.offset] = NULL;
		}
	}

	return i ? i : ret;
}

7530 7531
static void *io_uring_validate_mmap_request(struct file *file,
					    loff_t pgoff, size_t sz)
J
Jens Axboe 已提交
7532 7533
{
	struct io_ring_ctx *ctx = file->private_data;
7534
	loff_t offset = pgoff << PAGE_SHIFT;
J
Jens Axboe 已提交
7535 7536 7537 7538 7539
	struct page *page;
	void *ptr;

	switch (offset) {
	case IORING_OFF_SQ_RING:
7540 7541
	case IORING_OFF_CQ_RING:
		ptr = ctx->rings;
J
Jens Axboe 已提交
7542 7543 7544 7545 7546
		break;
	case IORING_OFF_SQES:
		ptr = ctx->sq_sqes;
		break;
	default:
7547
		return ERR_PTR(-EINVAL);
J
Jens Axboe 已提交
7548 7549 7550
	}

	page = virt_to_head_page(ptr);
7551
	if (sz > page_size(page))
7552 7553 7554 7555 7556 7557 7558
		return ERR_PTR(-EINVAL);

	return ptr;
}

#ifdef CONFIG_MMU

P
Pavel Begunkov 已提交
7559
static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7560 7561 7562 7563 7564 7565 7566 7567
{
	size_t sz = vma->vm_end - vma->vm_start;
	unsigned long pfn;
	void *ptr;

	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);
J
Jens Axboe 已提交
7568 7569 7570 7571 7572

	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599
#else /* !CONFIG_MMU */

static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
}

static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}

static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *ptr;

	ptr = io_uring_validate_mmap_request(file, pgoff, len);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	return (unsigned long) ptr;
}

#endif /* !CONFIG_MMU */

7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612
static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
{
	if (flags & IORING_ENTER_EXT_ARG) {
		struct io_uring_getevents_arg arg;

		if (argsz != sizeof(arg))
			return -EINVAL;
		if (copy_from_user(&arg, argp, sizeof(arg)))
			return -EFAULT;
	}
	return 0;
}

7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636
static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
			  struct __kernel_timespec __user **ts,
			  const sigset_t __user **sig)
{
	struct io_uring_getevents_arg arg;

	/*
	 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
	 * is just a pointer to the sigset_t.
	 */
	if (!(flags & IORING_ENTER_EXT_ARG)) {
		*sig = (const sigset_t __user *) argp;
		*ts = NULL;
		return 0;
	}

	/*
	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
	 * timespec and sigset_t pointers if good.
	 */
	if (*argsz != sizeof(arg))
		return -EINVAL;
	if (copy_from_user(&arg, argp, sizeof(arg)))
		return -EFAULT;
7637 7638
	if (arg.pad)
		return -EINVAL;
7639 7640 7641 7642 7643 7644
	*sig = u64_to_user_ptr(arg.sigmask);
	*argsz = arg.sigmask_sz;
	*ts = u64_to_user_ptr(arg.ts);
	return 0;
}

J
Jens Axboe 已提交
7645
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
7646 7647
		u32, min_complete, u32, flags, const void __user *, argp,
		size_t, argsz)
J
Jens Axboe 已提交
7648 7649 7650
{
	struct io_ring_ctx *ctx;
	struct fd f;
7651
	long ret;
J
Jens Axboe 已提交
7652

7653
	io_run_task_work();
7654

7655
	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
7656 7657
			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
			       IORING_ENTER_REGISTERED_RING)))
J
Jens Axboe 已提交
7658 7659
		return -EINVAL;

7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670
	/*
	 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
	 * need only dereference our task private array to find it.
	 */
	if (flags & IORING_ENTER_REGISTERED_RING) {
		struct io_uring_task *tctx = current->io_uring;

		if (!tctx || fd >= IO_RINGFD_REG_MAX)
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		f.file = tctx->registered_rings[fd];
7671
		f.flags = 0;
7672 7673 7674
	} else {
		f = fdget(fd);
	}
J
Jens Axboe 已提交
7675

7676 7677 7678
	if (unlikely(!f.file))
		return -EBADF;

J
Jens Axboe 已提交
7679
	ret = -EOPNOTSUPP;
7680
	if (unlikely(!io_is_uring_fops(f.file)))
J
Jens Axboe 已提交
7681 7682 7683 7684
		goto out_fput;

	ret = -ENXIO;
	ctx = f.file->private_data;
7685
	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
J
Jens Axboe 已提交
7686 7687
		goto out_fput;

7688
	ret = -EBADFD;
7689
	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
7690 7691
		goto out;

J
Jens Axboe 已提交
7692 7693 7694 7695 7696
	/*
	 * For SQ polling, the thread will do all submissions and completions.
	 * Just return the requested submit count, and wake the thread if
	 * we were asked to.
	 */
7697
	ret = 0;
J
Jens Axboe 已提交
7698
	if (ctx->flags & IORING_SETUP_SQPOLL) {
7699
		io_cqring_overflow_flush(ctx);
7700

7701 7702
		if (unlikely(ctx->sq_data->thread == NULL)) {
			ret = -EOWNERDEAD;
7703
			goto out;
7704
		}
J
Jens Axboe 已提交
7705
		if (flags & IORING_ENTER_SQ_WAKEUP)
7706
			wake_up(&ctx->sq_data->wait);
7707 7708 7709 7710 7711
		if (flags & IORING_ENTER_SQ_WAIT) {
			ret = io_sqpoll_wait_sq(ctx);
			if (ret)
				goto out;
		}
7712
		ret = to_submit;
7713
	} else if (to_submit) {
7714
		ret = io_uring_add_tctx_node(ctx);
7715 7716
		if (unlikely(ret))
			goto out;
7717

J
Jens Axboe 已提交
7718
		mutex_lock(&ctx->uring_lock);
7719 7720
		ret = io_submit_sqes(ctx, to_submit);
		if (ret != to_submit) {
7721
			mutex_unlock(&ctx->uring_lock);
7722
			goto out;
7723 7724 7725 7726
		}
		if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
			goto iopoll_locked;
		mutex_unlock(&ctx->uring_lock);
J
Jens Axboe 已提交
7727 7728
	}
	if (flags & IORING_ENTER_GETEVENTS) {
7729
		int ret2;
7730
		if (ctx->syscall_iopoll) {
7731 7732 7733 7734 7735 7736 7737 7738
			/*
			 * We disallow the app entering submit/complete with
			 * polling, but we still need to lock the ring to
			 * prevent racing with polled issue that got punted to
			 * a workqueue.
			 */
			mutex_lock(&ctx->uring_lock);
iopoll_locked:
7739 7740 7741 7742 7743
			ret2 = io_validate_ext_arg(flags, argp, argsz);
			if (likely(!ret2)) {
				min_complete = min(min_complete,
						   ctx->cq_entries);
				ret2 = io_iopoll_check(ctx, min_complete);
7744 7745
			}
			mutex_unlock(&ctx->uring_lock);
J
Jens Axboe 已提交
7746
		} else {
7747 7748 7749
			const sigset_t __user *sig;
			struct __kernel_timespec __user *ts;

7750 7751 7752 7753 7754 7755 7756
			ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
			if (likely(!ret2)) {
				min_complete = min(min_complete,
						   ctx->cq_entries);
				ret2 = io_cqring_wait(ctx, min_complete, sig,
						      argsz, ts);
			}
J
Jens Axboe 已提交
7757
		}
7758

7759
		if (!ret) {
7760
			ret = ret2;
J
Jens Axboe 已提交
7761

7762 7763 7764 7765 7766 7767 7768 7769
			/*
			 * EBADR indicates that one or more CQE were dropped.
			 * Once the user has been informed we can clear the bit
			 * as they are obviously ok with those drops.
			 */
			if (unlikely(ret2 == -EBADR))
				clear_bit(IO_CHECK_CQ_DROPPED_BIT,
					  &ctx->check_cq);
J
Jens Axboe 已提交
7770
		}
J
Jens Axboe 已提交
7771 7772
	}

7773
out:
7774
	percpu_ref_put(&ctx->refs);
J
Jens Axboe 已提交
7775
out_fput:
7776
	fdput(f);
7777
	return ret;
J
Jens Axboe 已提交
7778 7779
}

7780
#ifdef CONFIG_PROC_FS
P
Pavel Begunkov 已提交
7781
static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
7782
		const struct cred *cred)
7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812
{
	struct user_namespace *uns = seq_user_ns(m);
	struct group_info *gi;
	kernel_cap_t cap;
	unsigned __capi;
	int g;

	seq_printf(m, "%5d\n", id);
	seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
	seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
	seq_puts(m, "\n\tGroups:\t");
	gi = cred->group_info;
	for (g = 0; g < gi->ngroups; g++) {
		seq_put_decimal_ull(m, g ? " " : "",
					from_kgid_munged(uns, gi->gid[g]));
	}
	seq_puts(m, "\n\tCapEff:\t");
	cap = cred->cap_effective;
	CAP_FOR_EACH_U32(__capi)
		seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
	seq_putc(m, '\n');
	return 0;
}

P
Pavel Begunkov 已提交
7813 7814
static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
					  struct seq_file *m)
7815
{
7816
	struct io_sq_data *sq = NULL;
7817 7818 7819 7820 7821 7822 7823
	struct io_overflow_cqe *ocqe;
	struct io_rings *r = ctx->rings;
	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
	unsigned int sq_head = READ_ONCE(r->sq.head);
	unsigned int sq_tail = READ_ONCE(r->sq.tail);
	unsigned int cq_head = READ_ONCE(r->cq.head);
	unsigned int cq_tail = READ_ONCE(r->cq.tail);
7824
	unsigned int cq_shift = 0;
7825
	unsigned int sq_entries, cq_entries;
7826
	bool has_lock;
7827
	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
7828 7829
	unsigned int i;

7830 7831 7832
	if (is_cqe32)
		cq_shift = 1;

7833 7834 7835 7836 7837 7838
	/*
	 * we may get imprecise sqe and cqe info if uring is actively running
	 * since we get cached_sq_head and cached_cq_tail without uring_lock
	 * and sq_tail and cq_head are changed by userspace. But it's ok since
	 * we usually use these info when it is stuck.
	 */
7839
	seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851
	seq_printf(m, "SqHead:\t%u\n", sq_head);
	seq_printf(m, "SqTail:\t%u\n", sq_tail);
	seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
	seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
	seq_printf(m, "CqHead:\t%u\n", cq_head);
	seq_printf(m, "CqTail:\t%u\n", cq_tail);
	seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
	seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
	sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
	for (i = 0; i < sq_entries; i++) {
		unsigned int entry = i + sq_head;
		unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
J
Jens Axboe 已提交
7852
		struct io_uring_sqe *sqe;
7853 7854 7855 7856 7857 7858 7859

		if (sq_idx > sq_mask)
			continue;
		sqe = &ctx->sq_sqes[sq_idx];
		seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
			   sq_idx, sqe->opcode, sqe->fd, sqe->flags,
			   sqe->user_data);
7860
	}
7861 7862 7863 7864
	seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
	cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
	for (i = 0; i < cq_entries; i++) {
		unsigned int entry = i + cq_head;
7865
		struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
7866

7867 7868
		if (!is_cqe32) {
			seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
7869 7870
			   entry & cq_mask, cqe->user_data, cqe->res,
			   cqe->flags);
7871 7872 7873 7874 7875 7876
		} else {
			seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, "
				"extra1:%llu, extra2:%llu\n",
				entry & cq_mask, cqe->user_data, cqe->res,
				cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]);
		}
7877
	}
7878

7879 7880 7881 7882 7883 7884 7885 7886
	/*
	 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
	 * since fdinfo case grabs it in the opposite direction of normal use
	 * cases. If we fail to get the lock, we just don't iterate any
	 * structures that could be going away outside the io_uring mutex.
	 */
	has_lock = mutex_trylock(&ctx->uring_lock);

7887
	if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
7888
		sq = ctx->sq_data;
7889 7890 7891
		if (!sq->thread)
			sq = NULL;
	}
7892 7893 7894

	seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
	seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
7895
	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
7896
	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
7897
		struct file *f = io_file_from_index(ctx, i);
7898 7899 7900 7901 7902 7903 7904

		if (f)
			seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
		else
			seq_printf(m, "%5u: <none>\n", i);
	}
	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
7905
	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
7906
		struct io_mapped_ubuf *buf = ctx->user_bufs[i];
7907
		unsigned int len = buf->ubuf_end - buf->ubuf;
7908

7909
		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
7910
	}
7911 7912 7913 7914
	if (has_lock && !xa_empty(&ctx->personalities)) {
		unsigned long index;
		const struct cred *cred;

7915
		seq_printf(m, "Personalities:\n");
7916 7917
		xa_for_each(&ctx->personalities, index, cred)
			io_uring_show_cred(m, index, cred);
7918
	}
7919 7920 7921 7922
	if (has_lock)
		mutex_unlock(&ctx->uring_lock);

	seq_puts(m, "PollList:\n");
7923
	spin_lock(&ctx->completion_lock);
7924 7925 7926 7927 7928 7929
	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
		struct hlist_head *list = &ctx->cancel_hash[i];
		struct io_kiocb *req;

		hlist_for_each_entry(req, list, hash_node)
			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
7930
					task_work_pending(req->task));
7931
	}
7932 7933 7934 7935 7936 7937 7938 7939 7940 7941

	seq_puts(m, "CqOverflowList:\n");
	list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
		struct io_uring_cqe *cqe = &ocqe->cqe;

		seq_printf(m, "  user_data=%llu, res=%d, flags=%x\n",
			   cqe->user_data, cqe->res, cqe->flags);

	}

7942
	spin_unlock(&ctx->completion_lock);
7943 7944
}

P
Pavel Begunkov 已提交
7945
static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
7946 7947 7948 7949 7950 7951 7952 7953
{
	struct io_ring_ctx *ctx = f->private_data;

	if (percpu_ref_tryget(&ctx->refs)) {
		__io_uring_show_fdinfo(ctx, m);
		percpu_ref_put(&ctx->refs);
	}
}
7954
#endif
7955

J
Jens Axboe 已提交
7956 7957 7958
static const struct file_operations io_uring_fops = {
	.release	= io_uring_release,
	.mmap		= io_uring_mmap,
7959 7960 7961 7962
#ifndef CONFIG_MMU
	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
J
Jens Axboe 已提交
7963
	.poll		= io_uring_poll,
7964
#ifdef CONFIG_PROC_FS
7965
	.show_fdinfo	= io_uring_show_fdinfo,
7966
#endif
J
Jens Axboe 已提交
7967 7968
};

P
Pavel Begunkov 已提交
7969 7970
static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
					 struct io_uring_params *p)
J
Jens Axboe 已提交
7971
{
7972 7973
	struct io_rings *rings;
	size_t size, sq_array_offset;
J
Jens Axboe 已提交
7974

7975 7976 7977 7978
	/* make sure these are sane, as we already accounted them */
	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

7979
	size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
7980 7981 7982 7983 7984
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	rings = io_mem_alloc(size);
	if (!rings)
J
Jens Axboe 已提交
7985 7986
		return -ENOMEM;

7987 7988 7989 7990 7991 7992
	ctx->rings = rings;
	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
	rings->sq_ring_mask = p->sq_entries - 1;
	rings->cq_ring_mask = p->cq_entries - 1;
	rings->sq_ring_entries = p->sq_entries;
	rings->cq_ring_entries = p->cq_entries;
J
Jens Axboe 已提交
7993

7994 7995 7996 7997
	if (p->flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
7998 7999 8000
	if (size == SIZE_MAX) {
		io_mem_free(ctx->rings);
		ctx->rings = NULL;
J
Jens Axboe 已提交
8001
		return -EOVERFLOW;
8002
	}
J
Jens Axboe 已提交
8003 8004

	ctx->sq_sqes = io_mem_alloc(size);
8005 8006 8007
	if (!ctx->sq_sqes) {
		io_mem_free(ctx->rings);
		ctx->rings = NULL;
J
Jens Axboe 已提交
8008
		return -ENOMEM;
8009
	}
J
Jens Axboe 已提交
8010 8011 8012 8013

	return 0;
}

8014 8015 8016 8017 8018 8019 8020 8021
static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
{
	int ret, fd;

	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
	if (fd < 0)
		return fd;

8022
	ret = io_uring_add_tctx_node(ctx);
8023 8024 8025 8026 8027 8028 8029 8030
	if (ret) {
		put_unused_fd(fd);
		return ret;
	}
	fd_install(fd, file);
	return fd;
}

J
Jens Axboe 已提交
8031 8032 8033 8034 8035 8036
/*
 * Allocate an anonymous fd, this is what constitutes the application
 * visible backing of an io_uring instance. The application mmaps this
 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
 * we have to tie this fd to a socket for file garbage collection purposes.
 */
8037
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
J
Jens Axboe 已提交
8038 8039
{
	struct file *file;
8040
#if defined(CONFIG_UNIX)
J
Jens Axboe 已提交
8041 8042 8043 8044 8045
	int ret;

	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
				&ctx->ring_sock);
	if (ret)
8046
		return ERR_PTR(ret);
J
Jens Axboe 已提交
8047 8048
#endif

8049 8050
	file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
					 O_RDWR | O_CLOEXEC, NULL);
J
Jens Axboe 已提交
8051
#if defined(CONFIG_UNIX)
8052 8053 8054 8055 8056
	if (IS_ERR(file)) {
		sock_release(ctx->ring_sock);
		ctx->ring_sock = NULL;
	} else {
		ctx->ring_sock->file = file;
8057
	}
J
Jens Axboe 已提交
8058
#endif
8059
	return file;
J
Jens Axboe 已提交
8060 8061
}

P
Pavel Begunkov 已提交
8062 8063
static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
				  struct io_uring_params __user *params)
J
Jens Axboe 已提交
8064 8065
{
	struct io_ring_ctx *ctx;
8066
	struct file *file;
J
Jens Axboe 已提交
8067 8068
	int ret;

8069
	if (!entries)
J
Jens Axboe 已提交
8070
		return -EINVAL;
8071 8072 8073 8074 8075
	if (entries > IORING_MAX_ENTRIES) {
		if (!(p->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		entries = IORING_MAX_ENTRIES;
	}
J
Jens Axboe 已提交
8076 8077 8078 8079 8080

	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
	 * since the sqes are only used at submission time. This allows for
8081 8082 8083
	 * some flexibility in overcommitting a bit. If the application has
	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
	 * of CQ ring entries manually.
J
Jens Axboe 已提交
8084 8085
	 */
	p->sq_entries = roundup_pow_of_two(entries);
8086 8087 8088 8089 8090 8091
	if (p->flags & IORING_SETUP_CQSIZE) {
		/*
		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
		 * to a power-of-two, if it isn't already. We do NOT impose
		 * any cq vs sq ring sizing.
		 */
8092
		if (!p->cq_entries)
8093
			return -EINVAL;
8094 8095 8096 8097 8098
		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
			if (!(p->flags & IORING_SETUP_CLAMP))
				return -EINVAL;
			p->cq_entries = IORING_MAX_CQ_ENTRIES;
		}
8099 8100 8101
		p->cq_entries = roundup_pow_of_two(p->cq_entries);
		if (p->cq_entries < p->sq_entries)
			return -EINVAL;
8102 8103 8104
	} else {
		p->cq_entries = 2 * p->sq_entries;
	}
J
Jens Axboe 已提交
8105 8106

	ctx = io_ring_ctx_alloc(p);
J
Jens Axboe 已提交
8107
	if (!ctx)
J
Jens Axboe 已提交
8108
		return -ENOMEM;
8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119

	/*
	 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
	 * space applications don't need to do io completion events
	 * polling again, they can rely on io_sq_thread to do polling
	 * work, which can reduce cpu usage and uring_lock contention.
	 */
	if (ctx->flags & IORING_SETUP_IOPOLL &&
	    !(ctx->flags & IORING_SETUP_SQPOLL))
		ctx->syscall_iopoll = 1;

J
Jens Axboe 已提交
8120
	ctx->compat = in_compat_syscall();
J
Jens Axboe 已提交
8121 8122
	if (!capable(CAP_IPC_LOCK))
		ctx->user = get_uid(current_user());
8123

8124
	/*
8125 8126
	 * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
	 * COOP_TASKRUN is set, then IPIs are never needed by the app.
8127
	 */
8128 8129 8130
	ret = -EINVAL;
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		/* IPI related flags don't make sense with SQPOLL */
8131 8132
		if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
				  IORING_SETUP_TASKRUN_FLAG))
8133
			goto err;
8134
		ctx->notify_method = TWA_SIGNAL_NO_IPI;
8135 8136 8137
	} else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
		ctx->notify_method = TWA_SIGNAL_NO_IPI;
	} else {
8138 8139
		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
			goto err;
8140
		ctx->notify_method = TWA_SIGNAL;
8141
	}
8142

8143 8144 8145 8146 8147 8148
	/*
	 * This is just grabbed for accounting purposes. When a process exits,
	 * the mm is exited and dropped before the files, hence we need to hang
	 * on to this mm purely for the purposes of being able to unaccount
	 * memory (locked/pinned vm). It's not used for anything else.
	 */
8149
	mmgrab(current->mm);
8150
	ctx->mm_account = current->mm;
8151

J
Jens Axboe 已提交
8152 8153 8154 8155
	ret = io_allocate_scq_urings(ctx, p);
	if (ret)
		goto err;

8156
	ret = io_sq_offload_create(ctx, p);
J
Jens Axboe 已提交
8157 8158
	if (ret)
		goto err;
8159
	/* always set a rsrc node */
8160 8161 8162
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		goto err;
8163
	io_rsrc_node_switch(ctx, NULL);
J
Jens Axboe 已提交
8164 8165

	memset(&p->sq_off, 0, sizeof(p->sq_off));
8166 8167 8168 8169 8170 8171 8172
	p->sq_off.head = offsetof(struct io_rings, sq.head);
	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
J
Jens Axboe 已提交
8173 8174

	memset(&p->cq_off, 0, sizeof(p->cq_off));
8175 8176 8177 8178 8179 8180
	p->cq_off.head = offsetof(struct io_rings, cq.head);
	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
	p->cq_off.cqes = offsetof(struct io_rings, cqes);
8181
	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
8182

8183 8184
	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
8185
			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
8186
			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
8187
			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
8188 8189
			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
			IORING_FEAT_LINKED_FILE;
8190 8191 8192 8193 8194

	if (copy_to_user(params, p, sizeof(*p))) {
		ret = -EFAULT;
		goto err;
	}
8195

8196 8197 8198 8199 8200 8201
	file = io_uring_get_file(ctx);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto err;
	}

8202 8203 8204 8205
	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
8206 8207 8208 8209 8210 8211
	ret = io_uring_install_fd(ctx, file);
	if (ret < 0) {
		/* fput will clean it up */
		fput(file);
		return ret;
	}
8212

8213
	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
J
Jens Axboe 已提交
8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236
	return ret;
err:
	io_ring_ctx_wait_and_kill(ctx);
	return ret;
}

/*
 * Sets up an aio uring context, and returns the fd. Applications asks for a
 * ring size, we return the actual sq/cq ring sizes (among other things) in the
 * params structure passed in.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
	struct io_uring_params p;
	int i;

	if (copy_from_user(&p, params, sizeof(p)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
		if (p.resv[i])
			return -EINVAL;
	}

J
Jens Axboe 已提交
8237
	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8238
			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
8239
			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
8240
			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
8241
			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
S
Stefan Roesch 已提交
8242
			IORING_SETUP_SQE128 | IORING_SETUP_CQE32))
J
Jens Axboe 已提交
8243 8244
		return -EINVAL;

8245
	return io_uring_create(entries, &p, params);
J
Jens Axboe 已提交
8246 8247 8248 8249 8250 8251 8252 8253
}

SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}

P
Pavel Begunkov 已提交
8254 8255
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_op_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

8294 8295
static int io_register_personality(struct io_ring_ctx *ctx)
{
J
Jens Axboe 已提交
8296
	const struct cred *creds;
8297
	u32 id;
J
Jens Axboe 已提交
8298
	int ret;
8299

J
Jens Axboe 已提交
8300
	creds = get_current_cred();
J
Jens Axboe 已提交
8301

8302 8303
	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
8304 8305 8306 8307 8308
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
8309 8310
}

P
Pavel Begunkov 已提交
8311 8312
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
8313 8314 8315 8316 8317
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

8318 8319 8320 8321
	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

8322
	/* We allow only a single restrictions registration */
8323
	if (ctx->restrictions.registered)
8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
8375
		ctx->restrictions.registered = true;
8376 8377 8378 8379 8380

	kfree(res);
	return ret;
}

8381 8382 8383 8384 8385 8386 8387 8388
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

8389 8390 8391
	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
8392 8393 8394
	return 0;
}

8395
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
8396
				     struct io_uring_rsrc_update2 *up,
8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

8408 8409
	switch (type) {
	case IORING_RSRC_FILE:
8410
		return __io_sqe_files_update(ctx, up, nr_args);
8411 8412
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
8413 8414 8415 8416
	}
	return -EINVAL;
}

8417 8418
static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
				    unsigned nr_args)
8419
{
8420
	struct io_uring_rsrc_update2 up;
8421 8422 8423

	if (!nr_args)
		return -EINVAL;
8424 8425 8426
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
8427
	if (up.resv || up.resv2)
8428
		return -EINVAL;
8429 8430 8431 8432
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
8433
				   unsigned size, unsigned type)
8434 8435 8436 8437 8438
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
8439 8440
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
8441
	if (!up.nr || up.resv || up.resv2)
8442
		return -EINVAL;
8443
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
8444 8445
}

P
Pavel Begunkov 已提交
8446
static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
8447
			    unsigned int size, unsigned int type)
8448 8449 8450 8451 8452 8453 8454 8455 8456 8457
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
8458 8459 8460
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
8461 8462
		return -EINVAL;

8463
	switch (type) {
8464
	case IORING_RSRC_FILE:
8465 8466
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
8467 8468
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
8469
	case IORING_RSRC_BUFFER:
8470
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
8471
			break;
8472 8473
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
8474 8475 8476 8477
	}
	return -EINVAL;
}

P
Pavel Begunkov 已提交
8478 8479
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494
{
	struct io_uring_task *tctx = current->io_uring;
	cpumask_var_t new_mask;
	int ret;

	if (!tctx || !tctx->io_wq)
		return -EINVAL;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

8495 8496 8497 8498 8499 8500 8501 8502 8503
	if (in_compat_syscall()) {
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	} else {
		ret = copy_from_user(new_mask, arg, len);
	}

	if (ret) {
8504 8505 8506 8507 8508 8509 8510 8511 8512
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

P
Pavel Begunkov 已提交
8513
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
8514 8515 8516 8517 8518 8519 8520 8521 8522
{
	struct io_uring_task *tctx = current->io_uring;

	if (!tctx || !tctx->io_wq)
		return -EINVAL;

	return io_wq_cpu_affinity(tctx->io_wq, NULL);
}

P
Pavel Begunkov 已提交
8523 8524
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
8525
	__must_hold(&ctx->uring_lock)
8526
{
8527
	struct io_tctx_node *node;
8528 8529
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
8530 8531 8532 8533 8534 8535 8536 8537 8538
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

8539 8540 8541
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
8542 8543 8544 8545 8546
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
8547
			refcount_inc(&sqd->refs);
8548
			mutex_unlock(&ctx->uring_lock);
8549
			mutex_lock(&sqd->lock);
8550
			mutex_lock(&ctx->uring_lock);
8551 8552
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
8553 8554 8555 8556 8557
		}
	} else {
		tctx = current->io_uring;
	}

8558
	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
8559

8560 8561 8562
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
8563 8564 8565 8566 8567 8568 8569 8570 8571
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}
8572

8573
	if (sqd) {
8574
		mutex_unlock(&sqd->lock);
8575 8576
		io_put_sq_data(sqd);
	}
8577 8578 8579 8580

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596
	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
8597
	return 0;
8598
err:
8599
	if (sqd) {
8600
		mutex_unlock(&sqd->lock);
8601 8602
		io_put_sq_data(sqd);
	}
8603
	return ret;
8604 8605
}

8606 8607 8608 8609
static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_ring *br;
	struct io_uring_buf_reg reg;
8610
	struct io_buffer_list *bl, *free_bl = NULL;
8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625
	struct page **pages;
	int nr_pages;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

	if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (!reg.ring_addr)
		return -EFAULT;
	if (reg.ring_addr & ~PAGE_MASK)
		return -EINVAL;
	if (!is_power_of_2(reg.ring_entries))
		return -EINVAL;

8626 8627 8628 8629
	/* cannot disambiguate full vs empty due to head/tail size */
	if (reg.ring_entries >= 65536)
		return -EINVAL;

8630 8631 8632 8633 8634 8635 8636
	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
		int ret = io_init_bl_list(ctx);
		if (ret)
			return ret;
	}

	bl = io_buffer_get_list(ctx, reg.bgid);
8637 8638 8639 8640 8641
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
		if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
			return -EEXIST;
	} else {
8642
		free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
8643 8644 8645 8646 8647 8648 8649 8650
		if (!bl)
			return -ENOMEM;
	}

	pages = io_pin_pages(reg.ring_addr,
			     struct_size(br, bufs, reg.ring_entries),
			     &nr_pages);
	if (IS_ERR(pages)) {
8651
		kfree(free_bl);
8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688
		return PTR_ERR(pages);
	}

	br = page_address(pages[0]);
	bl->buf_pages = pages;
	bl->buf_nr_pages = nr_pages;
	bl->nr_entries = reg.ring_entries;
	bl->buf_ring = br;
	bl->mask = reg.ring_entries - 1;
	io_buffer_add_list(ctx, bl, reg.bgid);
	return 0;
}

static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (!bl)
		return -ENOENT;
	if (!bl->buf_nr_pages)
		return -EINVAL;

	__io_remove_buffers(ctx, bl, -1U);
	if (bl->bgid >= BGID_ARRAY) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		kfree(bl);
	}
	return 0;
}

8689 8690
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
8691 8692
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
8693 8694 8695
{
	int ret;

8696 8697 8698 8699 8700 8701 8702 8703
	/*
	 * We're inside the ring mutex, if the ref is already dying, then
	 * someone else killed the ctx or is already going through
	 * io_uring_register().
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return -ENXIO;

8704 8705 8706 8707 8708 8709 8710 8711
	if (ctx->restricted) {
		if (opcode >= IORING_REGISTER_LAST)
			return -EINVAL;
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

8712 8713
	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
8714 8715 8716
		ret = -EFAULT;
		if (!arg)
			break;
8717
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
8718 8719 8720 8721 8722
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
8723
		ret = io_sqe_buffers_unregister(ctx);
8724
		break;
J
Jens Axboe 已提交
8725
	case IORING_REGISTER_FILES:
8726 8727 8728
		ret = -EFAULT;
		if (!arg)
			break;
8729
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
J
Jens Axboe 已提交
8730 8731 8732 8733 8734 8735 8736
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
8737
	case IORING_REGISTER_FILES_UPDATE:
8738
		ret = io_register_files_update(ctx, arg, nr_args);
8739
		break;
8740 8741 8742 8743
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
8744 8745 8746 8747 8748
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
8749
			break;
8750
		ret = io_eventfd_register(ctx, arg, 1);
8751 8752 8753 8754 8755 8756 8757
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
8758 8759 8760 8761 8762 8763
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
8776 8777 8778 8779 8780 8781
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
8782 8783 8784
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
8785 8786 8787 8788 8789 8790 8791 8792 8793
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
8794
		break;
8795 8796 8797
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
8798
		break;
8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
8811 8812 8813 8814 8815 8816
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
8817 8818 8819 8820 8821 8822
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct fd f;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	ret = -EOPNOTSUPP;
8855
	if (!io_is_uring_fops(f.file))
8856 8857 8858 8859
		goto out_fput;

	ctx = f.file->private_data;

8860 8861
	io_run_task_work();

8862 8863 8864
	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
U
Usama Arif 已提交
8865
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
8866 8867 8868 8869 8870
out_fput:
	fdput(f);
	return ret;
}

8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
	WARN_ON_ONCE(1);
	return -ECANCELED;
}

static const struct io_op_def io_op_defs[] = {
	[IORING_OP_NOP] = {
		.audit_skip		= 1,
		.iopoll			= 1,
		.prep			= io_nop_prep,
		.issue			= io_nop,
	},
	[IORING_OP_READV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.ioprio			= 1,
		.iopoll			= 1,
		.async_size		= sizeof(struct io_async_rw),
		.prep			= io_prep_rw,
		.issue			= io_read,
8896
		.prep_async		= io_readv_prep_async,
8897
		.cleanup		= io_readv_writev_cleanup,
8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910
	},
	[IORING_OP_WRITEV] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.ioprio			= 1,
		.iopoll			= 1,
		.async_size		= sizeof(struct io_async_rw),
		.prep			= io_prep_rw,
		.issue			= io_write,
8911
		.prep_async		= io_writev_prep_async,
8912
		.cleanup		= io_readv_writev_cleanup,
8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967
	},
	[IORING_OP_FSYNC] = {
		.needs_file		= 1,
		.audit_skip		= 1,
		.prep			= io_fsync_prep,
		.issue			= io_fsync,
	},
	[IORING_OP_READ_FIXED] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.ioprio			= 1,
		.iopoll			= 1,
		.async_size		= sizeof(struct io_async_rw),
		.prep			= io_prep_rw,
		.issue			= io_read,
	},
	[IORING_OP_WRITE_FIXED] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.ioprio			= 1,
		.iopoll			= 1,
		.async_size		= sizeof(struct io_async_rw),
		.prep			= io_prep_rw,
		.issue			= io_write,
	},
	[IORING_OP_POLL_ADD] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.audit_skip		= 1,
		.prep			= io_poll_add_prep,
		.issue			= io_poll_add,
	},
	[IORING_OP_POLL_REMOVE] = {
		.audit_skip		= 1,
		.prep			= io_poll_remove_prep,
		.issue			= io_poll_remove,
	},
	[IORING_OP_SYNC_FILE_RANGE] = {
		.needs_file		= 1,
		.audit_skip		= 1,
		.prep			= io_sfr_prep,
		.issue			= io_sync_file_range,
	},
	[IORING_OP_SENDMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.ioprio			= 1,
8968
#if defined(CONFIG_NET)
8969 8970 8971
		.async_size		= sizeof(struct io_async_msghdr),
		.prep			= io_sendmsg_prep,
		.issue			= io_sendmsg,
8972
		.prep_async		= io_sendmsg_prep_async,
8973
		.cleanup		= io_sendmsg_recvmsg_cleanup,
8974 8975
#else
		.prep			= io_eopnotsupp_prep,
8976
#endif
8977 8978 8979 8980 8981 8982 8983
	},
	[IORING_OP_RECVMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.ioprio			= 1,
8984
#if defined(CONFIG_NET)
8985 8986 8987
		.async_size		= sizeof(struct io_async_msghdr),
		.prep			= io_recvmsg_prep,
		.issue			= io_recvmsg,
8988
		.prep_async		= io_recvmsg_prep_async,
8989
		.cleanup		= io_sendmsg_recvmsg_cleanup,
8990 8991
#else
		.prep			= io_eopnotsupp_prep,
8992
#endif
8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011
	},
	[IORING_OP_TIMEOUT] = {
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_timeout_data),
		.prep			= io_timeout_prep,
		.issue			= io_timeout,
	},
	[IORING_OP_TIMEOUT_REMOVE] = {
		/* used by timeout updates' prep() */
		.audit_skip		= 1,
		.prep			= io_timeout_remove_prep,
		.issue			= io_timeout_remove,
	},
	[IORING_OP_ACCEPT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.poll_exclusive		= 1,
		.ioprio			= 1,	/* used for flags */
9012
#if defined(CONFIG_NET)
9013 9014
		.prep			= io_accept_prep,
		.issue			= io_accept,
9015 9016 9017
#else
		.prep			= io_eopnotsupp_prep,
#endif
9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033
	},
	[IORING_OP_ASYNC_CANCEL] = {
		.audit_skip		= 1,
		.prep			= io_async_cancel_prep,
		.issue			= io_async_cancel,
	},
	[IORING_OP_LINK_TIMEOUT] = {
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_timeout_data),
		.prep			= io_link_timeout_prep,
		.issue			= io_no_issue,
	},
	[IORING_OP_CONNECT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
9034
#if defined(CONFIG_NET)
9035 9036 9037
		.async_size		= sizeof(struct io_async_connect),
		.prep			= io_connect_prep,
		.issue			= io_connect,
9038
		.prep_async		= io_connect_prep_async,
9039 9040 9041
#else
		.prep			= io_eopnotsupp_prep,
#endif
9042 9043 9044 9045 9046 9047 9048 9049 9050
	},
	[IORING_OP_FALLOCATE] = {
		.needs_file		= 1,
		.prep			= io_fallocate_prep,
		.issue			= io_fallocate,
	},
	[IORING_OP_OPENAT] = {
		.prep			= io_openat_prep,
		.issue			= io_openat,
9051
		.cleanup		= io_open_cleanup,
9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066
	},
	[IORING_OP_CLOSE] = {
		.prep			= io_close_prep,
		.issue			= io_close,
	},
	[IORING_OP_FILES_UPDATE] = {
		.audit_skip		= 1,
		.iopoll			= 1,
		.prep			= io_files_update_prep,
		.issue			= io_files_update,
	},
	[IORING_OP_STATX] = {
		.audit_skip		= 1,
		.prep			= io_statx_prep,
		.issue			= io_statx,
9067
		.cleanup		= io_statx_cleanup,
9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110
	},
	[IORING_OP_READ] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.ioprio			= 1,
		.iopoll			= 1,
		.async_size		= sizeof(struct io_async_rw),
		.prep			= io_prep_rw,
		.issue			= io_read,
	},
	[IORING_OP_WRITE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.ioprio			= 1,
		.iopoll			= 1,
		.async_size		= sizeof(struct io_async_rw),
		.prep			= io_prep_rw,
		.issue			= io_write,
	},
	[IORING_OP_FADVISE] = {
		.needs_file		= 1,
		.audit_skip		= 1,
		.prep			= io_fadvise_prep,
		.issue			= io_fadvise,
	},
	[IORING_OP_MADVISE] = {
		.prep			= io_madvise_prep,
		.issue			= io_madvise,
	},
	[IORING_OP_SEND] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.audit_skip		= 1,
		.ioprio			= 1,
9111
#if defined(CONFIG_NET)
9112 9113
		.prep			= io_sendmsg_prep,
		.issue			= io_send,
9114 9115 9116
#else
		.prep			= io_eopnotsupp_prep,
#endif
9117 9118 9119 9120 9121 9122 9123 9124
	},
	[IORING_OP_RECV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.audit_skip		= 1,
		.ioprio			= 1,
#if defined(CONFIG_NET)
		.prep			= io_recvmsg_prep,
		.issue			= io_recv,
#else
		.prep			= io_eopnotsupp_prep,
#endif
	},
	[IORING_OP_OPENAT2] = {
		.prep			= io_openat2_prep,
		.issue			= io_openat2,
		.cleanup		= io_open_cleanup,
	},
	[IORING_OP_EPOLL_CTL] = {
		.unbound_nonreg_file	= 1,
		.audit_skip		= 1,
#if defined(CONFIG_EPOLL)
		.prep			= io_epoll_ctl_prep,
		.issue			= io_epoll_ctl,
#else
		.prep			= io_eopnotsupp_prep,
#endif
	},
	[IORING_OP_SPLICE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.audit_skip		= 1,
		.prep			= io_splice_prep,
		.issue			= io_splice,
	},
	[IORING_OP_PROVIDE_BUFFERS] = {
		.audit_skip		= 1,
		.iopoll			= 1,
		.prep			= io_provide_buffers_prep,
		.issue			= io_provide_buffers,
	},
	[IORING_OP_REMOVE_BUFFERS] = {
		.audit_skip		= 1,
		.iopoll			= 1,
		.prep			= io_remove_buffers_prep,
		.issue			= io_remove_buffers,
	},
	[IORING_OP_TEE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.audit_skip		= 1,
		.prep			= io_tee_prep,
		.issue			= io_tee,
	},
	[IORING_OP_SHUTDOWN] = {
		.needs_file		= 1,
#if defined(CONFIG_NET)
		.prep			= io_shutdown_prep,
		.issue			= io_shutdown,
#else
		.prep			= io_eopnotsupp_prep,
#endif
	},
	[IORING_OP_RENAMEAT] = {
		.prep			= io_renameat_prep,
		.issue			= io_renameat,
		.cleanup		= io_renameat_cleanup,
	},
	[IORING_OP_UNLINKAT] = {
		.prep			= io_unlinkat_prep,
		.issue			= io_unlinkat,
		.cleanup		= io_unlinkat_cleanup,
	},
	[IORING_OP_MKDIRAT] = {
		.prep			= io_mkdirat_prep,
		.issue			= io_mkdirat,
		.cleanup		= io_mkdirat_cleanup,
	},
	[IORING_OP_SYMLINKAT] = {
		.prep			= io_symlinkat_prep,
		.issue			= io_symlinkat,
		.cleanup		= io_link_cleanup,
	},
	[IORING_OP_LINKAT] = {
		.prep			= io_linkat_prep,
		.issue			= io_linkat,
		.cleanup		= io_link_cleanup,
	},
	[IORING_OP_MSG_RING] = {
		.needs_file		= 1,
		.iopoll			= 1,
		.prep			= io_msg_ring_prep,
		.issue			= io_msg_ring,
	},
	[IORING_OP_FSETXATTR] = {
		.needs_file		= 1,
		.prep			= io_fsetxattr_prep,
		.issue			= io_fsetxattr,
		.cleanup		= io_xattr_cleanup,
	},
	[IORING_OP_SETXATTR] = {
		.prep			= io_setxattr_prep,
		.issue			= io_setxattr,
		.cleanup		= io_xattr_cleanup,
	},
	[IORING_OP_FGETXATTR] = {
		.needs_file		= 1,
		.prep			= io_fgetxattr_prep,
		.issue			= io_fgetxattr,
		.cleanup		= io_xattr_cleanup,
	},
	[IORING_OP_GETXATTR] = {
		.prep			= io_getxattr_prep,
		.issue			= io_getxattr,
		.cleanup		= io_xattr_cleanup,
	},
	[IORING_OP_SOCKET] = {
		.audit_skip		= 1,
#if defined(CONFIG_NET)
		.prep			= io_socket_prep,
		.issue			= io_socket,
#else
		.prep			= io_eopnotsupp_prep,
#endif
	},
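	/*
	 * Command passthrough: async_size is sized via uring_cmd_pdu_size(1),
	 * i.e. large enough for the big-SQE command payload, so the command
	 * can be copied if the request has to go async.
	 */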
	[IORING_OP_URING_CMD] = {
		.needs_file		= 1,
		.plug			= 1,
		.async_size		= uring_cmd_pdu_size(1),
		.prep			= io_uring_cmd_prep,
		.issue			= io_uring_cmd,
		.prep_async		= io_uring_cmd_prep_async,
	},
};

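/*
 * One-time boot setup: compile-time sanity checks on the UAPI SQE layout,
 * flag encodings and opcode table, followed by creation of the slab cache
 * that backs io_kiocb request allocation.
 */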
static int __init io_uring_init(void)
{
	int i;

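/*
 * BUILD_BUG_SQE_ELEM() pins both the byte offset and the size of each
 * field in the shared SQE, so an accidental UAPI layout change fails the
 * build instead of silently breaking userspace.
 */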
#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
} while (0)

#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
	BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
	BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
	BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
	BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
	BUILD_BUG_SQE_ELEM(8,  __u64,  off);
	BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
	BUILD_BUG_SQE_ELEM(16, __u64,  addr);
	BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
	BUILD_BUG_SQE_ELEM(24, __u32,  len);
	BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
	BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
	BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
	BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
	BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);

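	/*
	 * The legacy files_update interface is serviced by the generic rsrc
	 * update code, so the two UAPI structs must stay layout-compatible.
	 */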
	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
		     sizeof(struct io_uring_rsrc_update));
	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
		     sizeof(struct io_uring_rsrc_update2));

	/* ->buf_index is u16 */
	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
	BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE);
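	/*
	 * The provided-buffer ring is shared with userspace: the ring tail
	 * overlays the reserved field of the first struct io_uring_buf.
	 */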
	BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
	BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
		     offsetof(struct io_uring_buf_ring, tail));

	/* should fit into one byte */
	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);

	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));

	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));

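	/*
	 * Every opcode needs a ->prep handler; only explicitly unsupported
	 * ones (io_eopnotsupp_prep) may omit ->issue.
	 */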
	for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) {
		BUG_ON(!io_op_defs[i].prep);
		if (io_op_defs[i].prep != io_eopnotsupp_prep)
			BUG_ON(!io_op_defs[i].issue);
	}

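	/*
	 * Requests are allocated from a dedicated, cacheline-aligned slab;
	 * SLAB_ACCOUNT charges the allocations to the submitting task's memcg.
	 */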
	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				SLAB_ACCOUNT);
	return 0;
}
__initcall(io_uring_init);