// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
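/*
 * A minimal userspace sketch of the ordering rules above, for illustration
 * only: this is not kernel code, and the cq/sq field names below are
 * assumptions about the mmap'ed ring layout rather than definitions from
 * this file (liburing provides the real helpers).
 *
 *	// reap one CQE: acquire-load the tail the kernel writes, then
 *	// release-store the head once the entry has been read
 *	unsigned head = *cq->khead;
 *	if (head != smp_load_acquire(cq->ktail)) {
 *		struct io_uring_cqe *cqe = &cq->cqes[head & *cq->kring_mask];
 *		handle_cqe(cqe);			// hypothetical consumer
 *		smp_store_release(cq->khead, head + 1);
 *	}
 *
 *	// submit one SQE: fill the entry first, then release-store the new
 *	// tail, which pairs with the smp_load_acquire in io_get_sqring()
 *	unsigned tail = *sq->ktail;
 *	sq->sqes[tail & *sq->kring_mask] = prepared_sqe;
 *	smp_store_release(sq->ktail, tail + 1);
 *
 *	// with IORING_SETUP_SQPOLL, check the wakeup flag only after a
 *	// full barrier following the tail update
 *	smp_mb();
 *	if (READ_ONCE(*sq->kflags) & IORING_SQ_NEED_WAKEUP)
 *		io_uring_enter(ring_fd, 0, 0, IORING_ENTER_SQ_WAKEUP, NULL);
 */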
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/highmem.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
#include <linux/audit.h>
#include <linux/security.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io-wq.h"

#include "io_uring_types.h"
#include "io_uring.h"
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
#include "rsrc.h"

#include "timeout.h"
#include "poll.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)

#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)

#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
				REQ_F_ASYNC_DATA)

#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
				 IO_REQ_CLEAN_FLAGS)

#define IO_TCTX_REFS_CACHE_NR	(1U << 10)

#define IO_COMPL_BATCH			32
#define IO_REQ_CACHE_SIZE		32
#define IO_REQ_ALLOC_BATCH		8

enum {
	IO_CHECK_CQ_OVERFLOW_BIT,
	IO_CHECK_CQ_DROPPED_BIT,
};

struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};

/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)

static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);

static void io_dismantle_req(struct io_kiocb *req);
static void io_clean_op(struct io_kiocb *req);
static void io_queue_sqe(struct io_kiocb *req);

static void __io_submit_flush_completions(struct io_ring_ctx *ctx);

static void io_eventfd_signal(struct io_ring_ctx *ctx);

static struct kmem_cache *req_cachep;

struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
	if (io_is_uring_fops(file)) {
		struct io_ring_ctx *ctx = file->private_data;

		return ctx->ring_sock->sk;
	}
#endif
	return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);

static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
	if (!wq_list_empty(&ctx->submit_state.compl_reqs))
		__io_submit_flush_completions(ctx);
}

static bool io_match_linked(struct io_kiocb *head)
{
	struct io_kiocb *req;

	io_for_each_link(req, head) {
		if (req->flags & REQ_F_INFLIGHT)
			return true;
	}
	return false;
}

/*
 * As io_match_task() but protected against racing with linked timeouts.
 * User must not hold timeout_lock.
 */
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			bool cancel_all)
{
	bool matched;

	if (task && head->task != task)
		return false;
	if (cancel_all)
		return true;

	if (head->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = head->ctx;

		/* protect against races with linked timeouts */
		spin_lock_irq(&ctx->timeout_lock);
		matched = io_match_linked(head);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		matched = io_match_linked(head);
	}
	return matched;
}

static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
	req_set_fail(req);
	io_req_set_res(req, res, 0);
}

static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}

static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

	complete(&ctx->ref_comp);
}

static __cold void io_fallback_req_func(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
						fallback_work.work);
	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
	struct io_kiocb *req, *tmp;
	bool locked = false;

	percpu_ref_get(&ctx->refs);
	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
		req->io_task_work.func(req, &locked);

	if (locked) {
		io_submit_flush_completions(ctx);
		mutex_unlock(&ctx->uring_lock);
	}
	percpu_ref_put(&ctx->refs);
}

static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx;
	int hash_bits;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	xa_init(&ctx->io_bl_xa);

	/*
	 * Use 5 bits less than the max cq entries, that should give us around
	 * 32 entries per hash list if totally full and uniformly spread.
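	 *
	 * Worked example (illustrative, not from the original comment): with
	 * p->cq_entries == 4096, hash_bits == ilog2(4096) - 5 == 7, i.e. 128
	 * hash lists of roughly 4096 / 128 == 32 entries each when the CQ is
	 * completely full.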
	 */
	hash_bits = ilog2(p->cq_entries);
	hash_bits -= 5;
	if (hash_bits <= 0)
		hash_bits = 1;
	ctx->cancel_hash_bits = hash_bits;
	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
					GFP_KERNEL);
	if (!ctx->cancel_hash)
		goto err;
	__hash_init(ctx->cancel_hash, 1U << hash_bits);

	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
	if (!ctx->dummy_ubuf)
		goto err;
	/* set invalid range, so io_import_fixed() fails meeting it */
	ctx->dummy_ubuf->ubuf = -1UL;

	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		goto err;

	ctx->flags = p->flags;
	init_waitqueue_head(&ctx->sqo_sq_wait);
	INIT_LIST_HEAD(&ctx->sqd_list);
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
	INIT_LIST_HEAD(&ctx->io_buffers_cache);
	INIT_LIST_HEAD(&ctx->apoll_cache);
	init_completion(&ctx->ref_comp);
	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
	mutex_init(&ctx->uring_lock);
	init_waitqueue_head(&ctx->cq_wait);
	spin_lock_init(&ctx->completion_lock);
	spin_lock_init(&ctx->timeout_lock);
	INIT_WQ_LIST(&ctx->iopoll_list);
	INIT_LIST_HEAD(&ctx->io_buffers_pages);
	INIT_LIST_HEAD(&ctx->io_buffers_comp);
	INIT_LIST_HEAD(&ctx->defer_list);
	INIT_LIST_HEAD(&ctx->timeout_list);
	INIT_LIST_HEAD(&ctx->ltimeout_list);
	spin_lock_init(&ctx->rsrc_ref_lock);
	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
	init_llist_head(&ctx->rsrc_put_llist);
	INIT_LIST_HEAD(&ctx->tctx_list);
	ctx->submit_state.free_list.next = NULL;
	INIT_WQ_LIST(&ctx->locked_free_list);
	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
	return ctx;
err:
	kfree(ctx->dummy_ubuf);
	kfree(ctx->cancel_hash);
	kfree(ctx->io_bl);
	xa_destroy(&ctx->io_bl_xa);
	kfree(ctx);
	return NULL;
}

static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
	ctx->cq_extra--;
}

static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
		struct io_ring_ctx *ctx = req->ctx;

		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
	}

	return false;
}

static inline void io_req_track_inflight(struct io_kiocb *req)
{
	if (!(req->flags & REQ_F_INFLIGHT)) {
		req->flags |= REQ_F_INFLIGHT;
		atomic_inc(&req->task->io_uring->inflight_tracked);
	}
}

static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
	if (WARN_ON_ONCE(!req->link))
		return NULL;

	req->flags &= ~REQ_F_ARM_LTIMEOUT;
	req->flags |= REQ_F_LINK_TIMEOUT;

	/* linked timeouts should have two refs once prep'ed */
	io_req_set_refcount(req);
	__io_req_set_refcount(req->link, 2);
	return req->link;
}

static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
		return NULL;
	return __io_prep_linked_timeout(req);
}

static noinline void __io_arm_ltimeout(struct io_kiocb *req)
{
	io_queue_linked_timeout(__io_prep_linked_timeout(req));
}

static inline void io_arm_ltimeout(struct io_kiocb *req)
{
	if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
		__io_arm_ltimeout(req);
}

static void io_prep_async_work(struct io_kiocb *req)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	struct io_ring_ctx *ctx = req->ctx;

	if (!(req->flags & REQ_F_CREDS)) {
		req->flags |= REQ_F_CREDS;
		req->creds = get_current_cred();
	}

	req->work.list.next = NULL;
	req->work.flags = 0;
	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
	if (req->flags & REQ_F_FORCE_ASYNC)
		req->work.flags |= IO_WQ_WORK_CONCURRENT;

	if (req->flags & REQ_F_ISREG) {
		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
			io_wq_hash_work(&req->work, file_inode(req->file));
	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
		if (def->unbound_nonreg_file)
			req->work.flags |= IO_WQ_WORK_UNBOUND;
	}
}

static void io_prep_async_link(struct io_kiocb *req)
{
	struct io_kiocb *cur;

	if (req->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock_irq(&ctx->timeout_lock);
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
	}
}

void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
{
	struct io_kiocb *link = io_prep_linked_timeout(req);
	struct io_uring_task *tctx = req->task->io_uring;

	BUG_ON(!tctx);
	BUG_ON(!tctx->io_wq);

	/* init ->work of the whole link before punting */
	io_prep_async_link(req);

	/*
	 * Not expected to happen, but if we do have a bug where this _can_
	 * happen, catch it here and ensure the request is marked as
	 * canceled. That will make io-wq go through the usual work cancel
	 * procedure rather than attempt to run this request (or create a new
	 * worker for it).
	 */
	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
		req->work.flags |= IO_WQ_WORK_CANCEL;

	trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
					req->opcode, req->flags, &req->work,
					io_wq_is_hashed(&req->work));
	io_wq_enqueue(tctx->io_wq, &req->work);
	if (link)
		io_queue_linked_timeout(link);
}

static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
	while (!list_empty(&ctx->defer_list)) {
		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
						struct io_defer_entry, list);

		if (req_need_defer(de->req, de->seq))
			break;
		list_del_init(&de->list);
		io_req_task_queue(de->req);
		kfree(de);
	}
}

void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
	if (ctx->off_timeout_used || ctx->drain_active) {
		spin_lock(&ctx->completion_lock);
		if (ctx->off_timeout_used)
			io_flush_timeouts(ctx);
		if (ctx->drain_active)
			io_queue_deferred(ctx);
		io_commit_cqring(ctx);
		spin_unlock(&ctx->completion_lock);
	}
	if (ctx->has_evfd)
		io_eventfd_signal(ctx);
}

static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	rcu_read_lock();
	/*
	 * rcu_dereference ctx->io_ev_fd once and use it for both checking
	 * and eventfd_signal
	 */
	ev_fd = rcu_dereference(ctx->io_ev_fd);

	/*
	 * Check again if ev_fd exists in case an io_eventfd_unregister call
	 * completed between the NULL check of ctx->io_ev_fd at the start of
	 * the function and rcu_read_lock.
	 */
	if (unlikely(!ev_fd))
		goto out;
	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		goto out;

	if (!ev_fd->eventfd_async || io_wq_current_is_worker())
		eventfd_signal(ev_fd->cq_ev_fd, 1);
out:
	rcu_read_unlock();
}

/*
 * This should only get called when at least one event has been posted.
 * Some applications rely on the eventfd notification count only changing
 * IFF a new CQE has been added to the CQ ring. There's no dependency on
 * a 1:1 relationship between how many times this function is called (and
 * hence the eventfd count) and number of CQEs posted to the CQ ring.
 */
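/*
 * Illustrative application-side consequence of the note above (a sketch, not
 * kernel code; peek_cqe() and consume_and_advance() are hypothetical
 * helpers): after an eventfd read wakes the waiter, drain the CQ until it is
 * empty rather than treating the eventfd count as the number of new CQEs.
 *
 *	eventfd_read(evfd, &cnt);		// cnt may not equal new CQEs
 *	while ((cqe = peek_cqe(ring)) != NULL)
 *		consume_and_advance(ring, cqe);
 */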
void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
	if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
		     ctx->has_evfd))
		__io_commit_cqring_flush(ctx);

	io_cqring_wake(ctx);
}

/* Returns true if there are no backlogged entries after the flush */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
	bool all_flushed, posted;
	size_t cqe_size = sizeof(struct io_uring_cqe);

	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
		return false;

	if (ctx->flags & IORING_SETUP_CQE32)
		cqe_size <<= 1;

	posted = false;
	spin_lock(&ctx->completion_lock);
	while (!list_empty(&ctx->cq_overflow_list)) {
		struct io_uring_cqe *cqe = io_get_cqe(ctx);
		struct io_overflow_cqe *ocqe;

		if (!cqe && !force)
			break;
		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
		if (cqe)
			memcpy(cqe, &ocqe->cqe, cqe_size);
		else
			io_account_cq_overflow(ctx);

		posted = true;
		list_del(&ocqe->list);
		kfree(ocqe);
	}

	all_flushed = list_empty(&ctx->cq_overflow_list);
	if (all_flushed) {
		clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
		atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
	}

	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	if (posted)
		io_cqring_ev_posted(ctx);
	return all_flushed;
}

static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
	bool ret = true;

	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
		/* iopoll syncs against uring_lock, not completion_lock */
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_lock(&ctx->uring_lock);
		ret = __io_cqring_overflow_flush(ctx, false);
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_unlock(&ctx->uring_lock);
	}

	return ret;
}

static void __io_put_task(struct task_struct *task, int nr)
{
	struct io_uring_task *tctx = task->io_uring;

	percpu_counter_sub(&tctx->inflight, nr);
	if (unlikely(atomic_read(&tctx->in_idle)))
		wake_up(&tctx->wait);
	put_task_struct_many(task, nr);
}

/* must be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task, int nr)
{
	if (likely(task == current))
		task->io_uring->cached_refs += nr;
	else
		__io_put_task(task, nr);
}

static void io_task_refs_refill(struct io_uring_task *tctx)
{
	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;

	percpu_counter_add(&tctx->inflight, refill);
	refcount_add(refill, &current->usage);
	tctx->cached_refs += refill;
}

static inline void io_get_task_refs(int nr)
{
	struct io_uring_task *tctx = current->io_uring;

	tctx->cached_refs -= nr;
	if (unlikely(tctx->cached_refs < 0))
		io_task_refs_refill(tctx);
}

static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
	struct io_uring_task *tctx = task->io_uring;
	unsigned int refs = tctx->cached_refs;

	if (refs) {
		tctx->cached_refs = 0;
		percpu_counter_sub(&tctx->inflight, refs);
		put_task_struct_many(task, refs);
	}
}

bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res,
			      u32 cflags, u64 extra1, u64 extra2)
{
	struct io_overflow_cqe *ocqe;
	size_t ocq_size = sizeof(struct io_overflow_cqe);
	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);

	if (is_cqe32)
		ocq_size += sizeof(struct io_uring_cqe);

	ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
	trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
	if (!ocqe) {
		/*
		 * If we're in ring overflow flush mode, or in task cancel mode,
		 * or cannot allocate an overflow entry, then we need to drop it
		 * on the floor.
		 */
		io_account_cq_overflow(ctx);
		set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
		return false;
	}
	if (list_empty(&ctx->cq_overflow_list)) {
		set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
		atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);

	}
	ocqe->cqe.user_data = user_data;
	ocqe->cqe.res = res;
	ocqe->cqe.flags = cflags;
	if (is_cqe32) {
		ocqe->cqe.big_cqe[0] = extra1;
		ocqe->cqe.big_cqe[1] = extra2;
	}
	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
	return true;
}

bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
		     u32 cflags)
{
	struct io_uring_cqe *cqe;

	ctx->cq_extra++;
	trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);

	/*
	 * If we can't get a cq entry, userspace overflowed the
	 * submission (by quite a lot). Increment the overflow count in
	 * the ring.
	 */
	cqe = io_get_cqe(ctx);
	if (likely(cqe)) {
		WRITE_ONCE(cqe->user_data, user_data);
		WRITE_ONCE(cqe->res, res);
		WRITE_ONCE(cqe->flags, cflags);

		if (ctx->flags & IORING_SETUP_CQE32) {
			WRITE_ONCE(cqe->big_cqe[0], 0);
			WRITE_ONCE(cqe->big_cqe[1], 0);
		}
		return true;
	}
	return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
}

static void __io_req_complete_put(struct io_kiocb *req)
{
	/*
	 * If we're the last reference to this request, add to our locked
	 * free_list cache.
	 */
	if (req_ref_put_and_test(req)) {
		struct io_ring_ctx *ctx = req->ctx;

		if (req->flags & IO_REQ_LINK_FLAGS) {
			if (req->flags & IO_DISARM_MASK)
				io_disarm_next(req);
			if (req->link) {
				io_req_task_queue(req->link);
				req->link = NULL;
			}
		}
		io_req_put_rsrc(req);
		/*
		 * Selected buffer deallocation in io_clean_op() assumes that
		 * we don't hold ->completion_lock. Clean them here to avoid
		 * deadlocks.
		 */
		io_put_kbuf_comp(req);
		io_dismantle_req(req);
		io_put_task(req->task, 1);
		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
		ctx->locked_free_nr++;
	}
}

void __io_req_complete_post(struct io_kiocb *req)
{
	if (!(req->flags & REQ_F_CQE_SKIP))
		__io_fill_cqe_req(req->ctx, req);
	__io_req_complete_put(req);
}

void io_req_complete_post(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	spin_lock(&ctx->completion_lock);
	__io_req_complete_post(req);
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_ev_posted(ctx);
}

inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags)
{
	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
		req->flags |= REQ_F_COMPLETE_INLINE;
	else
		io_req_complete_post(req);
}

void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
	req_set_fail(req);
	io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
	io_req_complete_post(req);
}

/*
 * Don't initialise the fields below on every allocation, but do that in
 * advance and keep them valid across allocations.
 */
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	req->ctx = ctx;
	req->link = NULL;
	req->async_data = NULL;
	/* not necessary, but safer to zero */
	req->cqe.res = 0;
}

static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
					struct io_submit_state *state)
{
	spin_lock(&ctx->completion_lock);
	wq_list_splice(&ctx->locked_free_list, &state->free_list);
	ctx->locked_free_nr = 0;
	spin_unlock(&ctx->completion_lock);
}

static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
{
	return !ctx->submit_state.free_list.next;
}

/*
 * A request might get retired back into the request caches even before opcode
 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 * Because of that, io_alloc_req() should be called only under ->uring_lock
 * and with extra caution to not get a request that is still worked on.
 */
static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	void *reqs[IO_REQ_ALLOC_BATCH];
	int ret, i;

	/*
	 * If we have more than a batch's worth of requests in our IRQ side
	 * locked cache, grab the lock and move them over to our submission
	 * side cache.
	 */
	if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
		io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
		if (!io_req_cache_empty(ctx))
			return true;
	}

	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);

	/*
	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
	 * retry single alloc to be on the safe side.
	 */
	if (unlikely(ret <= 0)) {
		reqs[0] = kmem_cache_alloc(req_cachep, gfp);
		if (!reqs[0])
			return false;
		ret = 1;
	}

	percpu_ref_get_many(&ctx->refs, ret);
	for (i = 0; i < ret; i++) {
		struct io_kiocb *req = reqs[i];

		io_preinit_req(req, ctx);
		io_req_add_to_cache(req, ctx);
	}
	return true;
}

static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
{
	if (unlikely(io_req_cache_empty(ctx)))
		return __io_alloc_req_refill(ctx);
	return true;
}

static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
{
	struct io_wq_work_node *node;

	node = wq_stack_extract(&ctx->submit_state.free_list);
	return container_of(node, struct io_kiocb, comp_list);
}

static inline void io_dismantle_req(struct io_kiocb *req)
{
	unsigned int flags = req->flags;

	if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
		io_clean_op(req);
	if (!(flags & REQ_F_FIXED_FILE))
		io_put_file(req->file);
}

__cold void io_free_req(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	io_req_put_rsrc(req);
	io_dismantle_req(req);
	io_put_task(req->task, 1);

	spin_lock(&ctx->completion_lock);
	wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
	ctx->locked_free_nr++;
	spin_unlock(&ctx->completion_lock);
}

static void __io_req_find_next_prep(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool posted;

	spin_lock(&ctx->completion_lock);
	posted = io_disarm_next(req);
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	if (posted)
		io_cqring_ev_posted(ctx);
}

static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
	struct io_kiocb *nxt;

	/*
	 * If LINK is set, we have dependent requests in this chain. If we
	 * didn't fail this request, queue the first one up, moving any other
	 * dependencies to the next request. In case of failure, fail the rest
	 * of the chain.
	 */
	if (unlikely(req->flags & IO_DISARM_MASK))
		__io_req_find_next_prep(req);
	nxt = req->link;
	req->link = NULL;
	return nxt;
}

static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
	if (!ctx)
		return;
	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
	if (*locked) {
		io_submit_flush_completions(ctx);
		mutex_unlock(&ctx->uring_lock);
		*locked = false;
	}
	percpu_ref_put(&ctx->refs);
}

static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
{
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_ev_posted(ctx);
}

static void handle_prev_tw_list(struct io_wq_work_node *node,
				struct io_ring_ctx **ctx, bool *uring_locked)
{
	if (*ctx && !*uring_locked)
		spin_lock(&(*ctx)->completion_lock);

	do {
		struct io_wq_work_node *next = node->next;
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);

		prefetch(container_of(next, struct io_kiocb, io_task_work.node));

		if (req->ctx != *ctx) {
			if (unlikely(!*uring_locked && *ctx))
				ctx_commit_and_unlock(*ctx);

			ctx_flush_and_put(*ctx, uring_locked);
			*ctx = req->ctx;
			/* if not contended, grab and improve batching */
			*uring_locked = mutex_trylock(&(*ctx)->uring_lock);
			percpu_ref_get(&(*ctx)->refs);
			if (unlikely(!*uring_locked))
				spin_lock(&(*ctx)->completion_lock);
		}
		if (likely(*uring_locked)) {
			req->io_task_work.func(req, uring_locked);
		} else {
			req->cqe.flags = io_put_kbuf_comp(req);
			__io_req_complete_post(req);
		}
		node = next;
	} while (node);

	if (unlikely(!*uring_locked))
		ctx_commit_and_unlock(*ctx);
}

static void handle_tw_list(struct io_wq_work_node *node,
			   struct io_ring_ctx **ctx, bool *locked)
{
	do {
		struct io_wq_work_node *next = node->next;
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);

		prefetch(container_of(next, struct io_kiocb, io_task_work.node));

		if (req->ctx != *ctx) {
			ctx_flush_and_put(*ctx, locked);
			*ctx = req->ctx;
			/* if not contended, grab and improve batching */
			*locked = mutex_trylock(&(*ctx)->uring_lock);
			percpu_ref_get(&(*ctx)->refs);
		}
		req->io_task_work.func(req, locked);
		node = next;
	} while (node);
}

void tctx_task_work(struct callback_head *cb)
{
	bool uring_locked = false;
	struct io_ring_ctx *ctx = NULL;
	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
						  task_work);

	while (1) {
		struct io_wq_work_node *node1, *node2;

		spin_lock_irq(&tctx->task_lock);
		node1 = tctx->prio_task_list.first;
		node2 = tctx->task_list.first;
		INIT_WQ_LIST(&tctx->task_list);
		INIT_WQ_LIST(&tctx->prio_task_list);
		if (!node2 && !node1)
			tctx->task_running = false;
		spin_unlock_irq(&tctx->task_lock);
		if (!node2 && !node1)
			break;

		if (node1)
			handle_prev_tw_list(node1, &ctx, &uring_locked);
		if (node2)
			handle_tw_list(node2, &ctx, &uring_locked);
		cond_resched();

		if (data_race(!tctx->task_list.first) &&
		    data_race(!tctx->prio_task_list.first) && uring_locked)
			io_submit_flush_completions(ctx);
	}

	ctx_flush_and_put(ctx, &uring_locked);

	/* relaxed read is enough as only the task itself sets ->in_idle */
	if (unlikely(atomic_read(&tctx->in_idle)))
		io_uring_drop_tctx_refs(current);
}

static void __io_req_task_work_add(struct io_kiocb *req,
				   struct io_uring_task *tctx,
				   struct io_wq_work_list *list)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_wq_work_node *node;
	unsigned long flags;
	bool running;

	spin_lock_irqsave(&tctx->task_lock, flags);
	wq_list_add_tail(&req->io_task_work.node, list);
	running = tctx->task_running;
	if (!running)
		tctx->task_running = true;
	spin_unlock_irqrestore(&tctx->task_lock, flags);

	/* task_work already pending, we're done */
	if (running)
		return;

	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);

	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
		return;

	spin_lock_irqsave(&tctx->task_lock, flags);
	tctx->task_running = false;
	node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
	spin_unlock_irqrestore(&tctx->task_lock, flags);

	while (node) {
		req = container_of(node, struct io_kiocb, io_task_work.node);
		node = node->next;
		if (llist_add(&req->io_task_work.fallback_node,
			      &req->ctx->fallback_llist))
			schedule_delayed_work(&req->ctx->fallback_work, 1);
	}
}

void io_req_task_work_add(struct io_kiocb *req)
{
	struct io_uring_task *tctx = req->task->io_uring;

	__io_req_task_work_add(req, tctx, &tctx->task_list);
}

void io_req_task_prio_work_add(struct io_kiocb *req)
{
	struct io_uring_task *tctx = req->task->io_uring;

	if (req->ctx->flags & IORING_SETUP_SQPOLL)
		__io_req_task_work_add(req, tctx, &tctx->prio_task_list);
	else
		__io_req_task_work_add(req, tctx, &tctx->task_list);
}

1079
static void io_req_tw_post(struct io_kiocb *req, bool *locked)
1080
{
1081
	io_req_complete_post(req);
1082
}
1083

1084
void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
1085
{
1086
	io_req_set_res(req, res, cflags);
1087
	req->io_task_work.func = io_req_tw_post;
1088
	io_req_task_work_add(req);
1089 1090
}

1091
static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
1092
{
1093
	/* not needed for normal modes, but SQPOLL depends on it */
1094
	io_tw_lock(req->ctx, locked);
1095
	io_req_complete_failed(req, req->cqe.res);
1096 1097
}

1098
void io_req_task_submit(struct io_kiocb *req, bool *locked)
1099
{
1100
	io_tw_lock(req->ctx, locked);
1101
	/* req->task == current here, checking PF_EXITING is safe */
1102
	if (likely(!(req->task->flags & PF_EXITING)))
P
Pavel Begunkov 已提交
1103
		io_queue_sqe(req);
1104
	else
1105
		io_req_complete_failed(req, -EFAULT);
1106 1107
}

1108
void io_req_task_queue_fail(struct io_kiocb *req, int ret)
1109
{
1110
	io_req_set_res(req, ret, 0);
1111
	req->io_task_work.func = io_req_task_cancel;
1112
	io_req_task_work_add(req);
1113 1114
}

1115
void io_req_task_queue(struct io_kiocb *req)
1116
{
1117
	req->io_task_work.func = io_req_task_submit;
1118
	io_req_task_work_add(req);
1119 1120
}

1121
void io_queue_next(struct io_kiocb *req)
1122
{
1123
	struct io_kiocb *nxt = io_req_find_next(req);
1124 1125

	if (nxt)
1126
		io_req_task_queue(nxt);
1127 1128
}

1129
void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
1130
	__must_hold(&ctx->uring_lock)
1131
{
1132
	struct task_struct *task = NULL;
1133
	int task_refs = 0;
1134

1135 1136 1137
	do {
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    comp_list);
1138

1139 1140 1141 1142 1143 1144
		if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
			if (req->flags & REQ_F_REFCOUNT) {
				node = req->comp_list.next;
				if (!req_ref_put_and_test(req))
					continue;
			}
1145 1146 1147 1148 1149 1150 1151 1152 1153
			if ((req->flags & REQ_F_POLLED) && req->apoll) {
				struct async_poll *apoll = req->apoll;

				if (apoll->double_poll)
					kfree(apoll->double_poll);
				list_add(&apoll->poll.wait.entry,
						&ctx->apoll_cache);
				req->flags &= ~REQ_F_POLLED;
			}
1154
			if (req->flags & IO_REQ_LINK_FLAGS)
1155
				io_queue_next(req);
1156 1157
			if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
				io_clean_op(req);
1158
		}
1159 1160
		if (!(req->flags & REQ_F_FIXED_FILE))
			io_put_file(req->file);
1161

1162
		io_req_put_rsrc_locked(req, ctx);
1163

1164 1165 1166 1167 1168 1169 1170
		if (req->task != task) {
			if (task)
				io_put_task(task, task_refs);
			task = req->task;
			task_refs = 0;
		}
		task_refs++;
1171
		node = req->comp_list.next;
1172
		io_req_add_to_cache(req, ctx);
1173
	} while (node);
1174 1175 1176

	if (task)
		io_put_task(task, task_refs);
1177 1178
}

1179
static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
1180
	__must_hold(&ctx->uring_lock)
1181
{
1182
	struct io_wq_work_node *node, *prev;
1183
	struct io_submit_state *state = &ctx->submit_state;
1184

1185 1186 1187 1188
	if (state->flush_cqes) {
		spin_lock(&ctx->completion_lock);
		wq_list_for_each(node, prev, &state->compl_reqs) {
			struct io_kiocb *req = container_of(node, struct io_kiocb,
1189
						    comp_list);
1190

1191 1192
			if (!(req->flags & REQ_F_CQE_SKIP))
				__io_fill_cqe_req(ctx, req);
1193 1194 1195 1196 1197 1198
		}

		io_commit_cqring(ctx);
		spin_unlock(&ctx->completion_lock);
		io_cqring_ev_posted(ctx);
		state->flush_cqes = false;
1199
	}
1200

1201
	io_free_batch_list(ctx, state->compl_reqs.first);
1202
	INIT_WQ_LIST(&state->compl_reqs);
1203 1204
}

1205 1206 1207 1208
/*
 * Drop reference to request, return next in chain (if there is one) if this
 * was the last reference to this request.
 */
1209
static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
1210
{
1211 1212
	struct io_kiocb *nxt = NULL;

1213
	if (req_ref_put_and_test(req)) {
1214
		if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
1215
			nxt = io_req_find_next(req);
P
Pavel Begunkov 已提交
1216
		io_free_req(req);
1217
	}
1218
	return nxt;
J
Jens Axboe 已提交
1219 1220
}

1221
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
1222 1223 1224
{
	/* See comment at the top of this file */
	smp_rmb();
1225
	return __io_cqring_events(ctx);
1226 1227
}

J
Jens Axboe 已提交
1228 1229 1230 1231
/*
 * We can't just wait for polled events to come to us, we have to actively
 * find and complete them.
 */
P
Pavel Begunkov 已提交
1232
static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
J
Jens Axboe 已提交
1233 1234 1235 1236 1237
{
	if (!(ctx->flags & IORING_SETUP_IOPOLL))
		return;

	mutex_lock(&ctx->uring_lock);
1238
	while (!wq_list_empty(&ctx->iopoll_list)) {
1239
		/* let it sleep and repeat later if can't complete a request */
1240
		if (io_do_iopoll(ctx, true) == 0)
1241
			break;
1242 1243 1244
		/*
		 * Ensure we allow local-to-the-cpu processing to take place,
		 * in this case we need to ensure that we reap all events.
1245
		 * Also let task_work, etc. to progress by releasing the mutex
1246
		 */
1247 1248 1249 1250 1251
		if (need_resched()) {
			mutex_unlock(&ctx->uring_lock);
			cond_resched();
			mutex_lock(&ctx->uring_lock);
		}
J
Jens Axboe 已提交
1252 1253 1254 1255
	}
	mutex_unlock(&ctx->uring_lock);
}

1256
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
J
Jens Axboe 已提交
1257
{
1258
	unsigned int nr_events = 0;
1259
	int ret = 0;
1260
	unsigned long check_cq;
1261

1262 1263 1264 1265 1266
	/*
	 * Don't enter poll loop if we already have events pending.
	 * If we do, we can potentially be spinning for commands that
	 * already triggered a CQE (eg in error).
	 */
1267 1268
	check_cq = READ_ONCE(ctx->check_cq);
	if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
1269 1270
		__io_cqring_overflow_flush(ctx, false);
	if (io_cqring_events(ctx))
1271
		return 0;
1272 1273 1274 1275 1276 1277 1278 1279

	/*
	 * Similarly do not spin if we have not informed the user of any
	 * dropped CQE.
	 */
	if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
		return -EBADR;

J
Jens Axboe 已提交
1280
	do {
1281 1282 1283 1284 1285 1286 1287 1288 1289 1290
		/*
		 * If a submit got punted to a workqueue, we can have the
		 * application entering polling for a command before it gets
		 * issued. That app will hold the uring_lock for the duration
		 * of the poll right here, so we need to take a breather every
		 * now and then to ensure that the issue has a chance to add
		 * the poll to the issued list. Otherwise we can spin here
		 * forever, while the workqueue is stuck trying to acquire the
		 * very same mutex.
		 */
1291
		if (wq_list_empty(&ctx->iopoll_list)) {
1292 1293
			u32 tail = ctx->cached_cq_tail;

1294
			mutex_unlock(&ctx->uring_lock);
1295
			io_run_task_work();
1296
			mutex_lock(&ctx->uring_lock);
J
Jens Axboe 已提交
1297

1298 1299
			/* some requests don't go through iopoll_list */
			if (tail != ctx->cached_cq_tail ||
1300
			    wq_list_empty(&ctx->iopoll_list))
1301
				break;
1302
		}
1303 1304 1305 1306 1307 1308
		ret = io_do_iopoll(ctx, !min);
		if (ret < 0)
			break;
		nr_events += ret;
		ret = 0;
	} while (nr_events < min && !need_resched());

	return ret;
}
inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
	if (*locked) {
		req->cqe.flags |= io_put_kbuf(req, 0);
		req->flags |= REQ_F_COMPLETE_INLINE;
		io_req_add_compl_list(req);
	} else {
		req->cqe.flags |= io_put_kbuf(req, IO_URING_F_UNLOCKED);
		io_req_complete_post(req);
	}
}

/*
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from an io_do_iopoll() thread before the issuer is done
 * accessing the kiocb cookie.
 */
static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;

	/* workqueue context doesn't hold uring_lock, grab it now */
	if (unlikely(needs_lock))
		mutex_lock(&ctx->uring_lock);

	/*
	 * Track whether we have multiple files in our lists. This will impact
	 * how we do polling eventually, not spinning if we're on potentially
	 * different devices.
	 */
	if (wq_list_empty(&ctx->iopoll_list)) {
		ctx->poll_multi_queue = false;
	} else if (!ctx->poll_multi_queue) {
		struct io_kiocb *list_req;

		list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
					comp_list);
		if (list_req->file != req->file)
			ctx->poll_multi_queue = true;
	}

	/*
	 * For fast devices, IO may have already completed. If it has, add
	 * it to the front so we find it first.
	 */
	if (READ_ONCE(req->iopoll_completed))
		wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
	else
		wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);

	if (unlikely(needs_lock)) {
		/*
		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
		 * in sq thread task context or in io worker task context. If
		 * current task context is sq thread, we don't need to check
		 * whether should wake up sq thread.
		 */
		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
		    wq_has_sleeper(&ctx->sq_data->wait))
			wake_up(&ctx->sq_data->wait);

		mutex_unlock(&ctx->uring_lock);
	}
}

static bool io_bdev_nowait(struct block_device *bdev)
{
	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
}

/*
 * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
 * inline.
 */
static bool __io_file_supports_nowait(struct file *file, umode_t mode)
{
	if (S_ISBLK(mode)) {
		if (IS_ENABLED(CONFIG_BLOCK) &&
		    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
			return true;
		return false;
	}
	if (S_ISSOCK(mode))
		return true;
	if (S_ISREG(mode)) {
		if (IS_ENABLED(CONFIG_BLOCK) &&
		    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
		    !io_is_uring_fops(file))
			return true;
		return false;
	}

	/* any ->read/write should understand O_NONBLOCK */
	if (file->f_flags & O_NONBLOCK)
		return true;
	return file->f_mode & FMODE_NOWAIT;
}

/*
 * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
 * inline.
 */
unsigned int io_file_get_flags(struct file *file)
{
	umode_t mode = file_inode(file)->i_mode;
	unsigned int res = 0;

	if (S_ISREG(mode))
		res |= FFS_ISREG;
	if (__io_file_supports_nowait(file, mode))
		res |= FFS_NOWAIT;
	if (io_file_need_scm(file))
		res |= FFS_SCM;
	return res;
}

bool io_alloc_async_data(struct io_kiocb *req)
{
	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
	if (req->async_data) {
		req->flags |= REQ_F_ASYNC_DATA;
		return false;
	}
	return true;
}

int io_req_prep_async(struct io_kiocb *req)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];

	/* assign early for deferred execution for non-fixed file */
	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
		req->file = io_file_get_normal(req, req->cqe.fd);
	if (!def->prep_async)
		return 0;
	if (WARN_ON_ONCE(req_has_async_data(req)))
		return -EFAULT;
	if (io_alloc_async_data(req))
		return -EAGAIN;

	return def->prep_async(req);
}

static u32 io_get_sequence(struct io_kiocb *req)
{
	u32 seq = req->ctx->cached_sq_head;
	struct io_kiocb *cur;

	/* need original cached_sq_head, but it was increased for each req */
	io_for_each_link(cur, req)
		seq--;
	return seq;
}

static __cold void io_drain_req(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_defer_entry *de;
	int ret;
	u32 seq = io_get_sequence(req);

	/* Still need defer if there is pending req in defer list. */
	spin_lock(&ctx->completion_lock);
	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
		spin_unlock(&ctx->completion_lock);
queue:
		ctx->drain_active = false;
		io_req_task_queue(req);
		return;
	}
	spin_unlock(&ctx->completion_lock);

	ret = io_req_prep_async(req);
	if (ret) {
fail:
		io_req_complete_failed(req, ret);
		return;
	}
	io_prep_async_link(req);
	de = kmalloc(sizeof(*de), GFP_KERNEL);
	if (!de) {
		ret = -ENOMEM;
		goto fail;
	}

	spin_lock(&ctx->completion_lock);
	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
		spin_unlock(&ctx->completion_lock);
		kfree(de);
		goto queue;
	}

	trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
	de->req = req;
	de->seq = seq;
	list_add_tail(&de->list, &ctx->defer_list);
	spin_unlock(&ctx->completion_lock);
}

static void io_clean_op(struct io_kiocb *req)
{
	if (req->flags & REQ_F_BUFFER_SELECTED) {
		spin_lock(&req->ctx->completion_lock);
		io_put_kbuf_comp(req);
		spin_unlock(&req->ctx->completion_lock);
	}

	if (req->flags & REQ_F_NEED_CLEANUP) {
		const struct io_op_def *def = &io_op_defs[req->opcode];

		if (def->cleanup)
			def->cleanup(req);
	}
	if ((req->flags & REQ_F_POLLED) && req->apoll) {
		kfree(req->apoll->double_poll);
		kfree(req->apoll);
		req->apoll = NULL;
	}
	if (req->flags & REQ_F_INFLIGHT) {
		struct io_uring_task *tctx = req->task->io_uring;

		atomic_dec(&tctx->inflight_tracked);
	}
	if (req->flags & REQ_F_CREDS)
		put_cred(req->creds);
	if (req->flags & REQ_F_ASYNC_DATA) {
		kfree(req->async_data);
		req->async_data = NULL;
	}
	req->flags &= ~IO_REQ_CLEAN_FLAGS;
}

static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
{
	if (req->file || !io_op_defs[req->opcode].needs_file)
		return true;

	if (req->flags & REQ_F_FIXED_FILE)
		req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
	else
		req->file = io_file_get_normal(req, req->cqe.fd);

	return !!req->file;
}

static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	const struct cred *creds = NULL;
	int ret;

	if (unlikely(!io_assign_file(req, issue_flags)))
		return -EBADF;

	if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
		creds = override_creds(req->creds);

	if (!def->audit_skip)
		audit_uring_entry(req->opcode);

	ret = def->issue(req, issue_flags);

	if (!def->audit_skip)
		audit_uring_exit(!ret, ret);

	if (creds)
		revert_creds(creds);

	if (ret == IOU_OK)
		__io_req_complete(req, issue_flags);
	else if (ret != IOU_ISSUE_SKIP_COMPLETE)
		return ret;

	/* If the op doesn't have a file, we're not polling for it */
	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
		io_iopoll_req_issued(req, issue_flags);

	return 0;
}

int io_poll_issue(struct io_kiocb *req, bool *locked)
{
	io_tw_lock(req->ctx, locked);
	if (unlikely(req->task->flags & PF_EXITING))
		return -EFAULT;
	return io_issue_sqe(req, IO_URING_F_NONBLOCK);
}

struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	req = io_put_req_find_next(req);
	return req ? &req->work : NULL;
}

void io_wq_submit_work(struct io_wq_work *work)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
	const struct io_op_def *def = &io_op_defs[req->opcode];
	unsigned int issue_flags = IO_URING_F_UNLOCKED;
	bool needs_poll = false;
	int ret = 0, err = -ECANCELED;

	/* one will be dropped by ->io_free_work() after returning to io-wq */
	if (!(req->flags & REQ_F_REFCOUNT))
		__io_req_set_refcount(req, 2);
	else
		req_ref_get(req);

	io_arm_ltimeout(req);

	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
	if (work->flags & IO_WQ_WORK_CANCEL) {
fail:
		io_req_task_queue_fail(req, err);
		return;
	}
	if (!io_assign_file(req, issue_flags)) {
		err = -EBADF;
		work->flags |= IO_WQ_WORK_CANCEL;
		goto fail;
	}

	if (req->flags & REQ_F_FORCE_ASYNC) {
		bool opcode_poll = def->pollin || def->pollout;

		if (opcode_poll && file_can_poll(req->file)) {
			needs_poll = true;
			issue_flags |= IO_URING_F_NONBLOCK;
		}
	}

	do {
		ret = io_issue_sqe(req, issue_flags);
		if (ret != -EAGAIN)
			break;
		/*
		 * We can get EAGAIN for iopolled IO even though we're
		 * forcing a sync submission from here, since we can't
		 * wait for request slots on the block side.
		 */
		if (!needs_poll) {
			if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
				break;
			cond_resched();
			continue;
		}

		if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
			return;
		/* aborted or ready, in either case retry blocking */
		needs_poll = false;
		issue_flags &= ~IO_URING_F_NONBLOCK;
	} while (1);

	/* avoid locking problems by failing it from a clean context */
	if (ret < 0)
		io_req_task_queue_fail(req, ret);
}

inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
				      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = NULL;
	unsigned long file_ptr;

	io_ring_submit_lock(ctx, issue_flags);

	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
		goto out;
	fd = array_index_nospec(fd, ctx->nr_user_files);
	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
	file = (struct file *) (file_ptr & FFS_MASK);
	file_ptr &= ~FFS_MASK;
	/* mask in overlapping REQ_F and FFS bits */
	req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
	io_req_set_rsrc_node(req, ctx, 0);
	WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap));
out:
	io_ring_submit_unlock(ctx, issue_flags);
	return file;
}

struct file *io_file_get_normal(struct io_kiocb *req, int fd)
{
	struct file *file = fget(fd);

	trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);

	/* we don't allow fixed io_uring files */
	if (file && io_is_uring_fops(file))
		io_req_track_inflight(req);
	return file;
}

static void io_queue_async(struct io_kiocb *req, int ret)
	__must_hold(&req->ctx->uring_lock)
{
	struct io_kiocb *linked_timeout;

	if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
		io_req_complete_failed(req, ret);
		return;
	}

	linked_timeout = io_prep_linked_timeout(req);

	switch (io_arm_poll_handler(req, 0)) {
	case IO_APOLL_READY:
		io_req_task_queue(req);
		break;
	case IO_APOLL_ABORTED:
		/*
		 * Queued up for async execution, worker will release
		 * submit reference when the iocb is actually submitted.
		 */
		io_kbuf_recycle(req, 0);
		io_queue_iowq(req, NULL);
		break;
	case IO_APOLL_OK:
		break;
	}

	if (linked_timeout)
		io_queue_linked_timeout(linked_timeout);
}

static inline void io_queue_sqe(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{
	int ret;

	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);

	if (req->flags & REQ_F_COMPLETE_INLINE) {
		io_req_add_compl_list(req);
		return;
	}
	/*
	 * We async punt it if the file wasn't marked NOWAIT, or if the file
	 * doesn't support non-blocking read/write attempts
	 */
	if (likely(!ret))
		io_arm_ltimeout(req);
	else
		io_queue_async(req, ret);
}

static void io_queue_sqe_fallback(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{
	if (unlikely(req->flags & REQ_F_FAIL)) {
		/*
		 * We don't submit, fail them all, for that replace hardlinks
		 * with normal links. Extra REQ_F_LINK is tolerated.
		 */
		req->flags &= ~REQ_F_HARDLINK;
		req->flags |= REQ_F_LINK;
		io_req_complete_failed(req, req->cqe.res);
1778 1779
	} else if (unlikely(req->ctx->drain_active)) {
		io_drain_req(req);
1780 1781 1782 1783 1784 1785
	} else {
		int ret = io_req_prep_async(req);

		if (unlikely(ret))
			io_req_complete_failed(req, ret);
		else
1786
			io_queue_iowq(req, NULL);
J
Jens Axboe 已提交
1787
	}
1788 1789
}

1790 1791 1792 1793 1794 1795 1796 1797
/*
 * Check SQE restrictions (opcode and flags).
 *
 * Returns 'true' if SQE is allowed, 'false' otherwise.
 */
static inline bool io_check_restriction(struct io_ring_ctx *ctx,
					struct io_kiocb *req,
					unsigned int sqe_flags)
1798
{
1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810
	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
		return false;

	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
	    ctx->restrictions.sqe_flags_required)
		return false;

	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
			  ctx->restrictions.sqe_flags_required))
		return false;

	return true;
1811 1812
}

1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823
static void io_init_req_drain(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *head = ctx->submit_state.link.head;

	ctx->drain_active = true;
	if (head) {
		/*
		 * If we need to drain a request in the middle of a link, drain
		 * the head request and the next request/link after the current
		 * link. Considering sequential execution of links,
1824
		 * REQ_F_IO_DRAIN will be maintained for every request of our
1825 1826
		 * link.
		 */
1827
		head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
1828 1829 1830 1831
		ctx->drain_next = true;
	}
}

1832 1833
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
		       const struct io_uring_sqe *sqe)
1834
	__must_hold(&ctx->uring_lock)
1835
{
1836
	const struct io_op_def *def;
1837
	unsigned int sqe_flags;
1838
	int personality;
1839
	u8 opcode;
1840

P
Pavel Begunkov 已提交
1841
	/* req is partially pre-initialised, see io_preinit_req() */
1842
	req->opcode = opcode = READ_ONCE(sqe->opcode);
1843 1844
	/* same numerical values with corresponding REQ_F_*, safe to copy */
	req->flags = sqe_flags = READ_ONCE(sqe->flags);
1845
	req->cqe.user_data = READ_ONCE(sqe->user_data);
1846
	req->file = NULL;
1847
	req->rsrc_node = NULL;
1848 1849
	req->task = current;

1850 1851
	if (unlikely(opcode >= IORING_OP_LAST)) {
		req->opcode = 0;
1852
		return -EINVAL;
1853
	}
1854
	def = &io_op_defs[opcode];
1855 1856 1857 1858
	if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
		/* enforce forwards compatibility on users */
		if (sqe_flags & ~SQE_VALID_FLAGS)
			return -EINVAL;
1859
		if (sqe_flags & IOSQE_BUFFER_SELECT) {
1860
			if (!def->buffer_select)
1861 1862 1863
				return -EOPNOTSUPP;
			req->buf_index = READ_ONCE(sqe->buf_group);
		}
1864 1865 1866 1867 1868
		if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
			ctx->drain_disabled = true;
		if (sqe_flags & IOSQE_IO_DRAIN) {
			if (ctx->drain_disabled)
				return -EOPNOTSUPP;
1869
			io_init_req_drain(req);
1870
		}
1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881
	}
	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
		if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
			return -EACCES;
		/* knock it to the slow queue path, will be drained there */
		if (ctx->drain_active)
			req->flags |= REQ_F_FORCE_ASYNC;
		/* if there is no link, we're at "next" request and need to drain */
		if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
			ctx->drain_next = false;
			ctx->drain_active = true;
1882
			req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
1883
		}
1884
	}
1885

1886
	if (!def->ioprio && sqe->ioprio)
1887
		return -EINVAL;
1888
	if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
1889 1890
		return -EINVAL;

1891
	if (def->needs_file) {
P
Pavel Begunkov 已提交
1892 1893
		struct io_submit_state *state = &ctx->submit_state;

1894
		req->cqe.fd = READ_ONCE(sqe->fd);
J
Jens Axboe 已提交
1895

P
Pavel Begunkov 已提交
1896 1897 1898 1899
		/*
		 * Plug now if we have more than 2 IO left after this, and the
		 * target is potentially a read/write to block based storage.
		 */
1900
		if (state->need_plug && def->plug) {
P
Pavel Begunkov 已提交
1901 1902
			state->plug_started = true;
			state->need_plug = false;
1903
			blk_start_plug_nr_ios(&state->plug, state->submit_nr);
P
Pavel Begunkov 已提交
1904
		}
1905
	}
1906

1907 1908
	personality = READ_ONCE(sqe->personality);
	if (personality) {
1909 1910
		int ret;

1911 1912
		req->creds = xa_load(&ctx->personalities, personality);
		if (!req->creds)
1913
			return -EINVAL;
1914
		get_cred(req->creds);
1915 1916 1917 1918 1919
		ret = security_uring_override_creds(req->creds);
		if (ret) {
			put_cred(req->creds);
			return ret;
		}
1920
		req->flags |= REQ_F_CREDS;
1921
	}
1922

1923
	return def->prep(req, sqe);
1924 1925
}

1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963
static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
				      struct io_kiocb *req, int ret)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_submit_link *link = &ctx->submit_state.link;
	struct io_kiocb *head = link->head;

	trace_io_uring_req_failed(sqe, ctx, req, ret);

	/*
	 * Avoid breaking links in the middle as it renders links with SQPOLL
	 * unusable. Instead of failing eagerly, continue assembling the link if
	 * applicable and mark the head with REQ_F_FAIL. The link flushing code
	 * should find the flag and handle the rest.
	 */
	req_fail_link_node(req, ret);
	if (head && !(head->flags & REQ_F_FAIL))
		req_fail_link_node(head, -ECANCELED);

	if (!(req->flags & IO_REQ_LINK_FLAGS)) {
		if (head) {
			link->last->link = req;
			link->head = NULL;
			req = head;
		}
		io_queue_sqe_fallback(req);
		return ret;
	}

	if (head)
		link->last->link = req;
	else
		link->head = req;
	link->last = req;
	return 0;
}

static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
1964
			 const struct io_uring_sqe *sqe)
1965
	__must_hold(&ctx->uring_lock)
J
Jens Axboe 已提交
1966
{
1967
	struct io_submit_link *link = &ctx->submit_state.link;
1968
	int ret;
J
Jens Axboe 已提交
1969

1970
	ret = io_init_req(ctx, req, sqe);
1971 1972
	if (unlikely(ret))
		return io_submit_fail_init(sqe, req, ret);
1973

1974
	/* don't need @sqe from now on */
1975
	trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
1976 1977
				  req->flags, true,
				  ctx->flags & IORING_SETUP_SQPOLL);
1978

J
Jens Axboe 已提交
1979 1980 1981 1982 1983 1984 1985
	/*
	 * If we already have a head request, queue this one for async
	 * submittal once the head completes. If we don't have a head but
	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
	 * submitted sync once the chain is complete. If none of those
	 * conditions are true (normal request), then just queue it.
	 */
1986
	if (unlikely(link->head)) {
1987 1988 1989 1990 1991
		ret = io_req_prep_async(req);
		if (unlikely(ret))
			return io_submit_fail_init(sqe, req, ret);

		trace_io_uring_link(ctx, req, link->head);
1992
		link->last->link = req;
1993
		link->last = req;
1994

1995
		if (req->flags & IO_REQ_LINK_FLAGS)
1996
			return 0;
1997 1998
		/* last request of the link, flush it */
		req = link->head;
1999
		link->head = NULL;
2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011
		if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
			goto fallback;

	} else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
					  REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
		if (req->flags & IO_REQ_LINK_FLAGS) {
			link->head = req;
			link->last = req;
		} else {
fallback:
			io_queue_sqe_fallback(req);
		}
2012
		return 0;
J
Jens Axboe 已提交
2013
	}
2014

2015
	io_queue_sqe(req);
2016
	return 0;
J
Jens Axboe 已提交
2017 2018
}

2019 2020 2021
/*
 * Batched submission is done, ensure local IO is flushed out.
 */
2022
static void io_submit_state_end(struct io_ring_ctx *ctx)
2023
{
2024 2025
	struct io_submit_state *state = &ctx->submit_state;

2026 2027
	if (unlikely(state->link.head))
		io_queue_sqe_fallback(state->link.head);
2028
	/* flush only after queuing links as they can generate completions */
2029
	io_submit_flush_completions(ctx);
J
Jens Axboe 已提交
2030 2031
	if (state->plug_started)
		blk_finish_plug(&state->plug);
2032 2033 2034 2035 2036 2037
}

/*
 * Start submission side cache.
 */
static void io_submit_state_start(struct io_submit_state *state,
2038
				  unsigned int max_ios)
2039
{
J
Jens Axboe 已提交
2040
	state->plug_started = false;
P
Pavel Begunkov 已提交
2041
	state->need_plug = max_ios > 2;
2042
	state->submit_nr = max_ios;
2043 2044
	/* set only head, no need to init link_last in advance */
	state->link.head = NULL;
2045 2046
}

J
Jens Axboe 已提交
2047 2048
static void io_commit_sqring(struct io_ring_ctx *ctx)
{
2049
	struct io_rings *rings = ctx->rings;
J
Jens Axboe 已提交
2050

2051 2052 2053 2054 2055 2056
	/*
	 * Ensure any loads from the SQEs are done at this point,
	 * since once we write the new head, the application could
	 * write new data to them.
	 */
	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
J
Jens Axboe 已提交
2057 2058 2059
}

/*
F
Fam Zheng 已提交
2060
 * Fetch an sqe, if one is available. Note this returns a pointer to memory
J
Jens Axboe 已提交
2061 2062 2063 2064 2065 2066
 * that is mapped by userspace. This means that care needs to be taken to
 * ensure that reads are stable, as we cannot rely on userspace always
 * being a good citizen. If members of the sqe are validated and then later
 * used, it's important that those reads are done through READ_ONCE() to
 * prevent a re-load down the line.
 */
2067
static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
J
Jens Axboe 已提交
2068
{
2069
	unsigned head, mask = ctx->sq_entries - 1;
P
Pavel Begunkov 已提交
2070
	unsigned sq_idx = ctx->cached_sq_head++ & mask;
J
Jens Axboe 已提交
2071 2072 2073 2074 2075 2076 2077 2078 2079

	/*
	 * The cached sq head (or cq tail) serves two purposes:
	 *
	 * 1) allows us to batch the cost of updating the user visible
	 *    head updates.
	 * 2) allows the kernel side to track the head on its own, even
	 *    though the application is the one updating it.
	 */
P
Pavel Begunkov 已提交
2080
	head = READ_ONCE(ctx->sq_array[sq_idx]);
2081 2082 2083 2084
	if (likely(head < ctx->sq_entries)) {
		/* double index for 128-byte SQEs, twice as long */
		if (ctx->flags & IORING_SETUP_SQE128)
			head <<= 1;
2085
		return &ctx->sq_sqes[head];
2086
	}
J
Jens Axboe 已提交
2087 2088

	/* drop invalid entries */
2089 2090 2091
	ctx->cq_extra--;
	WRITE_ONCE(ctx->rings->sq_dropped,
		   READ_ONCE(ctx->rings->sq_dropped) + 1);
2092 2093 2094
	return NULL;
}

2095
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
2096
	__must_hold(&ctx->uring_lock)
J
Jens Axboe 已提交
2097
{
2098
	unsigned int entries = io_sqring_entries(ctx);
2099 2100
	unsigned int left;
	int ret;
J
Jens Axboe 已提交
2101

2102
	if (unlikely(!entries))
2103
		return 0;
2104
	/* make sure SQ entry isn't read before tail */
2105 2106 2107
	ret = left = min3(nr, ctx->sq_entries, entries);
	io_get_task_refs(left);
	io_submit_state_start(&ctx->submit_state, left);
J
Jens Axboe 已提交
2108

2109
	do {
2110
		const struct io_uring_sqe *sqe;
2111
		struct io_kiocb *req;
2112

2113
		if (unlikely(!io_alloc_req_refill(ctx)))
2114
			break;
2115
		req = io_alloc_req(ctx);
2116 2117
		sqe = io_get_sqe(ctx);
		if (unlikely(!sqe)) {
2118
			io_req_add_to_cache(req, ctx);
2119 2120
			break;
		}
J
Jens Axboe 已提交
2121

2122 2123 2124 2125 2126 2127 2128 2129
		/*
		 * Continue submitting even for sqe failure if the
		 * ring was setup with IORING_SETUP_SUBMIT_ALL
		 */
		if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
			left--;
			break;
2130
		}
2131
	} while (--left);
2132

2133 2134 2135 2136 2137 2138
	if (unlikely(left)) {
		ret -= left;
		/* try again if it submitted nothing and can't allocate a req */
		if (!ret && io_req_cache_empty(ctx))
			ret = -EAGAIN;
		current->io_uring->cached_refs += left;
2139
	}
J
Jens Axboe 已提交
2140

2141
	io_submit_state_end(ctx);
2142 2143
	 /* Commit SQ ring head once we've consumed and submitted all SQEs */
	io_commit_sqring(ctx);
2144
	return ret;
J
Jens Axboe 已提交
2145 2146
}

2147 2148 2149
struct io_wait_queue {
	struct wait_queue_entry wq;
	struct io_ring_ctx *ctx;
2150
	unsigned cq_tail;
2151 2152 2153
	unsigned nr_timeouts;
};

2154
static inline bool io_should_wake(struct io_wait_queue *iowq)
2155 2156
{
	struct io_ring_ctx *ctx = iowq->ctx;
2157
	int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
2158 2159

	/*
2160
	 * Wake up if we have enough events, or if a timeout occurred since we
2161 2162 2163
	 * started waiting. For timeouts, we always want to return to userspace,
	 * regardless of event count.
	 */
2164
	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
2165 2166 2167 2168 2169 2170 2171 2172
}

static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
			    int wake_flags, void *key)
{
	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
							wq);

2173 2174 2175 2176
	/*
	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
	 * the task, and the next invocation will do it.
	 */
2177 2178
	if (io_should_wake(iowq) ||
	    test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq))
2179 2180
		return autoremove_wake_function(curr, mode, wake_flags, key);
	return -1;
2181 2182
}

2183
int io_run_task_work_sig(void)
2184 2185 2186
{
	if (io_run_task_work())
		return 1;
2187
	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
2188
		return -ERESTARTSYS;
2189 2190 2191
	if (task_sigpending(current))
		return -EINTR;
	return 0;
2192 2193
}

2194 2195 2196
/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
					  struct io_wait_queue *iowq,
2197
					  ktime_t timeout)
2198 2199
{
	int ret;
2200
	unsigned long check_cq;
2201 2202 2203 2204 2205

	/* make sure we run task_work before checking for signals */
	ret = io_run_task_work_sig();
	if (ret || io_should_wake(iowq))
		return ret;
2206
	check_cq = READ_ONCE(ctx->check_cq);
2207
	/* let the caller flush overflows, retry */
2208
	if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
2209
		return 1;
2210 2211
	if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
		return -EBADR;
2212 2213 2214
	if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
		return -ETIME;
	return 1;
2215 2216
}

J
Jens Axboe 已提交
2217 2218 2219 2220 2221
/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
2222 2223
			  const sigset_t __user *sig, size_t sigsz,
			  struct __kernel_timespec __user *uts)
J
Jens Axboe 已提交
2224
{
2225
	struct io_wait_queue iowq;
2226
	struct io_rings *rings = ctx->rings;
2227
	ktime_t timeout = KTIME_MAX;
2228
	int ret;
J
Jens Axboe 已提交
2229

2230
	do {
2231
		io_cqring_overflow_flush(ctx);
2232
		if (io_cqring_events(ctx) >= min_events)
2233
			return 0;
2234
		if (!io_run_task_work())
2235 2236
			break;
	} while (1);
J
Jens Axboe 已提交
2237 2238

	if (sig) {
2239 2240 2241
#ifdef CONFIG_COMPAT
		if (in_compat_syscall())
			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
2242
						      sigsz);
2243 2244
		else
#endif
2245
			ret = set_user_sigmask(sig, sigsz);
2246

J
Jens Axboe 已提交
2247 2248 2249 2250
		if (ret)
			return ret;
	}

2251 2252 2253 2254 2255 2256 2257 2258
	if (uts) {
		struct timespec64 ts;

		if (get_timespec64(&ts, uts))
			return -EFAULT;
		timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
	}

2259 2260 2261 2262
	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
	iowq.wq.private = current;
	INIT_LIST_HEAD(&iowq.wq.entry);
	iowq.ctx = ctx;
2263
	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
2264
	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
2265

2266
	trace_io_uring_cqring_wait(ctx, min_events);
2267
	do {
2268
		/* if we can't even flush overflow, don't wait for more */
2269
		if (!io_cqring_overflow_flush(ctx)) {
2270 2271 2272
			ret = -EBUSY;
			break;
		}
P
Pavel Begunkov 已提交
2273
		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
2274
						TASK_INTERRUPTIBLE);
2275
		ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
2276
		cond_resched();
2277
	} while (ret > 0);
2278

2279
	finish_wait(&ctx->cq_wait, &iowq.wq);
2280
	restore_saved_sigmask_unless(ret == -EINTR);
J
Jens Axboe 已提交
2281

2282
	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
J
Jens Axboe 已提交
2283 2284
}

2285
static void io_mem_free(void *ptr)
2286
{
2287
	struct page *page;
2288

2289 2290
	if (!ptr)
		return;
2291

2292 2293 2294
	page = virt_to_head_page(ptr);
	if (put_page_testzero(page))
		free_compound_page(page);
2295 2296
}

2297
static void *io_mem_alloc(size_t size)
2298
{
2299
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
2300

2301
	return (void *) __get_free_pages(gfp, get_order(size));
2302 2303
}

2304 2305
static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
				unsigned int cq_entries, size_t *sq_offset)
J
Jens Axboe 已提交
2306
{
2307 2308
	struct io_rings *rings;
	size_t off, sq_array_size;
J
Jens Axboe 已提交
2309

2310 2311 2312 2313 2314 2315 2316
	off = struct_size(rings, cqes, cq_entries);
	if (off == SIZE_MAX)
		return SIZE_MAX;
	if (ctx->flags & IORING_SETUP_CQE32) {
		if (check_shl_overflow(off, 1, &off))
			return SIZE_MAX;
	}
2317

2318 2319 2320 2321 2322
#ifdef CONFIG_SMP
	off = ALIGN(off, SMP_CACHE_BYTES);
	if (off == 0)
		return SIZE_MAX;
#endif
2323

2324 2325
	if (sq_offset)
		*sq_offset = off;
2326

2327 2328 2329
	sq_array_size = array_size(sizeof(u32), sq_entries);
	if (sq_array_size == SIZE_MAX)
		return SIZE_MAX;
J
Jens Axboe 已提交
2330

2331 2332
	if (check_add_overflow(off, sq_array_size, &off))
		return SIZE_MAX;
2333

2334
	return off;
2335 2336
}

2337 2338
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
2339
{
2340 2341 2342
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;
2343

2344 2345 2346 2347
	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;
2348

2349 2350
	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;
2351

2352 2353 2354
	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;
2355

2356 2357 2358 2359 2360 2361 2362 2363 2364 2365
	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);
		kfree(ev_fd);
		return ret;
	}
	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	return 0;
2366 2367
}

2368
static void io_eventfd_put(struct rcu_head *rcu)
2369
{
2370
	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
2371

2372 2373
	eventfd_ctx_put(ev_fd->cq_ev_fd);
	kfree(ev_fd);
2374 2375
}

2376
static int io_eventfd_unregister(struct io_ring_ctx *ctx)
2377
{
2378 2379 2380 2381 2382 2383 2384 2385 2386 2387
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		call_rcu(&ev_fd->rcu, io_eventfd_put);
		return 0;
	}
2388

2389
	return -ENXIO;
2390 2391
}

2392
static void io_req_caches_free(struct io_ring_ctx *ctx)
J
Jens Axboe 已提交
2393
{
2394
	struct io_submit_state *state = &ctx->submit_state;
2395
	int nr = 0;
P
Pavel Begunkov 已提交
2396

2397
	mutex_lock(&ctx->uring_lock);
2398
	io_flush_cached_locked_reqs(ctx, state);
2399

2400
	while (!io_req_cache_empty(ctx)) {
2401 2402
		struct io_wq_work_node *node;
		struct io_kiocb *req;
2403

2404 2405 2406
		node = wq_stack_extract(&state->free_list);
		req = container_of(node, struct io_kiocb, comp_list);
		kmem_cache_free(req_cachep, req);
2407
		nr++;
2408
	}
2409 2410
	if (nr)
		percpu_ref_put_many(&ctx->refs, nr);
2411 2412 2413
	mutex_unlock(&ctx->uring_lock);
}

2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425
static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
{
	struct async_poll *apoll;

	while (!list_empty(&ctx->apoll_cache)) {
		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
						poll.wait.entry);
		list_del(&apoll->poll.wait.entry);
		kfree(apoll);
	}
}

P
Pavel Begunkov 已提交
2426
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
J
Jens Axboe 已提交
2427
{
2428
	io_sq_thread_finish(ctx);
2429

2430
	if (ctx->mm_account) {
2431 2432
		mmdrop(ctx->mm_account);
		ctx->mm_account = NULL;
2433
	}
J
Jens Axboe 已提交
2434

2435
	io_rsrc_refs_drop(ctx);
2436 2437 2438 2439
	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
	io_wait_rsrc_data(ctx->buf_data);
	io_wait_rsrc_data(ctx->file_data);

2440
	mutex_lock(&ctx->uring_lock);
2441
	if (ctx->buf_data)
2442
		__io_sqe_buffers_unregister(ctx);
2443
	if (ctx->file_data)
2444
		__io_sqe_files_unregister(ctx);
2445 2446
	if (ctx->rings)
		__io_cqring_overflow_flush(ctx, true);
2447
	io_eventfd_unregister(ctx);
2448
	io_flush_apoll_cache(ctx);
2449
	mutex_unlock(&ctx->uring_lock);
2450
	io_destroy_buffers(ctx);
2451 2452
	if (ctx->sq_creds)
		put_cred(ctx->sq_creds);
J
Jens Axboe 已提交
2453

P
Pavel Begunkov 已提交
2454 2455 2456
	/* there are no registered resources left, nobody uses it */
	if (ctx->rsrc_node)
		io_rsrc_node_destroy(ctx->rsrc_node);
2457
	if (ctx->rsrc_backup_node)
2458
		io_rsrc_node_destroy(ctx->rsrc_backup_node);
P
Pavel Begunkov 已提交
2459
	flush_delayed_work(&ctx->rsrc_put_work);
2460
	flush_delayed_work(&ctx->fallback_work);
P
Pavel Begunkov 已提交
2461 2462 2463

	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
	WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
J
Jens Axboe 已提交
2464

J
Jens Axboe 已提交
2465
#if defined(CONFIG_UNIX)
2466 2467
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
J
Jens Axboe 已提交
2468
		sock_release(ctx->ring_sock);
2469
	}
J
Jens Axboe 已提交
2470
#endif
2471
	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
J
Jens Axboe 已提交
2472

2473
	io_mem_free(ctx->rings);
J
Jens Axboe 已提交
2474 2475 2476 2477
	io_mem_free(ctx->sq_sqes);

	percpu_ref_exit(&ctx->refs);
	free_uid(ctx->user);
2478
	io_req_caches_free(ctx);
2479 2480
	if (ctx->hash_map)
		io_wq_put_hash(ctx->hash_map);
2481
	kfree(ctx->cancel_hash);
2482
	kfree(ctx->dummy_ubuf);
2483 2484
	kfree(ctx->io_bl);
	xa_destroy(&ctx->io_bl_xa);
J
Jens Axboe 已提交
2485 2486 2487 2488 2489 2490 2491 2492
	kfree(ctx);
}

static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
	struct io_ring_ctx *ctx = file->private_data;
	__poll_t mask = 0;

2493
	poll_wait(file, &ctx->cq_wait, wait);
2494 2495 2496 2497
	/*
	 * synchronizes with barrier from wq_has_sleeper call in
	 * io_commit_cqring
	 */
J
Jens Axboe 已提交
2498
	smp_rmb();
2499
	if (!io_sqring_full(ctx))
J
Jens Axboe 已提交
2500
		mask |= EPOLLOUT | EPOLLWRNORM;
2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514

	/*
	 * Don't flush cqring overflow list here, just do a simple check.
	 * Otherwise there could possible be ABBA deadlock:
	 *      CPU0                    CPU1
	 *      ----                    ----
	 * lock(&ctx->uring_lock);
	 *                              lock(&ep->mtx);
	 *                              lock(&ctx->uring_lock);
	 * lock(&ep->mtx);
	 *
	 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
	 * pushs them to do the flush.
	 */
2515 2516
	if (io_cqring_events(ctx) ||
	    test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
J
Jens Axboe 已提交
2517 2518 2519 2520 2521
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}

2522
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
2523
{
J
Jens Axboe 已提交
2524
	const struct cred *creds;
2525

2526
	creds = xa_erase(&ctx->personalities, id);
J
Jens Axboe 已提交
2527 2528
	if (creds) {
		put_cred(creds);
2529
		return 0;
J
Jens Axboe 已提交
2530
	}
2531 2532 2533 2534

	return -EINVAL;
}

2535 2536 2537
struct io_tctx_exit {
	struct callback_head		task_work;
	struct completion		completion;
2538
	struct io_ring_ctx		*ctx;
2539 2540
};

P
Pavel Begunkov 已提交
2541
static __cold void io_tctx_exit_cb(struct callback_head *cb)
2542 2543 2544 2545 2546 2547 2548 2549 2550 2551
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_exit *work;

	work = container_of(cb, struct io_tctx_exit, task_work);
	/*
	 * When @in_idle, we're in cancellation and it's racy to remove the
	 * node. It'll be removed by the end of cancellation, just ignore it.
	 */
	if (!atomic_read(&tctx->in_idle))
2552
		io_uring_del_tctx_node((unsigned long)work->ctx);
2553 2554 2555
	complete(&work->completion);
}

P
Pavel Begunkov 已提交
2556
static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
2557 2558 2559 2560 2561 2562
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	return req->ctx == data;
}

P
Pavel Begunkov 已提交
2563
static __cold void io_ring_exit_work(struct work_struct *work)
2564
{
2565
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
2566
	unsigned long timeout = jiffies + HZ * 60 * 5;
2567
	unsigned long interval = HZ / 20;
2568 2569 2570
	struct io_tctx_exit exit;
	struct io_tctx_node *node;
	int ret;
2571

2572 2573 2574 2575 2576 2577
	/*
	 * If we're doing polled IO and end up having requests being
	 * submitted async (out-of-line), then completions can come in while
	 * we're waiting for refs to drop. We need to reap these manually,
	 * as nobody else will be looking for them.
	 */
2578
	do {
2579
		io_uring_try_cancel_requests(ctx, NULL, true);
2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590
		if (ctx->sq_data) {
			struct io_sq_data *sqd = ctx->sq_data;
			struct task_struct *tsk;

			io_sq_thread_park(sqd);
			tsk = sqd->thread;
			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
				io_wq_cancel_cb(tsk->io_uring->io_wq,
						io_cancel_ctx_cb, ctx, true);
			io_sq_thread_unpark(sqd);
		}
2591

2592 2593
		io_req_caches_free(ctx);

2594 2595 2596 2597 2598
		if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
			/* there is little hope left, don't run it too often */
			interval = HZ * 60;
		}
	} while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
2599

2600 2601 2602
	init_completion(&exit.completion);
	init_task_work(&exit.task_work, io_tctx_exit_cb);
	exit.ctx = ctx;
2603 2604 2605
	/*
	 * Some may use context even when all refs and requests have been put,
	 * and they are free to do so while still holding uring_lock or
2606
	 * completion_lock, see io_req_task_submit(). Apart from other work,
2607 2608
	 * this lock/unlock section also waits them to finish.
	 */
2609 2610
	mutex_lock(&ctx->uring_lock);
	while (!list_empty(&ctx->tctx_list)) {
2611 2612
		WARN_ON_ONCE(time_after(jiffies, timeout));

2613 2614
		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
					ctx_node);
2615 2616
		/* don't spin on a single task if cancellation failed */
		list_rotate_left(&ctx->tctx_list);
2617 2618 2619 2620 2621 2622 2623 2624 2625
		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
		if (WARN_ON_ONCE(ret))
			continue;

		mutex_unlock(&ctx->uring_lock);
		wait_for_completion(&exit.completion);
		mutex_lock(&ctx->uring_lock);
	}
	mutex_unlock(&ctx->uring_lock);
2626 2627
	spin_lock(&ctx->completion_lock);
	spin_unlock(&ctx->completion_lock);
2628

2629 2630 2631
	io_ring_ctx_free(ctx);
}

P
Pavel Begunkov 已提交
2632
static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
J
Jens Axboe 已提交
2633
{
2634 2635 2636
	unsigned long index;
	struct creds *creds;

J
Jens Axboe 已提交
2637 2638
	mutex_lock(&ctx->uring_lock);
	percpu_ref_kill(&ctx->refs);
2639
	if (ctx->rings)
2640
		__io_cqring_overflow_flush(ctx, true);
2641 2642
	xa_for_each(&ctx->personalities, index, creds)
		io_unregister_personality(ctx, index);
J
Jens Axboe 已提交
2643 2644
	mutex_unlock(&ctx->uring_lock);

2645 2646 2647 2648 2649 2650 2651
	/* failed during ring init, it couldn't have issued any requests */
	if (ctx->rings) {
		io_kill_timeouts(ctx, NULL, true);
		io_poll_remove_all(ctx, NULL, true);
		/* if we failed setting up the ctx, we might not have any rings */
		io_iopoll_try_reap_events(ctx);
	}
2652

2653
	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
2654 2655 2656 2657 2658 2659 2660
	/*
	 * Use system_unbound_wq to avoid spawning tons of event kworkers
	 * if we're exiting a ton of rings at the same time. It just adds
	 * noise and overhead, there's no discernable change in runtime
	 * over using system_wq.
	 */
	queue_work(system_unbound_wq, &ctx->exit_work);
J
Jens Axboe 已提交
2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671
}

static int io_uring_release(struct inode *inode, struct file *file)
{
	struct io_ring_ctx *ctx = file->private_data;

	file->private_data = NULL;
	io_ring_ctx_wait_and_kill(ctx);
	return 0;
}

2672 2673
struct io_task_cancel {
	struct task_struct *task;
2674
	bool all;
2675
};
2676

2677
static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
2678
{
2679
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2680
	struct io_task_cancel *cancel = data;
2681

2682
	return io_match_task_safe(req, cancel->task, cancel->all);
2683 2684
}

P
Pavel Begunkov 已提交
2685 2686 2687
static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all)
2688
{
2689
	struct io_defer_entry *de;
2690 2691
	LIST_HEAD(list);

2692
	spin_lock(&ctx->completion_lock);
2693
	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
2694
		if (io_match_task_safe(de->req, task, cancel_all)) {
2695 2696 2697 2698
			list_cut_position(&list, &ctx->defer_list, &de->list);
			break;
		}
	}
2699
	spin_unlock(&ctx->completion_lock);
2700 2701
	if (list_empty(&list))
		return false;
2702 2703 2704 2705

	while (!list_empty(&list)) {
		de = list_first_entry(&list, struct io_defer_entry, list);
		list_del_init(&de->list);
2706
		io_req_complete_failed(de->req, -ECANCELED);
2707 2708
		kfree(de);
	}
2709
	return true;
2710 2711
}

P
Pavel Begunkov 已提交
2712
static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735
{
	struct io_tctx_node *node;
	enum io_wq_cancel cret;
	bool ret = false;

	mutex_lock(&ctx->uring_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		/*
		 * io_wq will stay alive while we hold uring_lock, because it's
		 * killed after ctx nodes, which requires to take the lock.
		 */
		if (!tctx || !tctx->io_wq)
			continue;
		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
	}
	mutex_unlock(&ctx->uring_lock);

	return ret;
}

P
Pavel Begunkov 已提交
2736 2737 2738
static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
						struct task_struct *task,
						bool cancel_all)
2739
{
2740
	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
2741
	struct io_uring_task *tctx = task ? task->io_uring : NULL;
2742

2743 2744 2745 2746
	/* failed during ring init, it couldn't have issued any requests */
	if (!ctx->rings)
		return;

2747 2748 2749 2750
	while (1) {
		enum io_wq_cancel cret;
		bool ret = false;

2751 2752 2753 2754 2755 2756 2757
		if (!task) {
			ret |= io_uring_try_cancel_iowq(ctx);
		} else if (tctx && tctx->io_wq) {
			/*
			 * Cancels requests of all rings, not only @ctx, but
			 * it's fine as the task is in exit/exec.
			 */
2758
			cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
2759 2760 2761 2762 2763
					       &cancel, true);
			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
		}

		/* SQPOLL thread does its own polling */
2764
		if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
2765
		    (ctx->sq_data && ctx->sq_data->thread == current)) {
2766
			while (!wq_list_empty(&ctx->iopoll_list)) {
2767 2768 2769 2770 2771
				io_iopoll_try_reap_events(ctx);
				ret = true;
			}
		}

2772 2773 2774
		ret |= io_cancel_defer_files(ctx, task, cancel_all);
		ret |= io_poll_remove_all(ctx, task, cancel_all);
		ret |= io_kill_timeouts(ctx, task, cancel_all);
2775 2776
		if (task)
			ret |= io_run_task_work();
2777 2778 2779 2780 2781 2782
		if (!ret)
			break;
		cond_resched();
	}
}

2783
static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
2784
{
2785
	if (tracked)
2786
		return atomic_read(&tctx->inflight_tracked);
2787 2788 2789
	return percpu_counter_sum(&tctx->inflight);
}

2790 2791
/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
2792
 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
2793
 */
2794
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
2795
{
2796
	struct io_uring_task *tctx = current->io_uring;
2797
	struct io_ring_ctx *ctx;
2798 2799
	s64 inflight;
	DEFINE_WAIT(wait);
2800

2801 2802
	WARN_ON_ONCE(sqd && sqd->thread != current);

2803 2804
	if (!current->io_uring)
		return;
2805 2806 2807
	if (tctx->io_wq)
		io_wq_exit_start(tctx->io_wq);

2808 2809
	atomic_inc(&tctx->in_idle);
	do {
2810
		io_uring_drop_tctx_refs(current);
2811
		/* read completions before cancelations */
2812
		inflight = tctx_inflight(tctx, !cancel_all);
2813 2814
		if (!inflight)
			break;
2815

2816 2817 2818
		if (!sqd) {
			struct io_tctx_node *node;
			unsigned long index;
2819

2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831
			xa_for_each(&tctx->xa, index, node) {
				/* sqpoll task will cancel all its requests */
				if (node->ctx->sq_data)
					continue;
				io_uring_try_cancel_requests(node->ctx, current,
							     cancel_all);
			}
		} else {
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_uring_try_cancel_requests(ctx, current,
							     cancel_all);
		}
2832

2833 2834
		prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
		io_run_task_work();
2835
		io_uring_drop_tctx_refs(current);
2836

2837
		/*
2838 2839 2840
		 * If we've seen completions, retry without waiting. This
		 * avoids a race where a completion comes in before we did
		 * prepare_to_wait().
2841
		 */
2842
		if (inflight == tctx_inflight(tctx, !cancel_all))
2843
			schedule();
2844
		finish_wait(&tctx->wait, &wait);
2845
	} while (1);
2846

P
Pavel Begunkov 已提交
2847
	io_uring_clean_tctx(tctx);
2848
	if (cancel_all) {
2849 2850 2851 2852 2853
		/*
		 * We shouldn't run task_works after cancel, so just leave
		 * ->in_idle set for normal exit.
		 */
		atomic_dec(&tctx->in_idle);
2854 2855 2856
		/* for exec all current's requests should be gone, kill tctx */
		__io_uring_free(current);
	}
2857 2858
}

2859
void __io_uring_cancel(bool cancel_all)
2860
{
2861
	io_uring_cancel_generic(cancel_all, NULL);
2862 2863
}

2864 2865
static void *io_uring_validate_mmap_request(struct file *file,
					    loff_t pgoff, size_t sz)
J
Jens Axboe 已提交
2866 2867
{
	struct io_ring_ctx *ctx = file->private_data;
2868
	loff_t offset = pgoff << PAGE_SHIFT;
J
Jens Axboe 已提交
2869 2870 2871 2872 2873
	struct page *page;
	void *ptr;

	switch (offset) {
	case IORING_OFF_SQ_RING:
2874 2875
	case IORING_OFF_CQ_RING:
		ptr = ctx->rings;
J
Jens Axboe 已提交
2876 2877 2878 2879 2880
		break;
	case IORING_OFF_SQES:
		ptr = ctx->sq_sqes;
		break;
	default:
2881
		return ERR_PTR(-EINVAL);
J
Jens Axboe 已提交
2882 2883 2884
	}

	page = virt_to_head_page(ptr);
2885
	if (sz > page_size(page))
2886 2887 2888 2889 2890 2891 2892
		return ERR_PTR(-EINVAL);

	return ptr;
}

#ifdef CONFIG_MMU

P
Pavel Begunkov 已提交
2893
static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
2894 2895 2896 2897 2898 2899 2900 2901
{
	size_t sz = vma->vm_end - vma->vm_start;
	unsigned long pfn;
	void *ptr;

	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);
J
Jens Axboe 已提交
2902 2903 2904 2905 2906

	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933
#else /* !CONFIG_MMU */

static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
}

static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}

static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *ptr;

	ptr = io_uring_validate_mmap_request(file, pgoff, len);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	return (unsigned long) ptr;
}

#endif /* !CONFIG_MMU */

2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946
static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
{
	if (flags & IORING_ENTER_EXT_ARG) {
		struct io_uring_getevents_arg arg;

		if (argsz != sizeof(arg))
			return -EINVAL;
		if (copy_from_user(&arg, argp, sizeof(arg)))
			return -EFAULT;
	}
	return 0;
}

2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970
static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
			  struct __kernel_timespec __user **ts,
			  const sigset_t __user **sig)
{
	struct io_uring_getevents_arg arg;

	/*
	 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
	 * is just a pointer to the sigset_t.
	 */
	if (!(flags & IORING_ENTER_EXT_ARG)) {
		*sig = (const sigset_t __user *) argp;
		*ts = NULL;
		return 0;
	}

	/*
	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
	 * timespec and sigset_t pointers if good.
	 */
	if (*argsz != sizeof(arg))
		return -EINVAL;
	if (copy_from_user(&arg, argp, sizeof(arg)))
		return -EFAULT;
2971 2972
	if (arg.pad)
		return -EINVAL;
2973 2974 2975 2976 2977 2978
	*sig = u64_to_user_ptr(arg.sigmask);
	*argsz = arg.sigmask_sz;
	*ts = u64_to_user_ptr(arg.ts);
	return 0;
}

J
Jens Axboe 已提交
2979
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
2980 2981
		u32, min_complete, u32, flags, const void __user *, argp,
		size_t, argsz)
J
Jens Axboe 已提交
2982 2983 2984
{
	struct io_ring_ctx *ctx;
	struct fd f;
2985
	long ret;
J
Jens Axboe 已提交
2986

2987
	io_run_task_work();
2988

2989
	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
2990 2991
			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
			       IORING_ENTER_REGISTERED_RING)))
J
Jens Axboe 已提交
2992 2993
		return -EINVAL;

2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004
	/*
	 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
	 * need only dereference our task private array to find it.
	 */
	if (flags & IORING_ENTER_REGISTERED_RING) {
		struct io_uring_task *tctx = current->io_uring;

		if (!tctx || fd >= IO_RINGFD_REG_MAX)
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		f.file = tctx->registered_rings[fd];
3005
		f.flags = 0;
3006 3007 3008
	} else {
		f = fdget(fd);
	}
J
Jens Axboe 已提交
3009

3010 3011 3012
	if (unlikely(!f.file))
		return -EBADF;

J
Jens Axboe 已提交
3013
	ret = -EOPNOTSUPP;
3014
	if (unlikely(!io_is_uring_fops(f.file)))
J
Jens Axboe 已提交
3015 3016 3017 3018
		goto out_fput;

	ret = -ENXIO;
	ctx = f.file->private_data;
3019
	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
J
Jens Axboe 已提交
3020 3021
		goto out_fput;

3022
	ret = -EBADFD;
3023
	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
3024 3025
		goto out;

J
Jens Axboe 已提交
3026 3027 3028 3029 3030
	/*
	 * For SQ polling, the thread will do all submissions and completions.
	 * Just return the requested submit count, and wake the thread if
	 * we were asked to.
	 */
3031
	ret = 0;
J
Jens Axboe 已提交
3032
	if (ctx->flags & IORING_SETUP_SQPOLL) {
3033
		io_cqring_overflow_flush(ctx);
3034

3035 3036
		if (unlikely(ctx->sq_data->thread == NULL)) {
			ret = -EOWNERDEAD;
3037
			goto out;
3038
		}
J
Jens Axboe 已提交
3039
		if (flags & IORING_ENTER_SQ_WAKEUP)
3040
			wake_up(&ctx->sq_data->wait);
3041 3042 3043 3044 3045
		if (flags & IORING_ENTER_SQ_WAIT) {
			ret = io_sqpoll_wait_sq(ctx);
			if (ret)
				goto out;
		}
3046
		ret = to_submit;
3047
	} else if (to_submit) {
3048
		ret = io_uring_add_tctx_node(ctx);
3049 3050
		if (unlikely(ret))
			goto out;
3051

J
Jens Axboe 已提交
3052
		mutex_lock(&ctx->uring_lock);
3053 3054
		ret = io_submit_sqes(ctx, to_submit);
		if (ret != to_submit) {
3055
			mutex_unlock(&ctx->uring_lock);
3056
			goto out;
3057 3058 3059 3060
		}
		if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
			goto iopoll_locked;
		mutex_unlock(&ctx->uring_lock);
J
Jens Axboe 已提交
3061 3062
	}
	if (flags & IORING_ENTER_GETEVENTS) {
3063
		int ret2;
3064
		if (ctx->syscall_iopoll) {
3065 3066 3067 3068 3069 3070 3071 3072
			/*
			 * We disallow the app entering submit/complete with
			 * polling, but we still need to lock the ring to
			 * prevent racing with polled issue that got punted to
			 * a workqueue.
			 */
			mutex_lock(&ctx->uring_lock);
iopoll_locked:
3073 3074 3075 3076 3077
			ret2 = io_validate_ext_arg(flags, argp, argsz);
			if (likely(!ret2)) {
				min_complete = min(min_complete,
						   ctx->cq_entries);
				ret2 = io_iopoll_check(ctx, min_complete);
3078 3079
			}
			mutex_unlock(&ctx->uring_lock);
J
Jens Axboe 已提交
3080
		} else {
3081 3082 3083
			const sigset_t __user *sig;
			struct __kernel_timespec __user *ts;

3084 3085 3086 3087 3088 3089 3090
			ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
			if (likely(!ret2)) {
				min_complete = min(min_complete,
						   ctx->cq_entries);
				ret2 = io_cqring_wait(ctx, min_complete, sig,
						      argsz, ts);
			}
J
Jens Axboe 已提交
3091
		}
3092

3093
		if (!ret) {
3094
			ret = ret2;
J
Jens Axboe 已提交
3095

3096 3097 3098 3099 3100 3101 3102 3103
			/*
			 * EBADR indicates that one or more CQE were dropped.
			 * Once the user has been informed we can clear the bit
			 * as they are obviously ok with those drops.
			 */
			if (unlikely(ret2 == -EBADR))
				clear_bit(IO_CHECK_CQ_DROPPED_BIT,
					  &ctx->check_cq);
J
Jens Axboe 已提交
3104
		}
J
Jens Axboe 已提交
3105 3106
	}

3107
out:
3108
	percpu_ref_put(&ctx->refs);
J
Jens Axboe 已提交
3109
out_fput:
3110
	fdput(f);
3111
	return ret;
J
Jens Axboe 已提交
3112 3113 3114 3115 3116
}

static const struct file_operations io_uring_fops = {
	.release	= io_uring_release,
	.mmap		= io_uring_mmap,
3117 3118 3119 3120
#ifndef CONFIG_MMU
	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
J
Jens Axboe 已提交
3121
	.poll		= io_uring_poll,
3122
#ifdef CONFIG_PROC_FS
3123
	.show_fdinfo	= io_uring_show_fdinfo,
3124
#endif
J
Jens Axboe 已提交
3125 3126
};

3127 3128 3129 3130 3131
bool io_is_uring_fops(struct file *file)
{
	return file->f_op == &io_uring_fops;
}

P
Pavel Begunkov 已提交
3132 3133
static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
					 struct io_uring_params *p)
J
Jens Axboe 已提交
3134
{
3135 3136
	struct io_rings *rings;
	size_t size, sq_array_offset;
J
Jens Axboe 已提交
3137

3138 3139 3140 3141
	/* make sure these are sane, as we already accounted them */
	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

3142
	size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
3143 3144 3145 3146 3147
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	rings = io_mem_alloc(size);
	if (!rings)
J
Jens Axboe 已提交
3148 3149
		return -ENOMEM;

3150 3151 3152 3153 3154 3155
	ctx->rings = rings;
	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
	rings->sq_ring_mask = p->sq_entries - 1;
	rings->cq_ring_mask = p->cq_entries - 1;
	rings->sq_ring_entries = p->sq_entries;
	rings->cq_ring_entries = p->cq_entries;
J
Jens Axboe 已提交
3156

3157 3158 3159 3160
	if (p->flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
3161 3162 3163
	if (size == SIZE_MAX) {
		io_mem_free(ctx->rings);
		ctx->rings = NULL;
J
Jens Axboe 已提交
3164
		return -EOVERFLOW;
3165
	}
J
Jens Axboe 已提交
3166 3167

	ctx->sq_sqes = io_mem_alloc(size);
3168 3169 3170
	if (!ctx->sq_sqes) {
		io_mem_free(ctx->rings);
		ctx->rings = NULL;
J
Jens Axboe 已提交
3171
		return -ENOMEM;
3172
	}
J
Jens Axboe 已提交
3173 3174 3175 3176

	return 0;
}

3177 3178 3179 3180 3181 3182 3183 3184
static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
{
	int ret, fd;

	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
	if (fd < 0)
		return fd;

3185
	ret = io_uring_add_tctx_node(ctx);
3186 3187 3188 3189 3190 3191 3192 3193
	if (ret) {
		put_unused_fd(fd);
		return ret;
	}
	fd_install(fd, file);
	return fd;
}

J
Jens Axboe 已提交
3194 3195 3196 3197 3198 3199
/*
 * Allocate an anonymous fd, this is what constitutes the application
 * visible backing of an io_uring instance. The application mmaps this
 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
 * we have to tie this fd to a socket for file garbage collection purposes.
 */
3200
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
J
Jens Axboe 已提交
3201 3202
{
	struct file *file;
3203
#if defined(CONFIG_UNIX)
J
Jens Axboe 已提交
3204 3205 3206 3207 3208
	int ret;

	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
				&ctx->ring_sock);
	if (ret)
3209
		return ERR_PTR(ret);
J
Jens Axboe 已提交
3210 3211
#endif

3212 3213
	file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
					 O_RDWR | O_CLOEXEC, NULL);
J
Jens Axboe 已提交
3214
#if defined(CONFIG_UNIX)
3215 3216 3217 3218 3219
	if (IS_ERR(file)) {
		sock_release(ctx->ring_sock);
		ctx->ring_sock = NULL;
	} else {
		ctx->ring_sock->file = file;
3220
	}
J
Jens Axboe 已提交
3221
#endif
3222
	return file;
J
Jens Axboe 已提交
3223 3224
}

P
Pavel Begunkov 已提交
3225 3226
static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
				  struct io_uring_params __user *params)
J
Jens Axboe 已提交
3227 3228
{
	struct io_ring_ctx *ctx;
3229
	struct file *file;
J
Jens Axboe 已提交
3230 3231
	int ret;

3232
	if (!entries)
J
Jens Axboe 已提交
3233
		return -EINVAL;
3234 3235 3236 3237 3238
	if (entries > IORING_MAX_ENTRIES) {
		if (!(p->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		entries = IORING_MAX_ENTRIES;
	}
J
Jens Axboe 已提交
3239 3240 3241 3242 3243

	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
	 * since the sqes are only used at submission time. This allows for
3244 3245 3246
	 * some flexibility in overcommitting a bit. If the application has
	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
	 * of CQ ring entries manually.
J
Jens Axboe 已提交
3247 3248
	 */
	p->sq_entries = roundup_pow_of_two(entries);
3249 3250 3251 3252 3253 3254
	if (p->flags & IORING_SETUP_CQSIZE) {
		/*
		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
		 * to a power-of-two, if it isn't already. We do NOT impose
		 * any cq vs sq ring sizing.
		 */
3255
		if (!p->cq_entries)
3256
			return -EINVAL;
3257 3258 3259 3260 3261
		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
			if (!(p->flags & IORING_SETUP_CLAMP))
				return -EINVAL;
			p->cq_entries = IORING_MAX_CQ_ENTRIES;
		}
3262 3263 3264
		p->cq_entries = roundup_pow_of_two(p->cq_entries);
		if (p->cq_entries < p->sq_entries)
			return -EINVAL;
3265 3266 3267
	} else {
		p->cq_entries = 2 * p->sq_entries;
	}
J
Jens Axboe 已提交
3268 3269

	ctx = io_ring_ctx_alloc(p);
J
Jens Axboe 已提交
3270
	if (!ctx)
J
Jens Axboe 已提交
3271
		return -ENOMEM;
3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282

	/*
	 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
	 * space applications don't need to do io completion events
	 * polling again, they can rely on io_sq_thread to do polling
	 * work, which can reduce cpu usage and uring_lock contention.
	 */
	if (ctx->flags & IORING_SETUP_IOPOLL &&
	    !(ctx->flags & IORING_SETUP_SQPOLL))
		ctx->syscall_iopoll = 1;

J
Jens Axboe 已提交
3283
	ctx->compat = in_compat_syscall();
J
Jens Axboe 已提交
3284 3285
	if (!capable(CAP_IPC_LOCK))
		ctx->user = get_uid(current_user());
3286

3287
	/*
3288 3289
	 * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
	 * COOP_TASKRUN is set, then IPIs are never needed by the app.
3290
	 */
3291 3292 3293
	ret = -EINVAL;
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		/* IPI related flags don't make sense with SQPOLL */
3294 3295
		if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
				  IORING_SETUP_TASKRUN_FLAG))
3296
			goto err;
3297
		ctx->notify_method = TWA_SIGNAL_NO_IPI;
3298 3299 3300
	} else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
		ctx->notify_method = TWA_SIGNAL_NO_IPI;
	} else {
3301 3302
		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
			goto err;
3303
		ctx->notify_method = TWA_SIGNAL;
3304
	}
3305

3306 3307 3308 3309 3310 3311
	/*
	 * This is just grabbed for accounting purposes. When a process exits,
	 * the mm is exited and dropped before the files, hence we need to hang
	 * on to this mm purely for the purposes of being able to unaccount
	 * memory (locked/pinned vm). It's not used for anything else.
	 */
3312
	mmgrab(current->mm);
3313
	ctx->mm_account = current->mm;
3314

J
Jens Axboe 已提交
3315 3316 3317 3318
	ret = io_allocate_scq_urings(ctx, p);
	if (ret)
		goto err;

3319
	ret = io_sq_offload_create(ctx, p);
J
Jens Axboe 已提交
3320 3321
	if (ret)
		goto err;
3322
	/* always set a rsrc node */
3323 3324 3325
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		goto err;
3326
	io_rsrc_node_switch(ctx, NULL);
J
Jens Axboe 已提交
3327 3328

	memset(&p->sq_off, 0, sizeof(p->sq_off));
3329 3330 3331 3332 3333 3334 3335
	p->sq_off.head = offsetof(struct io_rings, sq.head);
	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
J
Jens Axboe 已提交
3336 3337

	memset(&p->cq_off, 0, sizeof(p->cq_off));
3338 3339 3340 3341 3342 3343
	p->cq_off.head = offsetof(struct io_rings, cq.head);
	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
	p->cq_off.cqes = offsetof(struct io_rings, cqes);
3344
	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
3345

3346 3347
	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
3348
			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
3349
			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
3350
			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
3351 3352
			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
			IORING_FEAT_LINKED_FILE;
3353 3354 3355 3356 3357

	if (copy_to_user(params, p, sizeof(*p))) {
		ret = -EFAULT;
		goto err;
	}
3358

3359 3360 3361 3362 3363 3364
	file = io_uring_get_file(ctx);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto err;
	}

3365 3366 3367 3368
	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
3369 3370 3371 3372 3373 3374
	ret = io_uring_install_fd(ctx, file);
	if (ret < 0) {
		/* fput will clean it up */
		fput(file);
		return ret;
	}
3375

3376
	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
J
Jens Axboe 已提交
3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399
	return ret;
err:
	io_ring_ctx_wait_and_kill(ctx);
	return ret;
}

/*
 * Sets up an aio uring context, and returns the fd. Applications asks for a
 * ring size, we return the actual sq/cq ring sizes (among other things) in the
 * params structure passed in.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
	struct io_uring_params p;
	int i;

	if (copy_from_user(&p, params, sizeof(p)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
		if (p.resv[i])
			return -EINVAL;
	}

J
Jens Axboe 已提交
3400
	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
3401
			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
3402
			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
3403
			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
3404
			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
S
Stefan Roesch 已提交
3405
			IORING_SETUP_SQE128 | IORING_SETUP_CQE32))
J
Jens Axboe 已提交
3406 3407
		return -EINVAL;

3408
	return io_uring_create(entries, &p, params);
J
Jens Axboe 已提交
3409 3410 3411 3412 3413 3414 3415 3416
}

SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}

P
Pavel Begunkov 已提交
3417 3418
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_op_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

3457 3458
static int io_register_personality(struct io_ring_ctx *ctx)
{
J
Jens Axboe 已提交
3459
	const struct cred *creds;
3460
	u32 id;
J
Jens Axboe 已提交
3461
	int ret;
3462

J
Jens Axboe 已提交
3463
	creds = get_current_cred();
J
Jens Axboe 已提交
3464

3465 3466
	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
3467 3468 3469 3470 3471
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
3472 3473
}

P
Pavel Begunkov 已提交
3474 3475
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
3476 3477 3478 3479 3480
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

3481 3482 3483 3484
	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

3485
	/* We allow only a single restrictions registration */
3486
	if (ctx->restrictions.registered)
3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

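/*
 * IORING_REGISTER_ENABLE_RINGS: move a ring created with
 * IORING_SETUP_R_DISABLED into the enabled state, arming any previously
 * registered restrictions and waking the SQPOLL thread if there is one.
 */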
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

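/*
 * IORING_REGISTER_IOWQ_AFF: restrict the calling task's io-wq workers to
 * the CPU mask supplied by userspace, copied in either as a compat bitmap
 * or a native one and capped at cpumask_size() bytes.
 */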
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	struct io_uring_task *tctx = current->io_uring;
	cpumask_var_t new_mask;
	int ret;

	if (!tctx || !tctx->io_wq)
		return -EINVAL;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

	if (in_compat_syscall()) {
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	} else {
		ret = copy_from_user(new_mask, arg, len);
	}

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

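/*
 * IORING_UNREGISTER_IOWQ_AFF: clear any previously registered io-wq CPU
 * affinity by handing a NULL mask to io_wq_cpu_affinity().
 */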
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	struct io_uring_task *tctx = current->io_uring;

	if (!tctx || !tctx->io_wq)
		return -EINVAL;

	return io_wq_cpu_affinity(tctx->io_wq, NULL);
}

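/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: cap the number of bounded and
 * unbounded io-wq workers (new_count[0] / new_count[1]; 0 leaves a value
 * unchanged) and copy the previous limits back to userspace. For SQPOLL
 * rings the cap applies to the SQPOLL task's io-wq, otherwise it applies
 * to the caller and is then propagated to every registered task. A rough
 * liburing-level sketch (names not part of this file):
 *
 *	unsigned int vals[2] = { 8, 8 };
 *
 *	io_uring_register_iowq_max_workers(&ring, vals);
 *
 * On return, vals[] holds the previous bounded/unbounded limits.
 */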
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}

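/*
 * Dispatcher for io_uring_register(2), called with ctx->uring_lock held:
 * refuse to operate on a dying ctx, enforce any registered restrictions,
 * then hand each opcode to its handler. Handlers that temporarily drop
 * the lock (e.g. IORING_REGISTER_IOWQ_MAX_WORKERS) must retake it before
 * returning, hence the __releases/__acquires annotations.
 */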
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We're inside the ring mutex, if the ref is already dying, then
	 * someone else killed the ctx or is already going through
	 * io_uring_register().
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return -ENXIO;

	if (ctx->restricted) {
		if (opcode >= IORING_REGISTER_LAST)
			return -EINVAL;
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

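/*
 * io_uring_register(2) entry point: resolve the fd, verify it really is
 * an io_uring instance, run any pending task_work, and invoke the
 * dispatcher under ctx->uring_lock.
 */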
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct fd f;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	ret = -EOPNOTSUPP;
	if (!io_is_uring_fops(f.file))
		goto out_fput;

	ctx = f.file->private_data;

	io_run_task_work();

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	fdput(f);
	return ret;
}

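/*
 * One-time init: compile-time checks that the userspace-visible structures
 * (io_uring_sqe, the rsrc update structs, io_uring_buf_ring) and the sqe
 * flag bits keep their expected layout, followed by opcode table setup and
 * creation of the io_kiocb slab cache.
 */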
static int __init io_uring_init(void)
{
#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
} while (0)

#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
	BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
	BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
	BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
	BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
	BUILD_BUG_SQE_ELEM(8,  __u64,  off);
	BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
	BUILD_BUG_SQE_ELEM(16, __u64,  addr);
	BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
	BUILD_BUG_SQE_ELEM(24, __u32,  len);
	BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
	BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
	BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
	BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
	BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);

	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
		     sizeof(struct io_uring_rsrc_update));
	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
		     sizeof(struct io_uring_rsrc_update2));

	/* ->buf_index is u16 */
	BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
	BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
		     offsetof(struct io_uring_buf_ring, tail));

	/* should fit into one byte */
	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);

	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));

	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));

	io_uring_optable_init();

	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				SLAB_ACCOUNT);
	return 0;
};
__initcall(io_uring_init);