// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
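/*
 * Illustrative sketch (not kernel code, not part of the ABI): the userspace
 * half of the ordering protocol described above, roughly what liburing does
 * under the hood.  The field and helper names (sq_ring, cq_ring, ring_mask,
 * smp_*()) are assumptions standing in for the mmap'ed ring layout and for
 * whatever acquire/release primitives the application uses.
 *
 *	// submit one SQE
 *	sqes[sq_tail & sq_ring->ring_mask] = *sqe;
 *	smp_store_release(&sq_ring->tail, sq_tail + 1);
 *		// pairs with smp_load_acquire() in io_get_sqring()
 *
 *	// reap one CQE
 *	if (cq_head != smp_load_acquire(&cq_ring->tail)) {
 *		cqe = &cqes[cq_head & cq_ring->ring_mask];
 *		handle(cqe);
 *		smp_store_release(&cq_ring->head, cq_head + 1);
 *			// pairs with the control dependency in io_get_cqe()
 *	}
 */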
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/highmem.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
#include <linux/audit.h>
#include <linux/security.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io-wq.h"

#include "io_uring_types.h"
#include "io_uring.h"
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
#include "rsrc.h"

#include "timeout.h"
#include "poll.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)

#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)

#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
				REQ_F_ASYNC_DATA)

#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
				 IO_REQ_CLEAN_FLAGS)

#define IO_TCTX_REFS_CACHE_NR	(1U << 10)

#define IO_COMPL_BATCH			32
#define IO_REQ_ALLOC_BATCH		8

enum {
	IO_CHECK_CQ_OVERFLOW_BIT,
	IO_CHECK_CQ_DROPPED_BIT,
};

struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};

/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)

static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);

static void io_dismantle_req(struct io_kiocb *req);
static void io_clean_op(struct io_kiocb *req);
static void io_queue_sqe(struct io_kiocb *req);

static void __io_submit_flush_completions(struct io_ring_ctx *ctx);

static void io_eventfd_signal(struct io_ring_ctx *ctx);

static struct kmem_cache *req_cachep;

struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
	if (io_is_uring_fops(file)) {
		struct io_ring_ctx *ctx = file->private_data;

		return ctx->ring_sock->sk;
	}
#endif
	return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);

static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
	if (!wq_list_empty(&ctx->submit_state.compl_reqs))
		__io_submit_flush_completions(ctx);
}

168 169 170 171 172 173 174 175 176
static bool io_match_linked(struct io_kiocb *head)
{
	struct io_kiocb *req;

	io_for_each_link(req, head) {
		if (req->flags & REQ_F_INFLIGHT)
			return true;
	}
	return false;
177 178 179 180 181 182
}

/*
 * As io_match_task() but protected against racing with linked timeouts.
 * User must not hold timeout_lock.
 */
183 184
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			bool cancel_all)
185
{
186 187
	bool matched;

188 189
	if (task && head->task != task)
		return false;
190 191 192 193 194 195 196 197 198 199 200 201 202 203
	if (cancel_all)
		return true;

	if (head->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = head->ctx;

		/* protect against races with linked timeouts */
		spin_lock_irq(&ctx->timeout_lock);
		matched = io_match_linked(head);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		matched = io_match_linked(head);
	}
	return matched;
204 205
}

206 207 208
static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
	req_set_fail(req);
209
	io_req_set_res(req, res, 0);
210 211
}

212 213 214
static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
215 216
}

P
Pavel Begunkov 已提交
217
static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
J
Jens Axboe 已提交
218 219 220
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

221
	complete(&ctx->ref_comp);
J
Jens Axboe 已提交
222 223
}

P
Pavel Begunkov 已提交
224
static __cold void io_fallback_req_func(struct work_struct *work)
225 226 227 228 229
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
						fallback_work.work);
	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
	struct io_kiocb *req, *tmp;
230
	bool locked = false;
231 232 233

	percpu_ref_get(&ctx->refs);
	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
234
		req->io_task_work.func(req, &locked);
235

236
	if (locked) {
237
		io_submit_flush_completions(ctx);
238 239
		mutex_unlock(&ctx->uring_lock);
	}
240 241 242
	percpu_ref_put(&ctx->refs);
}

P
Pavel Begunkov 已提交
243
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
J
Jens Axboe 已提交
244 245
{
	struct io_ring_ctx *ctx;
246
	int hash_bits;
J
Jens Axboe 已提交
247 248 249 250 251

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

252 253
	xa_init(&ctx->io_bl_xa);

254 255 256 257 258 259 260 261 262 263 264 265 266 267 268
	/*
	 * Use 5 bits less than the max cq entries, that should give us around
	 * 32 entries per hash list if totally full and uniformly spread.
	 */
	hash_bits = ilog2(p->cq_entries);
	hash_bits -= 5;
	if (hash_bits <= 0)
		hash_bits = 1;
	ctx->cancel_hash_bits = hash_bits;
	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
					GFP_KERNEL);
	if (!ctx->cancel_hash)
		goto err;
	__hash_init(ctx->cancel_hash, 1U << hash_bits);
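	/*
	 * Rough worked example (illustrative only): with p->cq_entries == 4096,
	 * ilog2() gives 12, so hash_bits ends up as 7, i.e. 128 hash lists and
	 * about 4096 / 128 == 32 entries per list when the CQ is completely
	 * full and the hashing is uniform.
	 */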

269 270 271 272 273 274
	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
	if (!ctx->dummy_ubuf)
		goto err;
	/* set invalid range, so io_import_fixed() fails meeting it */
	ctx->dummy_ubuf->ubuf = -1UL;

275
	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
276 277
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		goto err;
J
Jens Axboe 已提交
278 279

	ctx->flags = p->flags;
280
	init_waitqueue_head(&ctx->sqo_sq_wait);
281
	INIT_LIST_HEAD(&ctx->sqd_list);
282
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
283
	INIT_LIST_HEAD(&ctx->io_buffers_cache);
284
	INIT_LIST_HEAD(&ctx->apoll_cache);
285
	init_completion(&ctx->ref_comp);
286
	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
J
Jens Axboe 已提交
287
	mutex_init(&ctx->uring_lock);
P
Pavel Begunkov 已提交
288
	init_waitqueue_head(&ctx->cq_wait);
J
Jens Axboe 已提交
289
	spin_lock_init(&ctx->completion_lock);
290
	spin_lock_init(&ctx->timeout_lock);
291
	INIT_WQ_LIST(&ctx->iopoll_list);
292 293
	INIT_LIST_HEAD(&ctx->io_buffers_pages);
	INIT_LIST_HEAD(&ctx->io_buffers_comp);
294
	INIT_LIST_HEAD(&ctx->defer_list);
J
Jens Axboe 已提交
295
	INIT_LIST_HEAD(&ctx->timeout_list);
296
	INIT_LIST_HEAD(&ctx->ltimeout_list);
297 298
	spin_lock_init(&ctx->rsrc_ref_lock);
	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
299 300
	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
	init_llist_head(&ctx->rsrc_put_llist);
301
	INIT_LIST_HEAD(&ctx->tctx_list);
302 303
	ctx->submit_state.free_list.next = NULL;
	INIT_WQ_LIST(&ctx->locked_free_list);
304
	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
305
	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
J
Jens Axboe 已提交
306
	return ctx;
307
err:
308
	kfree(ctx->dummy_ubuf);
309
	kfree(ctx->cancel_hash);
310 311
	kfree(ctx->io_bl);
	xa_destroy(&ctx->io_bl_xa);
312 313
	kfree(ctx);
	return NULL;
J
Jens Axboe 已提交
314 315
}

316 317 318 319 320 321 322 323
static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
	ctx->cq_extra--;
}

324
static bool req_need_defer(struct io_kiocb *req, u32 seq)
325
{
326 327
	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
		struct io_ring_ctx *ctx = req->ctx;
328

329
		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
330
	}
331

B
Bob Liu 已提交
332
	return false;
333 334
}
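
/*
 * Rough worked example (illustrative only): if a drained request was
 * preceded by eight other submissions, io_get_sequence() later in this file
 * yields seq == 8.  With cq_extra == 0 the request stays deferred until
 * eight CQEs have been posted, i.e. until cached_cq_tail reaches 8 and the
 * comparison above finally evaluates false.
 */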

335 336 337 338
static inline void io_req_track_inflight(struct io_kiocb *req)
{
	if (!(req->flags & REQ_F_INFLIGHT)) {
		req->flags |= REQ_F_INFLIGHT;
339
		atomic_inc(&req->task->io_uring->inflight_tracked);
340 341 342
	}
}

343 344
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
345 346 347
	if (WARN_ON_ONCE(!req->link))
		return NULL;

348 349
	req->flags &= ~REQ_F_ARM_LTIMEOUT;
	req->flags |= REQ_F_LINK_TIMEOUT;
350 351

	/* linked timeouts should have two refs once prep'ed */
352
	io_req_set_refcount(req);
353 354
	__io_req_set_refcount(req->link, 2);
	return req->link;
355 356 357 358
}

static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
359
	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
360 361 362 363
		return NULL;
	return __io_prep_linked_timeout(req);
}

364 365 366 367 368 369 370 371 372 373 374
static noinline void __io_arm_ltimeout(struct io_kiocb *req)
{
	io_queue_linked_timeout(__io_prep_linked_timeout(req));
}

static inline void io_arm_ltimeout(struct io_kiocb *req)
{
	if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
		__io_arm_ltimeout(req);
}

J
Jens Axboe 已提交
375 376 377 378 379
static void io_prep_async_work(struct io_kiocb *req)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	struct io_ring_ctx *ctx = req->ctx;

380 381
	if (!(req->flags & REQ_F_CREDS)) {
		req->flags |= REQ_F_CREDS;
382
		req->creds = get_current_cred();
383
	}
384

385 386
	req->work.list.next = NULL;
	req->work.flags = 0;
387
	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
388 389 390
	if (req->flags & REQ_F_FORCE_ASYNC)
		req->work.flags |= IO_WQ_WORK_CONCURRENT;

J
Jens Axboe 已提交
391 392 393
	if (req->flags & REQ_F_ISREG) {
		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
			io_wq_hash_work(&req->work, file_inode(req->file));
394
	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
J
Jens Axboe 已提交
395 396 397
		if (def->unbound_nonreg_file)
			req->work.flags |= IO_WQ_WORK_UNBOUND;
	}
398
}
399

400
static void io_prep_async_link(struct io_kiocb *req)
401
{
402
	struct io_kiocb *cur;
403

404 405 406
	if (req->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = req->ctx;

407
		spin_lock_irq(&ctx->timeout_lock);
408 409
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
410
		spin_unlock_irq(&ctx->timeout_lock);
411 412 413 414
	} else {
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
	}
415 416
}

417
void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
418
{
419
	struct io_kiocb *link = io_prep_linked_timeout(req);
420
	struct io_uring_task *tctx = req->task->io_uring;
421

422 423
	BUG_ON(!tctx);
	BUG_ON(!tctx->io_wq);
424

425 426
	/* init ->work of the whole link before punting */
	io_prep_async_link(req);
427 428 429 430 431 432 433 434 435 436 437

	/*
	 * Not expected to happen, but if we do have a bug where this _can_
	 * happen, catch it here and ensure the request is marked as
	 * canceled. That will make io-wq go through the usual work cancel
	 * procedure rather than attempt to run this request (or create a new
	 * worker for it).
	 */
	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
		req->work.flags |= IO_WQ_WORK_CANCEL;

438 439 440
	trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
					req->opcode, req->flags, &req->work,
					io_wq_is_hashed(&req->work));
441
	io_wq_enqueue(tctx->io_wq, &req->work);
442 443
	if (link)
		io_queue_linked_timeout(link);
444 445
}

P
Pavel Begunkov 已提交
446
static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
447
{
448
	while (!list_empty(&ctx->defer_list)) {
449 450
		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
						struct io_defer_entry, list);
451

452
		if (req_need_defer(de->req, de->seq))
453
			break;
454
		list_del_init(&de->list);
455
		io_req_task_queue(de->req);
456
		kfree(de);
457
	}
458 459
}

460
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
461
{
462 463 464 465 466 467 468 469 470 471 472
	if (ctx->off_timeout_used || ctx->drain_active) {
		spin_lock(&ctx->completion_lock);
		if (ctx->off_timeout_used)
			io_flush_timeouts(ctx);
		if (ctx->drain_active)
			io_queue_deferred(ctx);
		io_commit_cqring(ctx);
		spin_unlock(&ctx->completion_lock);
	}
	if (ctx->has_evfd)
		io_eventfd_signal(ctx);
473 474
}

475
static void io_eventfd_signal(struct io_ring_ctx *ctx)
476
{
477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492
	struct io_ev_fd *ev_fd;

	rcu_read_lock();
	/*
	 * rcu_dereference ctx->io_ev_fd once and use it both for checking
	 * and for eventfd_signal
	 */
	ev_fd = rcu_dereference(ctx->io_ev_fd);

	/*
	 * Check again if ev_fd exists in case an io_eventfd_unregister call
	 * completed between the NULL check of ctx->io_ev_fd at the start of
	 * the function and rcu_read_lock.
	 */
	if (unlikely(!ev_fd))
		goto out;
493
	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
494 495
		goto out;

496
	if (!ev_fd->eventfd_async || io_wq_current_is_worker())
497 498 499
		eventfd_signal(ev_fd->cq_ev_fd, 1);
out:
	rcu_read_unlock();
500 501
}

502 503 504 505 506 507 508
/*
 * This should only get called when at least one event has been posted.
 * Some applications rely on the eventfd notification count only changing
 * IFF a new CQE has been added to the CQ ring. There's no dependency on
 * 1:1 relationship between how many times this function is called (and
 * hence the eventfd count) and number of CQEs posted to the CQ ring.
 */
509
void io_cqring_ev_posted(struct io_ring_ctx *ctx)
510
{
511 512
	if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
		     ctx->has_evfd))
513 514
		__io_commit_cqring_flush(ctx);

515
	io_cqring_wake(ctx);
516 517
}
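
/*
 * Illustrative consumer-side consequence of the comment above (a sketch,
 * not code from this file): an application watching the registered eventfd
 * must drain every CQE it can see on each wakeup rather than mapping one
 * eventfd tick to one CQE.  peek_cqe()/handle() are hypothetical helpers.
 *
 *	read(event_fd, &cnt, sizeof(cnt));	// cnt is not a CQE count
 *	while ((cqe = peek_cqe(ring)) != NULL)
 *		handle(cqe);
 */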

518
/* Returns true if there are no backlogged entries after the flush */
519
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
520
{
521
	bool all_flushed, posted;
522
	size_t cqe_size = sizeof(struct io_uring_cqe);
523

524
	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
525
		return false;
526

527 528 529
	if (ctx->flags & IORING_SETUP_CQE32)
		cqe_size <<= 1;

530
	posted = false;
531
	spin_lock(&ctx->completion_lock);
532
	while (!list_empty(&ctx->cq_overflow_list)) {
P
Pavel Begunkov 已提交
533
		struct io_uring_cqe *cqe = io_get_cqe(ctx);
534
		struct io_overflow_cqe *ocqe;
535

536 537
		if (!cqe && !force)
			break;
538 539 540
		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
		if (cqe)
541
			memcpy(cqe, &ocqe->cqe, cqe_size);
542
		else
543 544
			io_account_cq_overflow(ctx);

545
		posted = true;
546 547
		list_del(&ocqe->list);
		kfree(ocqe);
548 549
	}

550 551
	all_flushed = list_empty(&ctx->cq_overflow_list);
	if (all_flushed) {
552
		clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
553
		atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
554
	}
555

556
	io_commit_cqring(ctx);
557
	spin_unlock(&ctx->completion_lock);
558 559
	if (posted)
		io_cqring_ev_posted(ctx);
560
	return all_flushed;
561 562
}

563
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
564
{
565 566
	bool ret = true;

567
	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
568 569 570
		/* iopoll syncs against uring_lock, not completion_lock */
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_lock(&ctx->uring_lock);
571
		ret = __io_cqring_overflow_flush(ctx, false);
572 573 574
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_unlock(&ctx->uring_lock);
	}
575 576

	return ret;
577 578
}

579
static void __io_put_task(struct task_struct *task, int nr)
580 581 582
{
	struct io_uring_task *tctx = task->io_uring;

583 584 585 586 587 588 589 590 591 592 593 594 595
	percpu_counter_sub(&tctx->inflight, nr);
	if (unlikely(atomic_read(&tctx->in_idle)))
		wake_up(&tctx->wait);
	put_task_struct_many(task, nr);
}

/* must to be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task, int nr)
{
	if (likely(task == current))
		task->io_uring->cached_refs += nr;
	else
		__io_put_task(task, nr);
596 597
}

598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615
static void io_task_refs_refill(struct io_uring_task *tctx)
{
	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;

	percpu_counter_add(&tctx->inflight, refill);
	refcount_add(refill, &current->usage);
	tctx->cached_refs += refill;
}

static inline void io_get_task_refs(int nr)
{
	struct io_uring_task *tctx = current->io_uring;

	tctx->cached_refs -= nr;
	if (unlikely(tctx->cached_refs < 0))
		io_task_refs_refill(tctx);
}
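
/*
 * Rough worked example (illustrative only): with cached_refs == 3 and a
 * 32-request submission batch, io_get_task_refs(32) drops cached_refs to
 * -29, so io_task_refs_refill() adds -(-29) + IO_TCTX_REFS_CACHE_NR == 1053
 * references to tctx->inflight and current->usage, leaving cached_refs at
 * IO_TCTX_REFS_CACHE_NR again.
 */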

616 617 618 619 620 621 622 623 624 625 626 627
static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
	struct io_uring_task *tctx = task->io_uring;
	unsigned int refs = tctx->cached_refs;

	if (refs) {
		tctx->cached_refs = 0;
		percpu_counter_sub(&tctx->inflight, refs);
		put_task_struct_many(task, refs);
	}
}

628 629
bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res,
			      u32 cflags, u64 extra1, u64 extra2)
J
Jens Axboe 已提交
630
{
631
	struct io_overflow_cqe *ocqe;
632 633
	size_t ocq_size = sizeof(struct io_overflow_cqe);
	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
J
Jens Axboe 已提交
634

635 636
	if (is_cqe32)
		ocq_size += sizeof(struct io_uring_cqe);
J
Jens Axboe 已提交
637

638
	ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
D
Dylan Yudaken 已提交
639
	trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
640 641 642 643 644 645
	if (!ocqe) {
		/*
		 * If we're in ring overflow flush mode, or in task cancel mode,
		 * or cannot allocate an overflow entry, then we need to drop it
		 * on the floor.
		 */
646
		io_account_cq_overflow(ctx);
647
		set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
648
		return false;
J
Jens Axboe 已提交
649
	}
650
	if (list_empty(&ctx->cq_overflow_list)) {
651
		set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
652
		atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
653

654
	}
655
	ocqe->cqe.user_data = user_data;
656 657
	ocqe->cqe.res = res;
	ocqe->cqe.flags = cflags;
658 659 660 661
	if (is_cqe32) {
		ocqe->cqe.big_cqe[0] = extra1;
		ocqe->cqe.big_cqe[1] = extra2;
	}
662 663
	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
	return true;
J
Jens Axboe 已提交
664 665
}

666 667
bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
		     u32 cflags)
668
{
669 670
	struct io_uring_cqe *cqe;

671
	ctx->cq_extra++;
672
	trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
673 674 675 676 677 678 679 680 681 682 683

	/*
	 * If we can't get a cq entry, userspace overflowed the
	 * submission (by quite a lot). Increment the overflow count in
	 * the ring.
	 */
	cqe = io_get_cqe(ctx);
	if (likely(cqe)) {
		WRITE_ONCE(cqe->user_data, user_data);
		WRITE_ONCE(cqe->res, res);
		WRITE_ONCE(cqe->flags, cflags);
684 685 686 687 688

		if (ctx->flags & IORING_SETUP_CQE32) {
			WRITE_ONCE(cqe->big_cqe[0], 0);
			WRITE_ONCE(cqe->big_cqe[1], 0);
		}
689 690 691
		return true;
	}
	return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
692 693
}

694
static void __io_req_complete_put(struct io_kiocb *req)
J
Jens Axboe 已提交
695
{
696 697 698 699
	/*
	 * If we're the last reference to this request, add to our locked
	 * free_list cache.
	 */
700
	if (req_ref_put_and_test(req)) {
701 702
		struct io_ring_ctx *ctx = req->ctx;

703
		if (req->flags & IO_REQ_LINK_FLAGS) {
704
			if (req->flags & IO_DISARM_MASK)
705 706 707 708 709 710
				io_disarm_next(req);
			if (req->link) {
				io_req_task_queue(req->link);
				req->link = NULL;
			}
		}
711
		io_req_put_rsrc(req);
712 713 714 715 716 717
		/*
		 * Selected buffer deallocation in io_clean_op() assumes that
		 * we don't hold ->completion_lock. Clean them here to avoid
		 * deadlocks.
		 */
		io_put_kbuf_comp(req);
718 719
		io_dismantle_req(req);
		io_put_task(req->task, 1);
720
		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
721
		ctx->locked_free_nr++;
722
	}
723 724
}

725
void __io_req_complete_post(struct io_kiocb *req)
726
{
727
	if (!(req->flags & REQ_F_CQE_SKIP))
728
		__io_fill_cqe_req(req->ctx, req);
729 730 731
	__io_req_complete_put(req);
}

732
void io_req_complete_post(struct io_kiocb *req)
733 734 735 736
{
	struct io_ring_ctx *ctx = req->ctx;

	spin_lock(&ctx->completion_lock);
737
	__io_req_complete_post(req);
738
	io_commit_cqring(ctx);
739
	spin_unlock(&ctx->completion_lock);
740
	io_cqring_ev_posted(ctx);
741 742
}

743
inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags)
744
{
745
	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
746
		req->flags |= REQ_F_COMPLETE_INLINE;
747 748
	else
		io_req_complete_post(req);
749 750
}

751
void io_req_complete_failed(struct io_kiocb *req, s32 res)
752
{
753
	req_set_fail(req);
754 755
	io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
	io_req_complete_post(req);
756 757
}

P
Pavel Begunkov 已提交
758 759 760 761 762 763 764 765 766 767
/*
 * Don't initialise the fields below on every allocation, but do that in
 * advance and keep them valid across allocations.
 */
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	req->ctx = ctx;
	req->link = NULL;
	req->async_data = NULL;
	/* not necessary, but safer to zero */
768
	req->cqe.res = 0;
P
Pavel Begunkov 已提交
769 770
}

771
static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
772
					struct io_submit_state *state)
773
{
774
	spin_lock(&ctx->completion_lock);
775
	wq_list_splice(&ctx->locked_free_list, &state->free_list);
776
	ctx->locked_free_nr = 0;
777
	spin_unlock(&ctx->completion_lock);
778 779
}

780
static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
781
{
782
	return !ctx->submit_state.free_list.next;
783 784
}

785 786 787 788 789 790
/*
 * A request might get retired back into the request caches even before opcode
 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 * Because of that, io_alloc_req() should be called only under ->uring_lock
 * and with extra caution to not get a request that is still worked on.
 */
P
Pavel Begunkov 已提交
791
static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
792
	__must_hold(&ctx->uring_lock)
J
Jens Axboe 已提交
793
{
P
Pavel Begunkov 已提交
794
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
795
	void *reqs[IO_REQ_ALLOC_BATCH];
P
Pavel Begunkov 已提交
796
	int ret, i;
797

798 799 800 801 802
	/*
	 * If we have more than a batch's worth of requests in our IRQ side
	 * locked cache, grab the lock and move them over to our submission
	 * side cache.
	 */
803
	if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
804
		io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
805
		if (!io_req_cache_empty(ctx))
806 807
			return true;
	}
808

809
	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
810

P
Pavel Begunkov 已提交
811 812 813 814 815
	/*
	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
	 * retry single alloc to be on the safe side.
	 */
	if (unlikely(ret <= 0)) {
816 817
		reqs[0] = kmem_cache_alloc(req_cachep, gfp);
		if (!reqs[0])
818
			return false;
P
Pavel Begunkov 已提交
819
		ret = 1;
J
Jens Axboe 已提交
820
	}
P
Pavel Begunkov 已提交
821

822
	percpu_ref_get_many(&ctx->refs, ret);
823
	for (i = 0; i < ret; i++) {
824
		struct io_kiocb *req = reqs[i];
825 826

		io_preinit_req(req, ctx);
827
		io_req_add_to_cache(req, ctx);
828
	}
829 830 831 832 833
	return true;
}

static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
{
834
	if (unlikely(io_req_cache_empty(ctx)))
835 836 837 838 839 840 841 842 843
		return __io_alloc_req_refill(ctx);
	return true;
}

static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
{
	struct io_wq_work_node *node;

	node = wq_stack_extract(&ctx->submit_state.free_list);
844
	return container_of(node, struct io_kiocb, comp_list);
J
Jens Axboe 已提交
845 846
}

P
Pavel Begunkov 已提交
847
static inline void io_dismantle_req(struct io_kiocb *req)
J
Jens Axboe 已提交
848
{
849
	unsigned int flags = req->flags;
850

851
	if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
852
		io_clean_op(req);
853 854
	if (!(flags & REQ_F_FIXED_FILE))
		io_put_file(req->file);
855 856
}

857
__cold void io_free_req(struct io_kiocb *req)
858
{
859
	struct io_ring_ctx *ctx = req->ctx;
860

861
	io_req_put_rsrc(req);
862
	io_dismantle_req(req);
863
	io_put_task(req->task, 1);
864

865
	spin_lock(&ctx->completion_lock);
866
	wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
867
	ctx->locked_free_nr++;
868
	spin_unlock(&ctx->completion_lock);
869 870
}

871 872 873 874 875 876 877
static void __io_req_find_next_prep(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool posted;

	spin_lock(&ctx->completion_lock);
	posted = io_disarm_next(req);
878
	io_commit_cqring(ctx);
879 880 881 882 883 884
	spin_unlock(&ctx->completion_lock);
	if (posted)
		io_cqring_ev_posted(ctx);
}

static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
885
{
886
	struct io_kiocb *nxt;
887

J
Jens Axboe 已提交
888 889 890 891 892 893
	/*
	 * If LINK is set, we have dependent requests in this chain. If we
	 * didn't fail this request, queue the first one up, moving any other
	 * dependencies to the next request. In case of failure, fail the rest
	 * of the chain.
	 */
894 895
	if (unlikely(req->flags & IO_DISARM_MASK))
		__io_req_find_next_prep(req);
896 897 898
	nxt = req->link;
	req->link = NULL;
	return nxt;
899
}
J
Jens Axboe 已提交
900

901
static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
902 903 904
{
	if (!ctx)
		return;
905 906
	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
907
	if (*locked) {
908
		io_submit_flush_completions(ctx);
909
		mutex_unlock(&ctx->uring_lock);
910
		*locked = false;
911 912 913 914
	}
	percpu_ref_put(&ctx->refs);
}

915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932
static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
{
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_ev_posted(ctx);
}

static void handle_prev_tw_list(struct io_wq_work_node *node,
				struct io_ring_ctx **ctx, bool *uring_locked)
{
	if (*ctx && !*uring_locked)
		spin_lock(&(*ctx)->completion_lock);

	do {
		struct io_wq_work_node *next = node->next;
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);

933 934
		prefetch(container_of(next, struct io_kiocb, io_task_work.node));

935 936 937 938 939 940 941 942 943 944 945 946
		if (req->ctx != *ctx) {
			if (unlikely(!*uring_locked && *ctx))
				ctx_commit_and_unlock(*ctx);

			ctx_flush_and_put(*ctx, uring_locked);
			*ctx = req->ctx;
			/* if not contended, grab and improve batching */
			*uring_locked = mutex_trylock(&(*ctx)->uring_lock);
			percpu_ref_get(&(*ctx)->refs);
			if (unlikely(!*uring_locked))
				spin_lock(&(*ctx)->completion_lock);
		}
947
		if (likely(*uring_locked)) {
948
			req->io_task_work.func(req, uring_locked);
949 950 951 952
		} else {
			req->cqe.flags = io_put_kbuf_comp(req);
			__io_req_complete_post(req);
		}
953 954 955 956 957 958 959 960 961
		node = next;
	} while (node);

	if (unlikely(!*uring_locked))
		ctx_commit_and_unlock(*ctx);
}

static void handle_tw_list(struct io_wq_work_node *node,
			   struct io_ring_ctx **ctx, bool *locked)
962 963 964 965 966 967
{
	do {
		struct io_wq_work_node *next = node->next;
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);

968 969
		prefetch(container_of(next, struct io_kiocb, io_task_work.node));

970 971 972 973 974 975 976 977 978 979 980 981
		if (req->ctx != *ctx) {
			ctx_flush_and_put(*ctx, locked);
			*ctx = req->ctx;
			/* if not contended, grab and improve batching */
			*locked = mutex_trylock(&(*ctx)->uring_lock);
			percpu_ref_get(&(*ctx)->refs);
		}
		req->io_task_work.func(req, locked);
		node = next;
	} while (node);
}

982
void tctx_task_work(struct callback_head *cb)
983
{
984
	bool uring_locked = false;
985
	struct io_ring_ctx *ctx = NULL;
986 987
	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
						  task_work);
988

989
	while (1) {
990
		struct io_wq_work_node *node1, *node2;
991 992

		spin_lock_irq(&tctx->task_lock);
993
		node1 = tctx->prio_task_list.first;
994
		node2 = tctx->task_list.first;
995
		INIT_WQ_LIST(&tctx->task_list);
996
		INIT_WQ_LIST(&tctx->prio_task_list);
997
		if (!node2 && !node1)
998
			tctx->task_running = false;
999
		spin_unlock_irq(&tctx->task_lock);
1000
		if (!node2 && !node1)
1001
			break;
1002

1003 1004 1005 1006
		if (node1)
			handle_prev_tw_list(node1, &ctx, &uring_locked);
		if (node2)
			handle_tw_list(node2, &ctx, &uring_locked);
1007
		cond_resched();
1008

1009
		if (data_race(!tctx->task_list.first) &&
1010
		    data_race(!tctx->prio_task_list.first) && uring_locked)
1011
			io_submit_flush_completions(ctx);
1012
	}
1013

1014
	ctx_flush_and_put(ctx, &uring_locked);
1015 1016 1017 1018

	/* relaxed read is enough as only the task itself sets ->in_idle */
	if (unlikely(atomic_read(&tctx->in_idle)))
		io_uring_drop_tctx_refs(current);
1019 1020
}

1021 1022 1023
static void __io_req_task_work_add(struct io_kiocb *req,
				   struct io_uring_task *tctx,
				   struct io_wq_work_list *list)
1024
{
1025
	struct io_ring_ctx *ctx = req->ctx;
1026
	struct io_wq_work_node *node;
1027
	unsigned long flags;
1028
	bool running;
1029

1030
	spin_lock_irqsave(&tctx->task_lock, flags);
1031
	wq_list_add_tail(&req->io_task_work.node, list);
1032 1033 1034
	running = tctx->task_running;
	if (!running)
		tctx->task_running = true;
1035
	spin_unlock_irqrestore(&tctx->task_lock, flags);
1036 1037

	/* task_work already pending, we're done */
1038
	if (running)
1039
		return;
1040

1041 1042 1043
	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);

1044
	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
1045
		return;
1046

1047
	spin_lock_irqsave(&tctx->task_lock, flags);
1048
	tctx->task_running = false;
1049
	node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
1050
	spin_unlock_irqrestore(&tctx->task_lock, flags);
1051

1052 1053 1054 1055 1056 1057 1058
	while (node) {
		req = container_of(node, struct io_kiocb, io_task_work.node);
		node = node->next;
		if (llist_add(&req->io_task_work.fallback_node,
			      &req->ctx->fallback_llist))
			schedule_delayed_work(&req->ctx->fallback_work, 1);
	}
1059 1060
}

1061
void io_req_task_work_add(struct io_kiocb *req)
1062 1063 1064 1065 1066 1067
{
	struct io_uring_task *tctx = req->task->io_uring;

	__io_req_task_work_add(req, tctx, &tctx->task_list);
}

1068
void io_req_task_prio_work_add(struct io_kiocb *req)
1069 1070 1071 1072 1073 1074 1075 1076 1077
{
	struct io_uring_task *tctx = req->task->io_uring;

	if (req->ctx->flags & IORING_SETUP_SQPOLL)
		__io_req_task_work_add(req, tctx, &tctx->prio_task_list);
	else
		__io_req_task_work_add(req, tctx, &tctx->task_list);
}

1078
static void io_req_tw_post(struct io_kiocb *req, bool *locked)
1079
{
1080
	io_req_complete_post(req);
1081
}
1082

1083
void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
1084
{
1085
	io_req_set_res(req, res, cflags);
1086
	req->io_task_work.func = io_req_tw_post;
1087
	io_req_task_work_add(req);
1088 1089
}

1090
static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
1091
{
1092
	/* not needed for normal modes, but SQPOLL depends on it */
1093
	io_tw_lock(req->ctx, locked);
1094
	io_req_complete_failed(req, req->cqe.res);
1095 1096
}

1097
void io_req_task_submit(struct io_kiocb *req, bool *locked)
1098
{
1099
	io_tw_lock(req->ctx, locked);
1100
	/* req->task == current here, checking PF_EXITING is safe */
1101
	if (likely(!(req->task->flags & PF_EXITING)))
P
Pavel Begunkov 已提交
1102
		io_queue_sqe(req);
1103
	else
1104
		io_req_complete_failed(req, -EFAULT);
1105 1106
}

1107
void io_req_task_queue_fail(struct io_kiocb *req, int ret)
1108
{
1109
	io_req_set_res(req, ret, 0);
1110
	req->io_task_work.func = io_req_task_cancel;
1111
	io_req_task_work_add(req);
1112 1113
}

1114
void io_req_task_queue(struct io_kiocb *req)
1115
{
1116
	req->io_task_work.func = io_req_task_submit;
1117
	io_req_task_work_add(req);
1118 1119
}

1120
void io_queue_next(struct io_kiocb *req)
1121
{
1122
	struct io_kiocb *nxt = io_req_find_next(req);
1123 1124

	if (nxt)
1125
		io_req_task_queue(nxt);
1126 1127
}

1128
void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
1129
	__must_hold(&ctx->uring_lock)
1130
{
1131
	struct task_struct *task = NULL;
1132
	int task_refs = 0;
1133

1134 1135 1136
	do {
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    comp_list);
1137

1138 1139 1140 1141 1142 1143
		if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
			if (req->flags & REQ_F_REFCOUNT) {
				node = req->comp_list.next;
				if (!req_ref_put_and_test(req))
					continue;
			}
1144 1145 1146 1147 1148 1149 1150 1151 1152
			if ((req->flags & REQ_F_POLLED) && req->apoll) {
				struct async_poll *apoll = req->apoll;

				if (apoll->double_poll)
					kfree(apoll->double_poll);
				list_add(&apoll->poll.wait.entry,
						&ctx->apoll_cache);
				req->flags &= ~REQ_F_POLLED;
			}
1153
			if (req->flags & IO_REQ_LINK_FLAGS)
1154
				io_queue_next(req);
1155 1156
			if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
				io_clean_op(req);
1157
		}
1158 1159
		if (!(req->flags & REQ_F_FIXED_FILE))
			io_put_file(req->file);
1160

1161
		io_req_put_rsrc_locked(req, ctx);
1162

1163 1164 1165 1166 1167 1168 1169
		if (req->task != task) {
			if (task)
				io_put_task(task, task_refs);
			task = req->task;
			task_refs = 0;
		}
		task_refs++;
1170
		node = req->comp_list.next;
1171
		io_req_add_to_cache(req, ctx);
1172
	} while (node);
1173 1174 1175

	if (task)
		io_put_task(task, task_refs);
1176 1177
}

1178
static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
1179
	__must_hold(&ctx->uring_lock)
1180
{
1181
	struct io_wq_work_node *node, *prev;
1182
	struct io_submit_state *state = &ctx->submit_state;
1183

1184 1185 1186 1187
	if (state->flush_cqes) {
		spin_lock(&ctx->completion_lock);
		wq_list_for_each(node, prev, &state->compl_reqs) {
			struct io_kiocb *req = container_of(node, struct io_kiocb,
1188
						    comp_list);
1189

1190 1191
			if (!(req->flags & REQ_F_CQE_SKIP))
				__io_fill_cqe_req(ctx, req);
1192 1193 1194 1195 1196 1197
		}

		io_commit_cqring(ctx);
		spin_unlock(&ctx->completion_lock);
		io_cqring_ev_posted(ctx);
		state->flush_cqes = false;
1198
	}
1199

1200
	io_free_batch_list(ctx, state->compl_reqs.first);
1201
	INIT_WQ_LIST(&state->compl_reqs);
1202 1203
}

1204 1205 1206 1207
/*
 * Drop reference to request, return next in chain (if there is one) if this
 * was the last reference to this request.
 */
1208
static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
1209
{
1210 1211
	struct io_kiocb *nxt = NULL;

1212
	if (req_ref_put_and_test(req)) {
1213
		if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
1214
			nxt = io_req_find_next(req);
P
Pavel Begunkov 已提交
1215
		io_free_req(req);
1216
	}
1217
	return nxt;
J
Jens Axboe 已提交
1218 1219
}

1220
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
1221 1222 1223
{
	/* See comment at the top of this file */
	smp_rmb();
1224
	return __io_cqring_events(ctx);
1225 1226
}

J
Jens Axboe 已提交
1227 1228 1229 1230
/*
 * We can't just wait for polled events to come to us, we have to actively
 * find and complete them.
 */
P
Pavel Begunkov 已提交
1231
static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
J
Jens Axboe 已提交
1232 1233 1234 1235 1236
{
	if (!(ctx->flags & IORING_SETUP_IOPOLL))
		return;

	mutex_lock(&ctx->uring_lock);
1237
	while (!wq_list_empty(&ctx->iopoll_list)) {
1238
		/* let it sleep and repeat later if can't complete a request */
1239
		if (io_do_iopoll(ctx, true) == 0)
1240
			break;
1241 1242 1243
		/*
		 * Ensure we allow local-to-the-cpu processing to take place,
		 * in this case we need to ensure that we reap all events.
		 * Also let task_work, etc. progress by releasing the mutex.
		 */
1246 1247 1248 1249 1250
		if (need_resched()) {
			mutex_unlock(&ctx->uring_lock);
			cond_resched();
			mutex_lock(&ctx->uring_lock);
		}
J
Jens Axboe 已提交
1251 1252 1253 1254
	}
	mutex_unlock(&ctx->uring_lock);
}

1255
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
J
Jens Axboe 已提交
1256
{
1257
	unsigned int nr_events = 0;
1258
	int ret = 0;
1259
	unsigned long check_cq;
1260

1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271
	check_cq = READ_ONCE(ctx->check_cq);
	if (unlikely(check_cq)) {
		if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
			__io_cqring_overflow_flush(ctx, false);
		/*
		 * Similarly do not spin if we have not informed the user of any
		 * dropped CQE.
		 */
		if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
			return -EBADR;
	}
1272 1273 1274 1275 1276 1277
	/*
	 * Don't enter poll loop if we already have events pending.
	 * If we do, we can potentially be spinning for commands that
	 * already triggered a CQE (eg in error).
	 */
	if (io_cqring_events(ctx))
1278
		return 0;
1279

J
Jens Axboe 已提交
1280
	do {
1281 1282 1283 1284 1285 1286 1287 1288 1289 1290
		/*
		 * If a submit got punted to a workqueue, we can have the
		 * application entering polling for a command before it gets
		 * issued. That app will hold the uring_lock for the duration
		 * of the poll right here, so we need to take a breather every
		 * now and then to ensure that the issue has a chance to add
		 * the poll to the issued list. Otherwise we can spin here
		 * forever, while the workqueue is stuck trying to acquire the
		 * very same mutex.
		 */
1291
		if (wq_list_empty(&ctx->iopoll_list)) {
1292 1293
			u32 tail = ctx->cached_cq_tail;

1294
			mutex_unlock(&ctx->uring_lock);
1295
			io_run_task_work();
1296
			mutex_lock(&ctx->uring_lock);
J
Jens Axboe 已提交
1297

1298 1299
			/* some requests don't go through iopoll_list */
			if (tail != ctx->cached_cq_tail ||
1300
			    wq_list_empty(&ctx->iopoll_list))
1301
				break;
1302
		}
1303 1304 1305 1306 1307 1308
		ret = io_do_iopoll(ctx, !min);
		if (ret < 0)
			break;
		nr_events += ret;
		ret = 0;
	} while (nr_events < min && !need_resched());
1309

J
Jens Axboe 已提交
1310 1311
	return ret;
}
1312
inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
1313
{
1314
	if (*locked) {
1315
		req->cqe.flags |= io_put_kbuf(req, 0);
1316
		io_req_add_compl_list(req);
1317
	} else {
1318 1319
		req->cqe.flags |= io_put_kbuf(req, IO_URING_F_UNLOCKED);
		io_req_complete_post(req);
1320
	}
1321 1322
}

J
Jens Axboe 已提交
1323 1324 1325
/*
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from an io_do_iopoll() thread before the issuer is done
 * accessing the kiocb cookie.
 */
1329
static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
J
Jens Axboe 已提交
1330 1331
{
	struct io_ring_ctx *ctx = req->ctx;
H
Hao Xu 已提交
1332
	const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
1333 1334

	/* workqueue context doesn't hold uring_lock, grab it now */
H
Hao Xu 已提交
1335
	if (unlikely(needs_lock))
1336
		mutex_lock(&ctx->uring_lock);
J
Jens Axboe 已提交
1337 1338 1339 1340 1341 1342

	/*
	 * Track whether we have multiple files in our lists. This will impact
	 * how we do polling eventually, not spinning if we're on potentially
	 * different devices.
	 */
1343
	if (wq_list_empty(&ctx->iopoll_list)) {
1344 1345
		ctx->poll_multi_queue = false;
	} else if (!ctx->poll_multi_queue) {
J
Jens Axboe 已提交
1346 1347
		struct io_kiocb *list_req;

1348 1349
		list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
					comp_list);
1350
		if (list_req->file != req->file)
1351
			ctx->poll_multi_queue = true;
J
Jens Axboe 已提交
1352 1353 1354 1355 1356 1357
	}

	/*
	 * For fast devices, IO may have already completed. If it has, add
	 * it to the front so we find it first.
	 */
1358
	if (READ_ONCE(req->iopoll_completed))
1359
		wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
J
Jens Axboe 已提交
1360
	else
1361
		wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
1362

H
Hao Xu 已提交
1363
	if (unlikely(needs_lock)) {
1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375
		/*
		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
		 * in sq thread task context or in io worker task context. If
		 * current task context is sq thread, we don't need to check
		 * whether should wake up sq thread.
		 */
		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
		    wq_has_sleeper(&ctx->sq_data->wait))
			wake_up(&ctx->sq_data->wait);

		mutex_unlock(&ctx->uring_lock);
	}
J
Jens Axboe 已提交
1376 1377
}

1378 1379
static bool io_bdev_nowait(struct block_device *bdev)
{
1380
	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
1381 1382
}

J
Jens Axboe 已提交
1383 1384 1385 1386 1387
/*
 * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
 * inline.
 */
1388
static bool __io_file_supports_nowait(struct file *file, umode_t mode)
J
Jens Axboe 已提交
1389
{
1390
	if (S_ISBLK(mode)) {
C
Christoph Hellwig 已提交
1391 1392
		if (IS_ENABLED(CONFIG_BLOCK) &&
		    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
1393 1394 1395
			return true;
		return false;
	}
1396
	if (S_ISSOCK(mode))
J
Jens Axboe 已提交
1397
		return true;
1398
	if (S_ISREG(mode)) {
C
Christoph Hellwig 已提交
1399 1400
		if (IS_ENABLED(CONFIG_BLOCK) &&
		    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
1401
		    !io_is_uring_fops(file))
1402 1403 1404
			return true;
		return false;
	}
J
Jens Axboe 已提交
1405

1406 1407 1408
	/* any ->read/write should understand O_NONBLOCK */
	if (file->f_flags & O_NONBLOCK)
		return true;
1409
	return file->f_mode & FMODE_NOWAIT;
J
Jens Axboe 已提交
1410
}
1411

1412 1413 1414 1415 1416
/*
 * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
 * inline.
 */
1417
unsigned int io_file_get_flags(struct file *file)
1418 1419 1420
{
	umode_t mode = file_inode(file)->i_mode;
	unsigned int res = 0;
1421

1422 1423 1424 1425
	if (S_ISREG(mode))
		res |= FFS_ISREG;
	if (__io_file_supports_nowait(file, mode))
		res |= FFS_NOWAIT;
1426 1427
	if (io_file_need_scm(file))
		res |= FFS_SCM;
1428
	return res;
J
Jens Axboe 已提交
1429 1430
}

1431
bool io_alloc_async_data(struct io_kiocb *req)
1432
{
1433 1434
	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
1435 1436 1437 1438 1439
	if (req->async_data) {
		req->flags |= REQ_F_ASYNC_DATA;
		return false;
	}
	return true;
1440 1441
}

1442
int io_req_prep_async(struct io_kiocb *req)
1443
{
1444 1445 1446 1447 1448
	const struct io_op_def *def = &io_op_defs[req->opcode];

	/* assign early for deferred execution for non-fixed file */
	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
		req->file = io_file_get_normal(req, req->cqe.fd);
1449
	if (!def->prep_async)
1450 1451 1452 1453 1454 1455
		return 0;
	if (WARN_ON_ONCE(req_has_async_data(req)))
		return -EFAULT;
	if (io_alloc_async_data(req))
		return -EAGAIN;

1456
	return def->prep_async(req);
1457 1458
}

1459 1460
static u32 io_get_sequence(struct io_kiocb *req)
{
1461
	u32 seq = req->ctx->cached_sq_head;
1462
	struct io_kiocb *cur;
1463

1464
	/* need original cached_sq_head, but it was increased for each req */
1465
	io_for_each_link(cur, req)
1466 1467
		seq--;
	return seq;
1468 1469
}

P
Pavel Begunkov 已提交
1470
static __cold void io_drain_req(struct io_kiocb *req)
1471
{
1472
	struct io_ring_ctx *ctx = req->ctx;
1473
	struct io_defer_entry *de;
1474
	int ret;
1475
	u32 seq = io_get_sequence(req);
1476

B
Bob Liu 已提交
1477
	/* Still need defer if there is pending req in defer list. */
1478
	spin_lock(&ctx->completion_lock);
1479
	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
1480
		spin_unlock(&ctx->completion_lock);
1481
queue:
1482
		ctx->drain_active = false;
1483 1484
		io_req_task_queue(req);
		return;
1485
	}
1486
	spin_unlock(&ctx->completion_lock);
1487

1488
	ret = io_req_prep_async(req);
1489 1490 1491 1492 1493
	if (ret) {
fail:
		io_req_complete_failed(req, ret);
		return;
	}
1494
	io_prep_async_link(req);
1495
	de = kmalloc(sizeof(*de), GFP_KERNEL);
1496
	if (!de) {
P
Pavel Begunkov 已提交
1497
		ret = -ENOMEM;
1498
		goto fail;
1499
	}
1500

1501
	spin_lock(&ctx->completion_lock);
1502
	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
1503
		spin_unlock(&ctx->completion_lock);
1504
		kfree(de);
1505
		goto queue;
1506 1507
	}

1508
	trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
1509
	de->req = req;
1510
	de->seq = seq;
1511
	list_add_tail(&de->list, &ctx->defer_list);
1512
	spin_unlock(&ctx->completion_lock);
1513 1514
}

1515
static void io_clean_op(struct io_kiocb *req)
P
Pavel Begunkov 已提交
1516
{
1517 1518
	if (req->flags & REQ_F_BUFFER_SELECTED) {
		spin_lock(&req->ctx->completion_lock);
1519
		io_put_kbuf_comp(req);
1520 1521
		spin_unlock(&req->ctx->completion_lock);
	}
P
Pavel Begunkov 已提交
1522

1523
	if (req->flags & REQ_F_NEED_CLEANUP) {
1524
		const struct io_op_def *def = &io_op_defs[req->opcode];
1525

1526 1527
		if (def->cleanup)
			def->cleanup(req);
P
Pavel Begunkov 已提交
1528
	}
1529 1530 1531 1532 1533
	if ((req->flags & REQ_F_POLLED) && req->apoll) {
		kfree(req->apoll->double_poll);
		kfree(req->apoll);
		req->apoll = NULL;
	}
1534 1535 1536 1537 1538
	if (req->flags & REQ_F_INFLIGHT) {
		struct io_uring_task *tctx = req->task->io_uring;

		atomic_dec(&tctx->inflight_tracked);
	}
1539
	if (req->flags & REQ_F_CREDS)
1540
		put_cred(req->creds);
1541 1542 1543 1544
	if (req->flags & REQ_F_ASYNC_DATA) {
		kfree(req->async_data);
		req->async_data = NULL;
	}
1545
	req->flags &= ~IO_REQ_CLEAN_FLAGS;
P
Pavel Begunkov 已提交
1546 1547
}

J
Jens Axboe 已提交
1548 1549 1550 1551 1552 1553
static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
{
	if (req->file || !io_op_defs[req->opcode].needs_file)
		return true;

	if (req->flags & REQ_F_FIXED_FILE)
1554
		req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
J
Jens Axboe 已提交
1555
	else
1556
		req->file = io_file_get_normal(req, req->cqe.fd);
J
Jens Axboe 已提交
1557

1558
	return !!req->file;
J
Jens Axboe 已提交
1559 1560
}

1561
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
J
Jens Axboe 已提交
1562
{
1563
	const struct io_op_def *def = &io_op_defs[req->opcode];
1564
	const struct cred *creds = NULL;
1565
	int ret;
J
Jens Axboe 已提交
1566

1567 1568 1569
	if (unlikely(!io_assign_file(req, issue_flags)))
		return -EBADF;

1570
	if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
1571
		creds = override_creds(req->creds);
1572

1573
	if (!def->audit_skip)
1574 1575
		audit_uring_entry(req->opcode);

1576
	ret = def->issue(req, issue_flags);
J
Jens Axboe 已提交
1577

1578
	if (!def->audit_skip)
1579 1580
		audit_uring_exit(!ret, ret);

1581 1582
	if (creds)
		revert_creds(creds);
1583 1584 1585 1586

	if (ret == IOU_OK)
		__io_req_complete(req, issue_flags);
	else if (ret != IOU_ISSUE_SKIP_COMPLETE)
J
Jens Axboe 已提交
1587
		return ret;
1588

1589
	/* If the op doesn't have a file, we're not polling for it */
1590
	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
1591
		io_iopoll_req_issued(req, issue_flags);
J
Jens Axboe 已提交
1592 1593

	return 0;
J
Jens Axboe 已提交
1594 1595
}

1596 1597 1598 1599 1600
int io_poll_issue(struct io_kiocb *req, bool *locked)
{
	io_tw_lock(req->ctx, locked);
	if (unlikely(req->task->flags & PF_EXITING))
		return -EFAULT;
1601
	return io_issue_sqe(req, IO_URING_F_NONBLOCK);
1602 1603
}

1604
struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
P
Pavel Begunkov 已提交
1605 1606 1607 1608 1609 1610 1611
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	req = io_put_req_find_next(req);
	return req ? &req->work : NULL;
}

1612
void io_wq_submit_work(struct io_wq_work *work)
J
Jens Axboe 已提交
1613 1614
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
J
Jens Axboe 已提交
1615
	const struct io_op_def *def = &io_op_defs[req->opcode];
1616 1617
	unsigned int issue_flags = IO_URING_F_UNLOCKED;
	bool needs_poll = false;
J
Jens Axboe 已提交
1618
	int ret = 0, err = -ECANCELED;
J
Jens Axboe 已提交
1619

1620 1621 1622 1623 1624
	/* one will be dropped by ->io_free_work() after returning to io-wq */
	if (!(req->flags & REQ_F_REFCOUNT))
		__io_req_set_refcount(req, 2);
	else
		req_ref_get(req);
1625

1626
	io_arm_ltimeout(req);
J
Jens Axboe 已提交
1627

1628
	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
1629
	if (work->flags & IO_WQ_WORK_CANCEL) {
1630
fail:
J
Jens Axboe 已提交
1631
		io_req_task_queue_fail(req, err);
1632 1633
		return;
	}
1634 1635 1636 1637 1638
	if (!io_assign_file(req, issue_flags)) {
		err = -EBADF;
		work->flags |= IO_WQ_WORK_CANCEL;
		goto fail;
	}
1639

1640
	if (req->flags & REQ_F_FORCE_ASYNC) {
1641 1642 1643 1644
		bool opcode_poll = def->pollin || def->pollout;

		if (opcode_poll && file_can_poll(req->file)) {
			needs_poll = true;
1645
			issue_flags |= IO_URING_F_NONBLOCK;
1646
		}
1647
	}
1648

1649 1650 1651 1652 1653 1654 1655 1656 1657 1658
	do {
		ret = io_issue_sqe(req, issue_flags);
		if (ret != -EAGAIN)
			break;
		/*
		 * We can get EAGAIN for iopolled IO even though we're
		 * forcing a sync submission from here, since we can't
		 * wait for request slots on the block side.
		 */
		if (!needs_poll) {
1659 1660
			if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
				break;
1661 1662
			cond_resched();
			continue;
1663 1664
		}

1665
		if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
1666 1667 1668 1669 1670
			return;
		/* aborted or ready, in either case retry blocking */
		needs_poll = false;
		issue_flags &= ~IO_URING_F_NONBLOCK;
	} while (1);
1671

1672
	/* avoid locking problems by failing it from a clean context */
1673
	if (ret < 0)
1674
		io_req_task_queue_fail(req, ret);
J
Jens Axboe 已提交
1675 1676
}

1677 1678
inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
				      unsigned int issue_flags)
J
Jens Axboe 已提交
1679
{
1680 1681
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = NULL;
1682
	unsigned long file_ptr;
J
Jens Axboe 已提交
1683

1684
	io_ring_submit_lock(ctx, issue_flags);
1685

1686
	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
1687
		goto out;
1688 1689 1690 1691 1692
	fd = array_index_nospec(fd, ctx->nr_user_files);
	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
	file = (struct file *) (file_ptr & FFS_MASK);
	file_ptr &= ~FFS_MASK;
	/* mask in overlapping REQ_F and FFS bits */
1693
	req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
1694
	io_req_set_rsrc_node(req, ctx, 0);
1695
	WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap));
1696
out:
1697
	io_ring_submit_unlock(ctx, issue_flags);
1698 1699
	return file;
}
1700

1701
struct file *io_file_get_normal(struct io_kiocb *req, int fd)
1702
{
1703
	struct file *file = fget(fd);
1704

1705
	trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);
J
Jens Axboe 已提交
1706

1707
	/* we don't allow fixed io_uring files */
1708
	if (file && io_is_uring_fops(file))
1709
		io_req_track_inflight(req);
P
Pavel Begunkov 已提交
1710
	return file;
J
Jens Axboe 已提交
1711 1712
}

1713
static void io_queue_async(struct io_kiocb *req, int ret)
1714 1715
	__must_hold(&req->ctx->uring_lock)
{
1716 1717 1718 1719 1720 1721 1722 1723
	struct io_kiocb *linked_timeout;

	if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
		io_req_complete_failed(req, ret);
		return;
	}

	linked_timeout = io_prep_linked_timeout(req);
1724

1725
	switch (io_arm_poll_handler(req, 0)) {
1726 1727 1728 1729 1730 1731 1732 1733
	case IO_APOLL_READY:
		io_req_task_queue(req);
		break;
	case IO_APOLL_ABORTED:
		/*
		 * Queued up for async execution, worker will release
		 * submit reference when the iocb is actually submitted.
		 */
1734
		io_kbuf_recycle(req, 0);
1735
		io_queue_iowq(req, NULL);
1736
		break;
1737 1738
	case IO_APOLL_OK:
		break;
1739 1740 1741 1742 1743 1744
	}

	if (linked_timeout)
		io_queue_linked_timeout(linked_timeout);
}

P
Pavel Begunkov 已提交
1745
static inline void io_queue_sqe(struct io_kiocb *req)
1746
	__must_hold(&req->ctx->uring_lock)
J
Jens Axboe 已提交
1747
{
1748
	int ret;
J
Jens Axboe 已提交
1749

1750
	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
1751

1752 1753
	if (req->flags & REQ_F_COMPLETE_INLINE) {
		io_req_add_compl_list(req);
1754
		return;
1755
	}
1756 1757 1758 1759
	/*
	 * We async punt it if the file wasn't marked NOWAIT, or if the file
	 * doesn't support non-blocking read/write attempts
	 */
1760
	if (likely(!ret))
1761
		io_arm_ltimeout(req);
1762 1763
	else
		io_queue_async(req, ret);
J
Jens Axboe 已提交
1764 1765
}

static void io_queue_sqe_fallback(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{
	if (unlikely(req->flags & REQ_F_FAIL)) {
		/*
		 * We don't submit, fail them all, for that replace hardlinks
		 * with normal links. Extra REQ_F_LINK is tolerated.
		 */
		req->flags &= ~REQ_F_HARDLINK;
		req->flags |= REQ_F_LINK;
		io_req_complete_failed(req, req->cqe.res);
	} else if (unlikely(req->ctx->drain_active)) {
		io_drain_req(req);
	} else {
		int ret = io_req_prep_async(req);

		if (unlikely(ret))
			io_req_complete_failed(req, ret);
		else
			io_queue_iowq(req, NULL);
	}
}

/*
 * Check SQE restrictions (opcode and flags).
 *
 * Returns 'true' if SQE is allowed, 'false' otherwise.
 */
static inline bool io_check_restriction(struct io_ring_ctx *ctx,
					struct io_kiocb *req,
					unsigned int sqe_flags)
{
	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
		return false;

	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
	    ctx->restrictions.sqe_flags_required)
		return false;

	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
			  ctx->restrictions.sqe_flags_required))
		return false;

	return true;
}

static void io_init_req_drain(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *head = ctx->submit_state.link.head;

	ctx->drain_active = true;
	if (head) {
		/*
		 * If we need to drain a request in the middle of a link, drain
		 * the head request and the next request/link after the current
		 * link. Considering sequential execution of links,
		 * REQ_F_IO_DRAIN will be maintained for every request of our
		 * link.
		 */
		head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
		ctx->drain_next = true;
	}
}

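/*
 * Per-SQE initialisation: pull the fields the kernel needs out of the
 * userspace SQE via READ_ONCE(), validate opcode and flags against the
 * opcode table and any registered restrictions, record the target fd,
 * resolve a registered personality into request credentials, and finally
 * call the opcode's ->prep() handler. Drain and force-async decisions are
 * also made here.
 */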
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
		       const struct io_uring_sqe *sqe)
	__must_hold(&ctx->uring_lock)
{
	const struct io_op_def *def;
	unsigned int sqe_flags;
	int personality;
	u8 opcode;

	/* req is partially pre-initialised, see io_preinit_req() */
	req->opcode = opcode = READ_ONCE(sqe->opcode);
	/* same numerical values with corresponding REQ_F_*, safe to copy */
	req->flags = sqe_flags = READ_ONCE(sqe->flags);
	req->cqe.user_data = READ_ONCE(sqe->user_data);
	req->file = NULL;
	req->rsrc_node = NULL;
	req->task = current;

	if (unlikely(opcode >= IORING_OP_LAST)) {
		req->opcode = 0;
		return -EINVAL;
	}
	def = &io_op_defs[opcode];
	if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
		/* enforce forwards compatibility on users */
		if (sqe_flags & ~SQE_VALID_FLAGS)
			return -EINVAL;
		if (sqe_flags & IOSQE_BUFFER_SELECT) {
			if (!def->buffer_select)
				return -EOPNOTSUPP;
			req->buf_index = READ_ONCE(sqe->buf_group);
		}
		if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
			ctx->drain_disabled = true;
		if (sqe_flags & IOSQE_IO_DRAIN) {
			if (ctx->drain_disabled)
				return -EOPNOTSUPP;
			io_init_req_drain(req);
		}
	}
	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
		if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
			return -EACCES;
		/* knock it to the slow queue path, will be drained there */
		if (ctx->drain_active)
			req->flags |= REQ_F_FORCE_ASYNC;
		/* if there is no link, we're at "next" request and need to drain */
		if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
			ctx->drain_next = false;
			ctx->drain_active = true;
			req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
		}
	}

	if (!def->ioprio && sqe->ioprio)
		return -EINVAL;
	if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;

	if (def->needs_file) {
		struct io_submit_state *state = &ctx->submit_state;

		req->cqe.fd = READ_ONCE(sqe->fd);

		/*
		 * Plug now if we have more than 2 IO left after this, and the
		 * target is potentially a read/write to block based storage.
		 */
		if (state->need_plug && def->plug) {
			state->plug_started = true;
			state->need_plug = false;
			blk_start_plug_nr_ios(&state->plug, state->submit_nr);
		}
	}

	personality = READ_ONCE(sqe->personality);
	if (personality) {
		int ret;

		req->creds = xa_load(&ctx->personalities, personality);
		if (!req->creds)
			return -EINVAL;
		get_cred(req->creds);
		ret = security_uring_override_creds(req->creds);
		if (ret) {
			put_cred(req->creds);
			return ret;
		}
		req->flags |= REQ_F_CREDS;
	}

	return def->prep(req, sqe);
}

static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
				      struct io_kiocb *req, int ret)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_submit_link *link = &ctx->submit_state.link;
	struct io_kiocb *head = link->head;

	trace_io_uring_req_failed(sqe, ctx, req, ret);

	/*
	 * Avoid breaking links in the middle as it renders links with SQPOLL
	 * unusable. Instead of failing eagerly, continue assembling the link if
	 * applicable and mark the head with REQ_F_FAIL. The link flushing code
	 * should find the flag and handle the rest.
	 */
	req_fail_link_node(req, ret);
	if (head && !(head->flags & REQ_F_FAIL))
		req_fail_link_node(head, -ECANCELED);

	if (!(req->flags & IO_REQ_LINK_FLAGS)) {
		if (head) {
			link->last->link = req;
			link->head = NULL;
			req = head;
		}
		io_queue_sqe_fallback(req);
		return ret;
	}

	if (head)
		link->last->link = req;
	else
		link->head = req;
	link->last = req;
	return 0;
}

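/*
 * Submit one initialised request, taking care of link assembly. As an
 * illustration (the SQE layout itself is up to the application):
 *
 *	SQE0 (IOSQE_IO_LINK) -> SQE1 (IOSQE_IO_LINK) -> SQE2
 *
 * SQE0 becomes link->head, SQE1 and SQE2 are chained off link->last, and
 * the whole chain is queued once an SQE without a link flag terminates it.
 * Chains that are failed or forced async go through the fallback path.
 */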
static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
			 const struct io_uring_sqe *sqe)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_link *link = &ctx->submit_state.link;
	int ret;

	ret = io_init_req(ctx, req, sqe);
	if (unlikely(ret))
		return io_submit_fail_init(sqe, req, ret);

	/* don't need @sqe from now on */
	trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
				  req->flags, true,
				  ctx->flags & IORING_SETUP_SQPOLL);

	/*
	 * If we already have a head request, queue this one for async
	 * submittal once the head completes. If we don't have a head but
	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
	 * submitted sync once the chain is complete. If none of those
	 * conditions are true (normal request), then just queue it.
	 */
	if (unlikely(link->head)) {
		ret = io_req_prep_async(req);
		if (unlikely(ret))
			return io_submit_fail_init(sqe, req, ret);

		trace_io_uring_link(ctx, req, link->head);
		link->last->link = req;
		link->last = req;

		if (req->flags & IO_REQ_LINK_FLAGS)
			return 0;
		/* last request of the link, flush it */
		req = link->head;
		link->head = NULL;
		if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
			goto fallback;

	} else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
					  REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
		if (req->flags & IO_REQ_LINK_FLAGS) {
			link->head = req;
			link->last = req;
		} else {
fallback:
			io_queue_sqe_fallback(req);
		}
		return 0;
	}

	io_queue_sqe(req);
	return 0;
}

/*
 * Batched submission is done, ensure local IO is flushed out.
 */
static void io_submit_state_end(struct io_ring_ctx *ctx)
{
	struct io_submit_state *state = &ctx->submit_state;

	if (unlikely(state->link.head))
		io_queue_sqe_fallback(state->link.head);
	/* flush only after queuing links as they can generate completions */
	io_submit_flush_completions(ctx);
	if (state->plug_started)
		blk_finish_plug(&state->plug);
}

/*
 * Start submission side cache.
 */
static void io_submit_state_start(struct io_submit_state *state,
				  unsigned int max_ios)
{
	state->plug_started = false;
	state->need_plug = max_ios > 2;
	state->submit_nr = max_ios;
	/* set only head, no need to init link_last in advance */
	state->link.head = NULL;
}

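/*
 * Publishing the SQ head with a release store orders all SQE loads in the
 * batch before userspace can observe the new head and start reusing those
 * slots. The application side typically does the mirror image, roughly
 * (illustrative only, not lifted from liburing):
 *
 *	head = smp_load_acquire(sq_khead);
 *	if (sq_tail - head < sq_ring_entries) {
 *		... fill sqes[array[sq_tail & sq_ring_mask]] ...
 *		smp_store_release(sq_ktail, sq_tail + 1);
 *	}
 */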
static void io_commit_sqring(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	/*
	 * Ensure any loads from the SQEs are done at this point,
	 * since once we write the new head, the application could
	 * write new data to them.
	 */
	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}

/*
 * Fetch an sqe, if one is available. Note this returns a pointer to memory
 * that is mapped by userspace. This means that care needs to be taken to
 * ensure that reads are stable, as we cannot rely on userspace always
 * being a good citizen. If members of the sqe are validated and then later
 * used, it's important that those reads are done through READ_ONCE() to
 * prevent a re-load down the line.
 */
static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
	unsigned head, mask = ctx->sq_entries - 1;
	unsigned sq_idx = ctx->cached_sq_head++ & mask;

	/*
	 * The cached sq head (or cq tail) serves two purposes:
	 *
	 * 1) allows us to batch the cost of updating the user visible
	 *    head updates.
	 * 2) allows the kernel side to track the head on its own, even
	 *    though the application is the one updating it.
	 */
	head = READ_ONCE(ctx->sq_array[sq_idx]);
	if (likely(head < ctx->sq_entries)) {
		/* double index for 128-byte SQEs, twice as long */
		if (ctx->flags & IORING_SETUP_SQE128)
			head <<= 1;
		return &ctx->sq_sqes[head];
	}

	/* drop invalid entries */
	ctx->cq_extra--;
	WRITE_ONCE(ctx->rings->sq_dropped,
		   READ_ONCE(ctx->rings->sq_dropped) + 1);
	return NULL;
}

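/*
 * Main submission loop, called with uring_lock held. The batch is capped
 * at the smallest of the caller's count, the SQ size and the entries
 * currently available; the matching number of task refs is taken up front
 * and any unused ones are handed back if the loop stops early. The SQ ring
 * head is only committed once the whole batch has been consumed.
 */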
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
	__must_hold(&ctx->uring_lock)
{
	unsigned int entries = io_sqring_entries(ctx);
	unsigned int left;
	int ret;

	if (unlikely(!entries))
		return 0;
	/* make sure SQ entry isn't read before tail */
	ret = left = min3(nr, ctx->sq_entries, entries);
	io_get_task_refs(left);
	io_submit_state_start(&ctx->submit_state, left);

	do {
		const struct io_uring_sqe *sqe;
		struct io_kiocb *req;

		if (unlikely(!io_alloc_req_refill(ctx)))
			break;
		req = io_alloc_req(ctx);
		sqe = io_get_sqe(ctx);
		if (unlikely(!sqe)) {
			io_req_add_to_cache(req, ctx);
			break;
		}

		/*
		 * Continue submitting even for sqe failure if the
		 * ring was setup with IORING_SETUP_SUBMIT_ALL
		 */
		if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
			left--;
			break;
		}
	} while (--left);

	if (unlikely(left)) {
		ret -= left;
		/* try again if it submitted nothing and can't allocate a req */
		if (!ret && io_req_cache_empty(ctx))
			ret = -EAGAIN;
		current->io_uring->cached_refs += left;
	}

	io_submit_state_end(ctx);
	/* Commit SQ ring head once we've consumed and submitted all SQEs */
	io_commit_sqring(ctx);
	return ret;
}

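/*
 * Per-waiter state for io_cqring_wait(). cq_tail is the CQ tail value at
 * which enough completions will have been posted; io_should_wake()
 * compares it against the cached tail and also fires on any new timeout
 * completion so waiters reliably return to userspace.
 */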
struct io_wait_queue {
	struct wait_queue_entry wq;
	struct io_ring_ctx *ctx;
	unsigned cq_tail;
	unsigned nr_timeouts;
};

static inline bool io_should_wake(struct io_wait_queue *iowq)
{
	struct io_ring_ctx *ctx = iowq->ctx;
	int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;

	/*
	 * Wake up if we have enough events, or if a timeout occurred since we
	 * started waiting. For timeouts, we always want to return to userspace,
	 * regardless of event count.
	 */
	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}

static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
			    int wake_flags, void *key)
{
	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
							wq);

	/*
	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
	 * the task, and the next invocation will do it.
	 */
	if (io_should_wake(iowq) ||
	    test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq))
		return autoremove_wake_function(curr, mode, wake_flags, key);
	return -1;
}

int io_run_task_work_sig(void)
{
	if (io_run_task_work())
		return 1;
	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
		return -ERESTARTSYS;
	if (task_sigpending(current))
		return -EINTR;
	return 0;
}

/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
					  struct io_wait_queue *iowq,
					  ktime_t timeout)
{
	int ret;
	unsigned long check_cq;

	/* make sure we run task_work before checking for signals */
	ret = io_run_task_work_sig();
	if (ret || io_should_wake(iowq))
		return ret;

	check_cq = READ_ONCE(ctx->check_cq);
	if (unlikely(check_cq)) {
		/* let the caller flush overflows, retry */
		if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
			return 1;
		if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
			return -EBADR;
	}
	if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
		return -ETIME;
	return 1;
}

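/*
 * The wait path first tries to satisfy the caller without sleeping by
 * flushing CQ overflow and running task_work. It then installs the
 * caller's signal mask and turns the optional relative timespec into an
 * absolute deadline (current monotonic time plus the timespec) before
 * sleeping exclusively on ctx->cq_wait; io_cqring_wait_schedule() decides
 * whether to retry, time out, or report a signal/-EBADR.
 */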
/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
			  const sigset_t __user *sig, size_t sigsz,
			  struct __kernel_timespec __user *uts)
{
	struct io_wait_queue iowq;
	struct io_rings *rings = ctx->rings;
	ktime_t timeout = KTIME_MAX;
	int ret;

	do {
		io_cqring_overflow_flush(ctx);
		if (io_cqring_events(ctx) >= min_events)
			return 0;
		if (!io_run_task_work())
			break;
	} while (1);

	if (sig) {
#ifdef CONFIG_COMPAT
		if (in_compat_syscall())
			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
						      sigsz);
		else
#endif
			ret = set_user_sigmask(sig, sigsz);

		if (ret)
			return ret;
	}

	if (uts) {
		struct timespec64 ts;

		if (get_timespec64(&ts, uts))
			return -EFAULT;
		timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
	}

	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
	iowq.wq.private = current;
	INIT_LIST_HEAD(&iowq.wq.entry);
	iowq.ctx = ctx;
	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;

	trace_io_uring_cqring_wait(ctx, min_events);
	do {
		/* if we can't even flush overflow, don't wait for more */
		if (!io_cqring_overflow_flush(ctx)) {
			ret = -EBUSY;
			break;
		}
		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
						TASK_INTERRUPTIBLE);
		ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
		cond_resched();
	} while (ret > 0);

	finish_wait(&ctx->cq_wait, &iowq.wq);
	restore_saved_sigmask_unless(ret == -EINTR);

	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}

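/*
 * Ring memory is allocated as zeroed, memcg-accounted compound pages
 * (__GFP_COMP), so the whole allocation is owned via its head page and
 * io_mem_free() only has to drop that single head-page reference.
 */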
static void io_mem_free(void *ptr)
{
	struct page *page;

	if (!ptr)
		return;

	page = virt_to_head_page(ptr);
	if (put_page_testzero(page))
		free_compound_page(page);
}

static void *io_mem_alloc(size_t size)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;

	return (void *) __get_free_pages(gfp, get_order(size));
}

static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
				unsigned int cq_entries, size_t *sq_offset)
{
	struct io_rings *rings;
	size_t off, sq_array_size;

	off = struct_size(rings, cqes, cq_entries);
	if (off == SIZE_MAX)
		return SIZE_MAX;
	if (ctx->flags & IORING_SETUP_CQE32) {
		if (check_shl_overflow(off, 1, &off))
			return SIZE_MAX;
	}

#ifdef CONFIG_SMP
	off = ALIGN(off, SMP_CACHE_BYTES);
	if (off == 0)
		return SIZE_MAX;
#endif

	if (sq_offset)
		*sq_offset = off;

	sq_array_size = array_size(sizeof(u32), sq_entries);
	if (sq_array_size == SIZE_MAX)
		return SIZE_MAX;

	if (check_add_overflow(off, sq_array_size, &off))
		return SIZE_MAX;

	return off;
}

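/*
 * The registered eventfd is published via RCU so the completion side can
 * dereference it without uring_lock; register/unregister run under
 * uring_lock and free the old entry through call_rcu(). eventfd_async
 * records whether the application asked to be signalled only for
 * completions generated from async (out-of-line) context.
 */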
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);
		kfree(ev_fd);
		return ret;
	}
	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	return 0;
}

static void io_eventfd_put(struct rcu_head *rcu)
{
	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);

	eventfd_ctx_put(ev_fd->cq_ev_fd);
	kfree(ev_fd);
}

static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		call_rcu(&ev_fd->rcu, io_eventfd_put);
		return 0;
	}

	return -ENXIO;
}

static void io_req_caches_free(struct io_ring_ctx *ctx)
{
	struct io_submit_state *state = &ctx->submit_state;
	int nr = 0;

	mutex_lock(&ctx->uring_lock);
	io_flush_cached_locked_reqs(ctx, state);

	while (!io_req_cache_empty(ctx)) {
		struct io_wq_work_node *node;
		struct io_kiocb *req;

		node = wq_stack_extract(&state->free_list);
		req = container_of(node, struct io_kiocb, comp_list);
		kmem_cache_free(req_cachep, req);
		nr++;
	}
	if (nr)
		percpu_ref_put_many(&ctx->refs, nr);
	mutex_unlock(&ctx->uring_lock);
}

static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
{
	struct async_poll *apoll;

	while (!list_empty(&ctx->apoll_cache)) {
		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
						poll.wait.entry);
		list_del(&apoll->poll.wait.entry);
		kfree(apoll);
	}
}

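/*
 * Final teardown of a ring context once all references are gone: stop the
 * SQ poll thread, unregister buffers, files and the eventfd, flush the
 * deferred rsrc put work, then release the rings, the SQE array, the UNIX
 * socket used for file garbage collection, and the context itself.
 */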
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
	io_sq_thread_finish(ctx);

	if (ctx->mm_account) {
		mmdrop(ctx->mm_account);
		ctx->mm_account = NULL;
	}

	io_rsrc_refs_drop(ctx);
	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
	io_wait_rsrc_data(ctx->buf_data);
	io_wait_rsrc_data(ctx->file_data);

	mutex_lock(&ctx->uring_lock);
	if (ctx->buf_data)
		__io_sqe_buffers_unregister(ctx);
	if (ctx->file_data)
		__io_sqe_files_unregister(ctx);
	if (ctx->rings)
		__io_cqring_overflow_flush(ctx, true);
	io_eventfd_unregister(ctx);
	io_flush_apoll_cache(ctx);
	mutex_unlock(&ctx->uring_lock);
	io_destroy_buffers(ctx);
	if (ctx->sq_creds)
		put_cred(ctx->sq_creds);

	/* there are no registered resources left, nobody uses it */
	if (ctx->rsrc_node)
		io_rsrc_node_destroy(ctx->rsrc_node);
	if (ctx->rsrc_backup_node)
		io_rsrc_node_destroy(ctx->rsrc_backup_node);
	flush_delayed_work(&ctx->rsrc_put_work);
	flush_delayed_work(&ctx->fallback_work);

	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
	WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
		sock_release(ctx->ring_sock);
	}
#endif
	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));

	io_mem_free(ctx->rings);
	io_mem_free(ctx->sq_sqes);

	percpu_ref_exit(&ctx->refs);
	free_uid(ctx->user);
	io_req_caches_free(ctx);
	if (ctx->hash_map)
		io_wq_put_hash(ctx->hash_map);
	kfree(ctx->cancel_hash);
	kfree(ctx->dummy_ubuf);
	kfree(ctx->io_bl);
	xa_destroy(&ctx->io_bl_xa);
	kfree(ctx);
}

static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
	struct io_ring_ctx *ctx = file->private_data;
	__poll_t mask = 0;

	poll_wait(file, &ctx->cq_wait, wait);
	/*
	 * synchronizes with barrier from wq_has_sleeper call in
	 * io_commit_cqring
	 */
	smp_rmb();
	if (!io_sqring_full(ctx))
		mask |= EPOLLOUT | EPOLLWRNORM;

	/*
	 * Don't flush cqring overflow list here, just do a simple check.
	 * Otherwise there could possibly be an ABBA deadlock:
	 *      CPU0                    CPU1
	 *      ----                    ----
	 * lock(&ctx->uring_lock);
	 *                              lock(&ep->mtx);
	 *                              lock(&ctx->uring_lock);
	 * lock(&ep->mtx);
	 *
	 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
	 * pushes them to do the flush.
	 */
	if (io_cqring_events(ctx) ||
	    test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}

2524
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
2525
{
J
Jens Axboe 已提交
2526
	const struct cred *creds;
2527

2528
	creds = xa_erase(&ctx->personalities, id);
J
Jens Axboe 已提交
2529 2530
	if (creds) {
		put_cred(creds);
2531
		return 0;
J
Jens Axboe 已提交
2532
	}
2533 2534 2535 2536

	return -EINVAL;
}

2537 2538 2539
struct io_tctx_exit {
	struct callback_head		task_work;
	struct completion		completion;
2540
	struct io_ring_ctx		*ctx;
2541 2542
};

P
Pavel Begunkov 已提交
2543
static __cold void io_tctx_exit_cb(struct callback_head *cb)
2544 2545 2546 2547 2548 2549 2550 2551 2552 2553
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_exit *work;

	work = container_of(cb, struct io_tctx_exit, task_work);
	/*
	 * When @in_idle, we're in cancellation and it's racy to remove the
	 * node. It'll be removed by the end of cancellation, just ignore it.
	 */
	if (!atomic_read(&tctx->in_idle))
2554
		io_uring_del_tctx_node((unsigned long)work->ctx);
2555 2556 2557
	complete(&work->completion);
}

P
Pavel Begunkov 已提交
2558
static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
2559 2560 2561 2562 2563 2564
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	return req->ctx == data;
}

P
Pavel Begunkov 已提交
2565
static __cold void io_ring_exit_work(struct work_struct *work)
2566
{
2567
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
2568
	unsigned long timeout = jiffies + HZ * 60 * 5;
2569
	unsigned long interval = HZ / 20;
2570 2571 2572
	struct io_tctx_exit exit;
	struct io_tctx_node *node;
	int ret;
2573

2574 2575 2576 2577 2578 2579
	/*
	 * If we're doing polled IO and end up having requests being
	 * submitted async (out-of-line), then completions can come in while
	 * we're waiting for refs to drop. We need to reap these manually,
	 * as nobody else will be looking for them.
	 */
2580
	do {
2581
		io_uring_try_cancel_requests(ctx, NULL, true);
2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592
		if (ctx->sq_data) {
			struct io_sq_data *sqd = ctx->sq_data;
			struct task_struct *tsk;

			io_sq_thread_park(sqd);
			tsk = sqd->thread;
			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
				io_wq_cancel_cb(tsk->io_uring->io_wq,
						io_cancel_ctx_cb, ctx, true);
			io_sq_thread_unpark(sqd);
		}
2593

2594 2595
		io_req_caches_free(ctx);

2596 2597 2598 2599 2600
		if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
			/* there is little hope left, don't run it too often */
			interval = HZ * 60;
		}
	} while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
2601

2602 2603 2604
	init_completion(&exit.completion);
	init_task_work(&exit.task_work, io_tctx_exit_cb);
	exit.ctx = ctx;
2605 2606 2607
	/*
	 * Some may use context even when all refs and requests have been put,
	 * and they are free to do so while still holding uring_lock or
2608
	 * completion_lock, see io_req_task_submit(). Apart from other work,
2609 2610
	 * this lock/unlock section also waits them to finish.
	 */
2611 2612
	mutex_lock(&ctx->uring_lock);
	while (!list_empty(&ctx->tctx_list)) {
2613 2614
		WARN_ON_ONCE(time_after(jiffies, timeout));

2615 2616
		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
					ctx_node);
2617 2618
		/* don't spin on a single task if cancellation failed */
		list_rotate_left(&ctx->tctx_list);
2619 2620 2621 2622 2623 2624 2625 2626 2627
		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
		if (WARN_ON_ONCE(ret))
			continue;

		mutex_unlock(&ctx->uring_lock);
		wait_for_completion(&exit.completion);
		mutex_lock(&ctx->uring_lock);
	}
	mutex_unlock(&ctx->uring_lock);
2628 2629
	spin_lock(&ctx->completion_lock);
	spin_unlock(&ctx->completion_lock);
2630

2631 2632 2633
	io_ring_ctx_free(ctx);
}

P
Pavel Begunkov 已提交
2634
static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
J
Jens Axboe 已提交
2635
{
2636 2637 2638
	unsigned long index;
	struct creds *creds;

J
Jens Axboe 已提交
2639 2640
	mutex_lock(&ctx->uring_lock);
	percpu_ref_kill(&ctx->refs);
2641
	if (ctx->rings)
2642
		__io_cqring_overflow_flush(ctx, true);
2643 2644
	xa_for_each(&ctx->personalities, index, creds)
		io_unregister_personality(ctx, index);
J
Jens Axboe 已提交
2645 2646
	mutex_unlock(&ctx->uring_lock);

2647 2648 2649 2650 2651 2652 2653
	/* failed during ring init, it couldn't have issued any requests */
	if (ctx->rings) {
		io_kill_timeouts(ctx, NULL, true);
		io_poll_remove_all(ctx, NULL, true);
		/* if we failed setting up the ctx, we might not have any rings */
		io_iopoll_try_reap_events(ctx);
	}
2654

2655
	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
2656 2657 2658 2659 2660 2661 2662
	/*
	 * Use system_unbound_wq to avoid spawning tons of event kworkers
	 * if we're exiting a ton of rings at the same time. It just adds
	 * noise and overhead, there's no discernable change in runtime
	 * over using system_wq.
	 */
	queue_work(system_unbound_wq, &ctx->exit_work);
J
Jens Axboe 已提交
2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673
}

static int io_uring_release(struct inode *inode, struct file *file)
{
	struct io_ring_ctx *ctx = file->private_data;

	file->private_data = NULL;
	io_ring_ctx_wait_and_kill(ctx);
	return 0;
}

2674 2675
struct io_task_cancel {
	struct task_struct *task;
2676
	bool all;
2677
};
2678

2679
static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
2680
{
2681
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2682
	struct io_task_cancel *cancel = data;
2683

2684
	return io_match_task_safe(req, cancel->task, cancel->all);
2685 2686
}

P
Pavel Begunkov 已提交
2687 2688 2689
static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all)
2690
{
2691
	struct io_defer_entry *de;
2692 2693
	LIST_HEAD(list);

2694
	spin_lock(&ctx->completion_lock);
2695
	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
2696
		if (io_match_task_safe(de->req, task, cancel_all)) {
2697 2698 2699 2700
			list_cut_position(&list, &ctx->defer_list, &de->list);
			break;
		}
	}
2701
	spin_unlock(&ctx->completion_lock);
2702 2703
	if (list_empty(&list))
		return false;
2704 2705 2706 2707

	while (!list_empty(&list)) {
		de = list_first_entry(&list, struct io_defer_entry, list);
		list_del_init(&de->list);
2708
		io_req_complete_failed(de->req, -ECANCELED);
2709 2710
		kfree(de);
	}
2711
	return true;
2712 2713
}

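/*
 * Cancel io-wq work on behalf of every task that has this ring in its
 * tctx list. The list walk runs under uring_lock, which also keeps the
 * per-task io_wq instances alive for the duration of the walk.
 */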
static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
	struct io_tctx_node *node;
	enum io_wq_cancel cret;
	bool ret = false;

	mutex_lock(&ctx->uring_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		/*
		 * io_wq will stay alive while we hold uring_lock, because it's
		 * killed after ctx nodes, which requires to take the lock.
		 */
		if (!tctx || !tctx->io_wq)
			continue;
		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
	}
	mutex_unlock(&ctx->uring_lock);

	return ret;
}

P
Pavel Begunkov 已提交
2738 2739 2740
static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
						struct task_struct *task,
						bool cancel_all)
2741
{
2742
	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
2743
	struct io_uring_task *tctx = task ? task->io_uring : NULL;
2744

2745 2746 2747 2748
	/* failed during ring init, it couldn't have issued any requests */
	if (!ctx->rings)
		return;

2749 2750 2751 2752
	while (1) {
		enum io_wq_cancel cret;
		bool ret = false;

2753 2754 2755 2756 2757 2758 2759
		if (!task) {
			ret |= io_uring_try_cancel_iowq(ctx);
		} else if (tctx && tctx->io_wq) {
			/*
			 * Cancels requests of all rings, not only @ctx, but
			 * it's fine as the task is in exit/exec.
			 */
2760
			cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
2761 2762 2763 2764 2765
					       &cancel, true);
			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
		}

		/* SQPOLL thread does its own polling */
2766
		if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
2767
		    (ctx->sq_data && ctx->sq_data->thread == current)) {
2768
			while (!wq_list_empty(&ctx->iopoll_list)) {
2769 2770 2771 2772 2773
				io_iopoll_try_reap_events(ctx);
				ret = true;
			}
		}

2774 2775 2776
		ret |= io_cancel_defer_files(ctx, task, cancel_all);
		ret |= io_poll_remove_all(ctx, task, cancel_all);
		ret |= io_kill_timeouts(ctx, task, cancel_all);
2777 2778
		if (task)
			ret |= io_run_task_work();
2779 2780 2781 2782 2783 2784
		if (!ret)
			break;
		cond_resched();
	}
}

2785
static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
2786
{
2787
	if (tracked)
2788
		return atomic_read(&tctx->inflight_tracked);
2789 2790 2791
	return percpu_counter_sum(&tctx->inflight);
}

2792 2793
/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
2794
 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
2795
 */
2796
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
2797
{
2798
	struct io_uring_task *tctx = current->io_uring;
2799
	struct io_ring_ctx *ctx;
2800 2801
	s64 inflight;
	DEFINE_WAIT(wait);
2802

2803 2804
	WARN_ON_ONCE(sqd && sqd->thread != current);

2805 2806
	if (!current->io_uring)
		return;
2807 2808 2809
	if (tctx->io_wq)
		io_wq_exit_start(tctx->io_wq);

2810 2811
	atomic_inc(&tctx->in_idle);
	do {
2812
		io_uring_drop_tctx_refs(current);
2813
		/* read completions before cancelations */
2814
		inflight = tctx_inflight(tctx, !cancel_all);
2815 2816
		if (!inflight)
			break;
2817

2818 2819 2820
		if (!sqd) {
			struct io_tctx_node *node;
			unsigned long index;
2821

2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833
			xa_for_each(&tctx->xa, index, node) {
				/* sqpoll task will cancel all its requests */
				if (node->ctx->sq_data)
					continue;
				io_uring_try_cancel_requests(node->ctx, current,
							     cancel_all);
			}
		} else {
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_uring_try_cancel_requests(ctx, current,
							     cancel_all);
		}
2834

2835 2836
		prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
		io_run_task_work();
2837
		io_uring_drop_tctx_refs(current);
2838

2839
		/*
2840 2841 2842
		 * If we've seen completions, retry without waiting. This
		 * avoids a race where a completion comes in before we did
		 * prepare_to_wait().
2843
		 */
2844
		if (inflight == tctx_inflight(tctx, !cancel_all))
2845
			schedule();
2846
		finish_wait(&tctx->wait, &wait);
2847
	} while (1);
2848

P
Pavel Begunkov 已提交
2849
	io_uring_clean_tctx(tctx);
2850
	if (cancel_all) {
2851 2852 2853 2854 2855
		/*
		 * We shouldn't run task_works after cancel, so just leave
		 * ->in_idle set for normal exit.
		 */
		atomic_dec(&tctx->in_idle);
2856 2857 2858
		/* for exec all current's requests should be gone, kill tctx */
		__io_uring_free(current);
	}
2859 2860
}

2861
void __io_uring_cancel(bool cancel_all)
2862
{
2863
	io_uring_cancel_generic(cancel_all, NULL);
2864 2865
}

2866 2867
static void *io_uring_validate_mmap_request(struct file *file,
					    loff_t pgoff, size_t sz)
J
Jens Axboe 已提交
2868 2869
{
	struct io_ring_ctx *ctx = file->private_data;
2870
	loff_t offset = pgoff << PAGE_SHIFT;
J
Jens Axboe 已提交
2871 2872 2873 2874 2875
	struct page *page;
	void *ptr;

	switch (offset) {
	case IORING_OFF_SQ_RING:
2876 2877
	case IORING_OFF_CQ_RING:
		ptr = ctx->rings;
J
Jens Axboe 已提交
2878 2879 2880 2881 2882
		break;
	case IORING_OFF_SQES:
		ptr = ctx->sq_sqes;
		break;
	default:
2883
		return ERR_PTR(-EINVAL);
J
Jens Axboe 已提交
2884 2885 2886
	}

	page = virt_to_head_page(ptr);
2887
	if (sz > page_size(page))
2888 2889 2890 2891 2892 2893 2894
		return ERR_PTR(-EINVAL);

	return ptr;
}

#ifdef CONFIG_MMU

P
Pavel Begunkov 已提交
2895
static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
2896 2897 2898 2899 2900 2901 2902 2903
{
	size_t sz = vma->vm_end - vma->vm_start;
	unsigned long pfn;
	void *ptr;

	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);
J
Jens Axboe 已提交
2904 2905 2906 2907 2908

	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

#else /* !CONFIG_MMU */

static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
}

static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}

static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *ptr;

	ptr = io_uring_validate_mmap_request(file, pgoff, len);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	return (unsigned long) ptr;
}

#endif /* !CONFIG_MMU */

static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
{
	if (flags & IORING_ENTER_EXT_ARG) {
		struct io_uring_getevents_arg arg;

		if (argsz != sizeof(arg))
			return -EINVAL;
		if (copy_from_user(&arg, argp, sizeof(arg)))
			return -EFAULT;
	}
	return 0;
}

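/*
 * With IORING_ENTER_EXT_ARG the last two io_uring_enter() arguments
 * describe an io_uring_getevents_arg structure rather than a bare sigset.
 * Roughly, from userspace (illustrative usage, not taken from liburing):
 *
 *	struct io_uring_getevents_arg arg = {
 *		.sigmask	= (__u64)(unsigned long)&sigmask,
 *		.sigmask_sz	= _NSIG / 8,
 *		.ts		= (__u64)(unsigned long)&ts,
 *	};
 *	io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
 *		       &arg, sizeof(arg));
 */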
static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
			  struct __kernel_timespec __user **ts,
			  const sigset_t __user **sig)
{
	struct io_uring_getevents_arg arg;

	/*
	 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
	 * is just a pointer to the sigset_t.
	 */
	if (!(flags & IORING_ENTER_EXT_ARG)) {
		*sig = (const sigset_t __user *) argp;
		*ts = NULL;
		return 0;
	}

	/*
	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
	 * timespec and sigset_t pointers if good.
	 */
	if (*argsz != sizeof(arg))
		return -EINVAL;
	if (copy_from_user(&arg, argp, sizeof(arg)))
		return -EFAULT;
	if (arg.pad)
		return -EINVAL;
	*sig = u64_to_user_ptr(arg.sigmask);
	*argsz = arg.sigmask_sz;
	*ts = u64_to_user_ptr(arg.ts);
	return 0;
}

J
Jens Axboe 已提交
2981
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
2982 2983
		u32, min_complete, u32, flags, const void __user *, argp,
		size_t, argsz)
J
Jens Axboe 已提交
2984 2985 2986
{
	struct io_ring_ctx *ctx;
	struct fd f;
2987
	long ret;
J
Jens Axboe 已提交
2988

2989
	io_run_task_work();
2990

2991
	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
2992 2993
			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
			       IORING_ENTER_REGISTERED_RING)))
J
Jens Axboe 已提交
2994 2995
		return -EINVAL;

2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006
	/*
	 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
	 * need only dereference our task private array to find it.
	 */
	if (flags & IORING_ENTER_REGISTERED_RING) {
		struct io_uring_task *tctx = current->io_uring;

		if (!tctx || fd >= IO_RINGFD_REG_MAX)
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		f.file = tctx->registered_rings[fd];
3007
		f.flags = 0;
3008 3009 3010
	} else {
		f = fdget(fd);
	}
J
Jens Axboe 已提交
3011

3012 3013 3014
	if (unlikely(!f.file))
		return -EBADF;

J
Jens Axboe 已提交
3015
	ret = -EOPNOTSUPP;
3016
	if (unlikely(!io_is_uring_fops(f.file)))
J
Jens Axboe 已提交
3017 3018 3019 3020
		goto out_fput;

	ret = -ENXIO;
	ctx = f.file->private_data;
3021
	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
J
Jens Axboe 已提交
3022 3023
		goto out_fput;

3024
	ret = -EBADFD;
3025
	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
3026 3027
		goto out;

J
Jens Axboe 已提交
3028 3029 3030 3031 3032
	/*
	 * For SQ polling, the thread will do all submissions and completions.
	 * Just return the requested submit count, and wake the thread if
	 * we were asked to.
	 */
3033
	ret = 0;
J
Jens Axboe 已提交
3034
	if (ctx->flags & IORING_SETUP_SQPOLL) {
3035
		io_cqring_overflow_flush(ctx);
3036

3037 3038
		if (unlikely(ctx->sq_data->thread == NULL)) {
			ret = -EOWNERDEAD;
3039
			goto out;
3040
		}
J
Jens Axboe 已提交
3041
		if (flags & IORING_ENTER_SQ_WAKEUP)
3042
			wake_up(&ctx->sq_data->wait);
3043 3044 3045 3046 3047
		if (flags & IORING_ENTER_SQ_WAIT) {
			ret = io_sqpoll_wait_sq(ctx);
			if (ret)
				goto out;
		}
3048
		ret = to_submit;
3049
	} else if (to_submit) {
3050
		ret = io_uring_add_tctx_node(ctx);
3051 3052
		if (unlikely(ret))
			goto out;
3053

J
Jens Axboe 已提交
3054
		mutex_lock(&ctx->uring_lock);
3055 3056
		ret = io_submit_sqes(ctx, to_submit);
		if (ret != to_submit) {
3057
			mutex_unlock(&ctx->uring_lock);
3058
			goto out;
3059 3060 3061 3062
		}
		if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
			goto iopoll_locked;
		mutex_unlock(&ctx->uring_lock);
J
Jens Axboe 已提交
3063 3064
	}
	if (flags & IORING_ENTER_GETEVENTS) {
3065
		int ret2;
3066
		if (ctx->syscall_iopoll) {
3067 3068 3069 3070 3071 3072 3073 3074
			/*
			 * We disallow the app entering submit/complete with
			 * polling, but we still need to lock the ring to
			 * prevent racing with polled issue that got punted to
			 * a workqueue.
			 */
			mutex_lock(&ctx->uring_lock);
iopoll_locked:
3075 3076 3077 3078 3079
			ret2 = io_validate_ext_arg(flags, argp, argsz);
			if (likely(!ret2)) {
				min_complete = min(min_complete,
						   ctx->cq_entries);
				ret2 = io_iopoll_check(ctx, min_complete);
3080 3081
			}
			mutex_unlock(&ctx->uring_lock);
J
Jens Axboe 已提交
3082
		} else {
3083 3084 3085
			const sigset_t __user *sig;
			struct __kernel_timespec __user *ts;

3086 3087 3088 3089 3090 3091 3092
			ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
			if (likely(!ret2)) {
				min_complete = min(min_complete,
						   ctx->cq_entries);
				ret2 = io_cqring_wait(ctx, min_complete, sig,
						      argsz, ts);
			}
J
Jens Axboe 已提交
3093
		}
3094

3095
		if (!ret) {
3096
			ret = ret2;
J
Jens Axboe 已提交
3097

3098 3099 3100 3101 3102 3103 3104 3105
			/*
			 * EBADR indicates that one or more CQE were dropped.
			 * Once the user has been informed we can clear the bit
			 * as they are obviously ok with those drops.
			 */
			if (unlikely(ret2 == -EBADR))
				clear_bit(IO_CHECK_CQ_DROPPED_BIT,
					  &ctx->check_cq);
J
Jens Axboe 已提交
3106
		}
J
Jens Axboe 已提交
3107 3108
	}

3109
out:
3110
	percpu_ref_put(&ctx->refs);
J
Jens Axboe 已提交
3111
out_fput:
3112
	fdput(f);
3113
	return ret;
J
Jens Axboe 已提交
3114 3115 3116 3117 3118
}

static const struct file_operations io_uring_fops = {
	.release	= io_uring_release,
	.mmap		= io_uring_mmap,
3119 3120 3121 3122
#ifndef CONFIG_MMU
	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
J
Jens Axboe 已提交
3123
	.poll		= io_uring_poll,
3124
#ifdef CONFIG_PROC_FS
3125
	.show_fdinfo	= io_uring_show_fdinfo,
3126
#endif
J
Jens Axboe 已提交
3127 3128
};

3129 3130 3131 3132 3133
bool io_is_uring_fops(struct file *file)
{
	return file->f_op == &io_uring_fops;
}

P
Pavel Begunkov 已提交
3134 3135
static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
					 struct io_uring_params *p)
J
Jens Axboe 已提交
3136
{
3137 3138
	struct io_rings *rings;
	size_t size, sq_array_offset;
J
Jens Axboe 已提交
3139

3140 3141 3142 3143
	/* make sure these are sane, as we already accounted them */
	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

3144
	size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
3145 3146 3147 3148 3149
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	rings = io_mem_alloc(size);
	if (!rings)
J
Jens Axboe 已提交
3150 3151
		return -ENOMEM;

3152 3153 3154 3155 3156 3157
	ctx->rings = rings;
	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
	rings->sq_ring_mask = p->sq_entries - 1;
	rings->cq_ring_mask = p->cq_entries - 1;
	rings->sq_ring_entries = p->sq_entries;
	rings->cq_ring_entries = p->cq_entries;
J
Jens Axboe 已提交
3158

3159 3160 3161 3162
	if (p->flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
3163 3164 3165
	if (size == SIZE_MAX) {
		io_mem_free(ctx->rings);
		ctx->rings = NULL;
J
Jens Axboe 已提交
3166
		return -EOVERFLOW;
3167
	}
J
Jens Axboe 已提交
3168 3169

	ctx->sq_sqes = io_mem_alloc(size);
3170 3171 3172
	if (!ctx->sq_sqes) {
		io_mem_free(ctx->rings);
		ctx->rings = NULL;
J
Jens Axboe 已提交
3173
		return -ENOMEM;
3174
	}
J
Jens Axboe 已提交
3175 3176 3177 3178

	return 0;
}

3179 3180 3181 3182 3183 3184 3185 3186
static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
{
	int ret, fd;

	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
	if (fd < 0)
		return fd;

3187
	ret = io_uring_add_tctx_node(ctx);
3188 3189 3190 3191 3192 3193 3194 3195
	if (ret) {
		put_unused_fd(fd);
		return ret;
	}
	fd_install(fd, file);
	return fd;
}

J
Jens Axboe 已提交
3196 3197 3198 3199 3200 3201
/*
 * Allocate an anonymous fd, this is what constitutes the application
 * visible backing of an io_uring instance. The application mmaps this
 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
 * we have to tie this fd to a socket for file garbage collection purposes.
 */
3202
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
J
Jens Axboe 已提交
3203 3204
{
	struct file *file;
3205
#if defined(CONFIG_UNIX)
J
Jens Axboe 已提交
3206 3207 3208 3209 3210
	int ret;

	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
				&ctx->ring_sock);
	if (ret)
3211
		return ERR_PTR(ret);
J
Jens Axboe 已提交
3212 3213
#endif

3214 3215
	file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
					 O_RDWR | O_CLOEXEC, NULL);
J
Jens Axboe 已提交
3216
#if defined(CONFIG_UNIX)
3217 3218 3219 3220 3221
	if (IS_ERR(file)) {
		sock_release(ctx->ring_sock);
		ctx->ring_sock = NULL;
	} else {
		ctx->ring_sock->file = file;
3222
	}
J
Jens Axboe 已提交
3223
#endif
3224
	return file;
J
Jens Axboe 已提交
3225 3226
}

P
Pavel Begunkov 已提交
3227 3228
static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
				  struct io_uring_params __user *params)
J
Jens Axboe 已提交
3229 3230
{
	struct io_ring_ctx *ctx;
3231
	struct file *file;
J
Jens Axboe 已提交
3232 3233
	int ret;

3234
	if (!entries)
J
Jens Axboe 已提交
3235
		return -EINVAL;
3236 3237 3238 3239 3240
	if (entries > IORING_MAX_ENTRIES) {
		if (!(p->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		entries = IORING_MAX_ENTRIES;
	}
J
Jens Axboe 已提交
3241 3242 3243 3244 3245

	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
	 * since the sqes are only used at submission time. This allows for
3246 3247 3248
	 * some flexibility in overcommitting a bit. If the application has
	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
	 * of CQ ring entries manually.
J
Jens Axboe 已提交
3249 3250
	 */
	p->sq_entries = roundup_pow_of_two(entries);
3251 3252 3253 3254 3255 3256
	if (p->flags & IORING_SETUP_CQSIZE) {
		/*
		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
		 * to a power-of-two, if it isn't already. We do NOT impose
		 * any cq vs sq ring sizing.
		 */
3257
		if (!p->cq_entries)
3258
			return -EINVAL;
3259 3260 3261 3262 3263
		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
			if (!(p->flags & IORING_SETUP_CLAMP))
				return -EINVAL;
			p->cq_entries = IORING_MAX_CQ_ENTRIES;
		}
3264 3265 3266
		p->cq_entries = roundup_pow_of_two(p->cq_entries);
		if (p->cq_entries < p->sq_entries)
			return -EINVAL;
3267 3268 3269
	} else {
		p->cq_entries = 2 * p->sq_entries;
	}
J
Jens Axboe 已提交
3270 3271

	ctx = io_ring_ctx_alloc(p);
J
Jens Axboe 已提交
3272
	if (!ctx)
J
Jens Axboe 已提交
3273
		return -ENOMEM;
3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284

	/*
	 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
	 * space applications don't need to do io completion events
	 * polling again, they can rely on io_sq_thread to do polling
	 * work, which can reduce cpu usage and uring_lock contention.
	 */
	if (ctx->flags & IORING_SETUP_IOPOLL &&
	    !(ctx->flags & IORING_SETUP_SQPOLL))
		ctx->syscall_iopoll = 1;

J
Jens Axboe 已提交
3285
	ctx->compat = in_compat_syscall();
J
Jens Axboe 已提交
3286 3287
	if (!capable(CAP_IPC_LOCK))
		ctx->user = get_uid(current_user());
3288

3289
	/*
3290 3291
	 * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
	 * COOP_TASKRUN is set, then IPIs are never needed by the app.
3292
	 */
3293 3294 3295
	ret = -EINVAL;
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		/* IPI related flags don't make sense with SQPOLL */
3296 3297
		if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
				  IORING_SETUP_TASKRUN_FLAG))
3298
			goto err;
3299
		ctx->notify_method = TWA_SIGNAL_NO_IPI;
3300 3301 3302
	} else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
		ctx->notify_method = TWA_SIGNAL_NO_IPI;
	} else {
3303 3304
		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
			goto err;
3305
		ctx->notify_method = TWA_SIGNAL;
3306
	}
3307

3308 3309 3310 3311 3312 3313
	/*
	 * This is just grabbed for accounting purposes. When a process exits,
	 * the mm is exited and dropped before the files, hence we need to hang
	 * on to this mm purely for the purposes of being able to unaccount
	 * memory (locked/pinned vm). It's not used for anything else.
	 */
3314
	mmgrab(current->mm);
3315
	ctx->mm_account = current->mm;
3316

J
Jens Axboe 已提交
3317 3318 3319 3320
	ret = io_allocate_scq_urings(ctx, p);
	if (ret)
		goto err;

3321
	ret = io_sq_offload_create(ctx, p);
J
Jens Axboe 已提交
3322 3323
	if (ret)
		goto err;
3324
	/* always set a rsrc node */
3325 3326 3327
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		goto err;
3328
	io_rsrc_node_switch(ctx, NULL);
J
Jens Axboe 已提交
3329 3330

	memset(&p->sq_off, 0, sizeof(p->sq_off));
3331 3332 3333 3334 3335 3336 3337
	p->sq_off.head = offsetof(struct io_rings, sq.head);
	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
J
Jens Axboe 已提交
3338 3339

	memset(&p->cq_off, 0, sizeof(p->cq_off));
3340 3341 3342 3343 3344 3345
	p->cq_off.head = offsetof(struct io_rings, cq.head);
	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
	p->cq_off.cqes = offsetof(struct io_rings, cqes);
3346
	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
3347

3348 3349
	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
3350
			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
3351
			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
3352
			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
3353 3354
			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
			IORING_FEAT_LINKED_FILE;
3355 3356 3357 3358 3359

	if (copy_to_user(params, p, sizeof(*p))) {
		ret = -EFAULT;
		goto err;
	}
3360

3361 3362 3363 3364 3365 3366
	file = io_uring_get_file(ctx);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto err;
	}

3367 3368 3369 3370
	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
3371 3372 3373 3374 3375 3376
	ret = io_uring_install_fd(ctx, file);
	if (ret < 0) {
		/* fput will clean it up */
		fput(file);
		return ret;
	}
3377

3378
	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
J
Jens Axboe 已提交
3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401
	return ret;
err:
	io_ring_ctx_wait_and_kill(ctx);
	return ret;
}

/*
 * Sets up an aio uring context, and returns the fd. Applications asks for a
 * ring size, we return the actual sq/cq ring sizes (among other things) in the
 * params structure passed in.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
	struct io_uring_params p;
	int i;

	if (copy_from_user(&p, params, sizeof(p)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
		if (p.resv[i])
			return -EINVAL;
	}

J
Jens Axboe 已提交
3402
	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
3403
			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
3404
			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
3405
			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
3406
			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
S
Stefan Roesch 已提交
3407
			IORING_SETUP_SQE128 | IORING_SETUP_CQE32))
J
Jens Axboe 已提交
3408 3409
		return -EINVAL;

3410
	return io_uring_create(entries, &p, params);
J
Jens Axboe 已提交
3411 3412 3413 3414 3415 3416 3417 3418
}

SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}

P
			   unsigned nr_args)
3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_op_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

3459 3460
static int io_register_personality(struct io_ring_ctx *ctx)
{
J
Jens Axboe 已提交
3461
	const struct cred *creds;
3462
	u32 id;
J
Jens Axboe 已提交
3463
	int ret;
3464

J
Jens Axboe 已提交
3465
	creds = get_current_cred();
J
Jens Axboe 已提交
3466

3467 3468
	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
3469 3470 3471 3472 3473
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
3474 3475
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}
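
/*
 * Example (illustrative only): restricting a ring that was created with
 * IORING_SETUP_R_DISABLED so that, once enabled, only the listed register
 * opcode and SQE opcodes are accepted. `ring_fd` is assumed to be such a
 * ring; the entries are zeroed first so the unused union/reserved fields
 * stay clean.
 *
 *	struct io_uring_restriction res[3];
 *
 *	memset(res, 0, sizeof(res));
 *	res[0].opcode = IORING_RESTRICTION_REGISTER_OP;
 *	res[0].register_op = IORING_REGISTER_BUFFERS;
 *	res[1].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[1].sqe_op = IORING_OP_READV;
 *	res[2].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[2].sqe_op = IORING_OP_WRITEV;
 *	if (syscall(__NR_io_uring_register, ring_fd,
 *		    IORING_REGISTER_RESTRICTIONS, res, 3) < 0)
 *		perror("IORING_REGISTER_RESTRICTIONS");
 */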

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
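
/*
 * Example (illustrative only): the second half of the R_DISABLED flow.
 * Once any restrictions have been registered, the ring is switched live
 * with a plain register call; no argument is allowed.
 *
 *	if (syscall(__NR_io_uring_register, ring_fd,
 *		    IORING_REGISTER_ENABLE_RINGS, NULL, 0) < 0)
 *		perror("IORING_REGISTER_ENABLE_RINGS");
 */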

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	struct io_uring_task *tctx = current->io_uring;
	cpumask_var_t new_mask;
	int ret;

	if (!tctx || !tctx->io_wq)
		return -EINVAL;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

	if (in_compat_syscall()) {
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	} else {
		ret = copy_from_user(new_mask, arg, len);
	}

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}
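
/*
 * Example (illustrative only): pinning the calling task's io-wq workers
 * to CPUs 0 and 1. The length argument is the mask size in bytes; the
 * code above truncates anything larger than the kernel's cpumask.
 * `ring_fd` is assumed.
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	CPU_SET(1, &mask);
 *	if (syscall(__NR_io_uring_register, ring_fd,
 *		    IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask)) < 0)
 *		perror("IORING_REGISTER_IOWQ_AFF");
 */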

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	struct io_uring_task *tctx = current->io_uring;

	if (!tctx || !tctx->io_wq)
		return -EINVAL;

	return io_wq_cpu_affinity(tctx->io_wq, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}
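
/*
 * Example (illustrative only): capping the io-wq worker pools. Index 0 is
 * the bounded pool, index 1 the unbounded pool; a zero entry leaves that
 * limit untouched, and the previous limits are written back into the
 * array on return. `ring_fd` is assumed.
 *
 *	__u32 limits[2] = { 8, 0 };	// cap bounded at 8, keep unbounded
 *
 *	if (syscall(__NR_io_uring_register, ring_fd,
 *		    IORING_REGISTER_IOWQ_MAX_WORKERS, limits, 2) < 0)
 *		perror("IORING_REGISTER_IOWQ_MAX_WORKERS");
 *	// limits[] now holds the old values
 */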

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We're inside the ring mutex, if the ref is already dying, then
	 * someone else killed the ctx or is already going through
	 * io_uring_register().
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return -ENXIO;

	if (ctx->restricted) {
		if (opcode >= IORING_REGISTER_LAST)
			return -EINVAL;
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
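
/*
 * Example (illustrative only): registering a provided-buffer ring, one of
 * the cases handled above. The ring memory is allocated and owned by
 * userspace, ring_entries must be a power of two, and the group id used
 * here (42) is arbitrary; `ring_fd` is assumed and error checks on mmap()
 * are trimmed.
 *
 *	#include <sys/mman.h>
 *
 *	struct io_uring_buf_reg reg;
 *	struct io_uring_buf_ring *br;
 *
 *	br = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 *	memset(&reg, 0, sizeof(reg));	// reserved fields must be zero
 *	reg.ring_addr = (unsigned long)br;
 *	reg.ring_entries = 8;
 *	reg.bgid = 42;
 *	if (syscall(__NR_io_uring_register, ring_fd,
 *		    IORING_REGISTER_PBUF_RING, &reg, 1) < 0)
 *		perror("IORING_REGISTER_PBUF_RING");
 */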

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct fd f;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	ret = -EOPNOTSUPP;
	if (!io_is_uring_fops(f.file))
		goto out_fput;

	ctx = f.file->private_data;

	io_run_task_work();

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	fdput(f);
	return ret;
}

static int __init io_uring_init(void)
{
#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
} while (0)

#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
	BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
	BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
	BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
	BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
	BUILD_BUG_SQE_ELEM(8,  __u64,  off);
	BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
	BUILD_BUG_SQE_ELEM(16, __u64,  addr);
	BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
	BUILD_BUG_SQE_ELEM(24, __u32,  len);
	BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
	BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
	BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
	BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
	BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);

	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
		     sizeof(struct io_uring_rsrc_update));
	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
		     sizeof(struct io_uring_rsrc_update2));

	/* ->buf_index is u16 */
	BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
	BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
		     offsetof(struct io_uring_buf_ring, tail));

	/* should fit into one byte */
	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);

	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));

	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));

	io_uring_optable_init();

	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				SLAB_ACCOUNT);
	return 0;
};
__initcall(io_uring_init);