// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/hashtable.h>
#include <linux/io_uring.h>

#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "refs.h"
#include "opdef.h"
#include "kbuf.h"
#include "poll.h"
#include "cancel.h"

struct io_poll_update {
	struct file			*file;
	u64				old_user_data;
	u64				new_user_data;
	__poll_t			events;
	bool				update_events;
	bool				update_user_data;
};

struct io_poll_table {
	struct poll_table_struct pt;
	struct io_kiocb *req;
	int nr_entries;
	int error;
	bool owning;
	/* output value, set only if arm poll returns >0 */
	__poll_t result_mask;
};

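/*
 * Each poll request carries an ownership reference count in the low 31 bits
 * of ->poll_refs; bit 31 marks the request as cancelled. Only the owner may
 * modify or complete the request, see io_poll_get_ownership() below.
 */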
#define IO_POLL_CANCEL_FLAG	BIT(31)
#define IO_POLL_REF_MASK	GENMASK(30, 0)

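/*
 * Wait queue entries stash the owning io_kiocb in ->private; bit 0
 * (IO_WQE_F_DOUBLE) tags entries that belong to the second (double) poll,
 * see wqe_to_req() and wqe_is_double().
 */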
#define IO_WQE_F_DOUBLE		1

static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe)
{
	unsigned long priv = (unsigned long)wqe->private;

	return (struct io_kiocb *)(priv & ~IO_WQE_F_DOUBLE);
}

static inline bool wqe_is_double(struct wait_queue_entry *wqe)
{
	unsigned long priv = (unsigned long)wqe->private;

	return priv & IO_WQE_F_DOUBLE;
}

/*
 * If the refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We
 * can bump it and acquire ownership. It's disallowed to modify requests while
 * not owning it, which prevents races both when enqueueing task_work and
 * between arming poll and wakeups.
 */
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}

static void io_poll_mark_cancelled(struct io_kiocb *req)
{
	atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
}

static struct io_poll *io_poll_get_double(struct io_kiocb *req)
{
	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
	if (req->opcode == IORING_OP_POLL_ADD)
		return req->async_data;
	return req->apoll->double_poll;
}

static struct io_poll *io_poll_get_single(struct io_kiocb *req)
{
	if (req->opcode == IORING_OP_POLL_ADD)
		return io_kiocb_to_cmd(req, struct io_poll);
	return &req->apoll->poll;
}

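/*
 * Poll requests are hashed by their CQE user_data into ->cancel_table so
 * cancellation can find them later; each hash bucket has its own spinlock.
 */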
static void io_poll_req_insert(struct io_kiocb *req)
{
	struct io_hash_table *table = &req->ctx->cancel_table;
	u32 index = hash_long(req->cqe.user_data, table->hash_bits);
	struct io_hash_bucket *hb = &table->hbs[index];

	spin_lock(&hb->lock);
	hlist_add_head(&req->hash_node, &hb->list);
	spin_unlock(&hb->lock);
}

static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	struct io_hash_table *table = &req->ctx->cancel_table;
	u32 index = hash_long(req->cqe.user_data, table->hash_bits);
	spinlock_t *lock = &table->hbs[index].lock;

	spin_lock(lock);
	hash_del(&req->hash_node);
	spin_unlock(lock);
}

static void io_poll_req_insert_locked(struct io_kiocb *req)
{
	struct io_hash_table *table = &req->ctx->cancel_table_locked;
	u32 index = hash_long(req->cqe.user_data, table->hash_bits);

	lockdep_assert_held(&req->ctx->uring_lock);

	hlist_add_head(&req->hash_node, &table->hbs[index].list);
}

static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (req->flags & REQ_F_HASH_LOCKED) {
		/*
		 * ->cancel_table_locked is protected by ->uring_lock in
		 * contrast to per bucket spinlocks. Likely, tctx_task_work()
		 * already grabbed the mutex for us, but there is a chance it
		 * failed.
		 */
		io_tw_lock(ctx, locked);
		hash_del(&req->hash_node);
		req->flags &= ~REQ_F_HASH_LOCKED;
	} else {
		io_poll_req_delete(req, ctx);
	}
}

static void io_init_poll_iocb(struct io_poll *poll, __poll_t events,
			      wait_queue_func_t wake_func)
{
	poll->head = NULL;
#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
	/* mask in events that we always want/need */
	poll->events = events | IO_POLL_UNMASK;
	INIT_LIST_HEAD(&poll->wait.entry);
	init_waitqueue_func_entry(&poll->wait, wake_func);
}

static inline void io_poll_remove_entry(struct io_poll *poll)
{
	struct wait_queue_head *head = smp_load_acquire(&poll->head);

	if (head) {
		spin_lock_irq(&head->lock);
		list_del_init(&poll->wait.entry);
		poll->head = NULL;
		spin_unlock_irq(&head->lock);
	}
}

static void io_poll_remove_entries(struct io_kiocb *req)
{
	/*
	 * Nothing to do if neither of those flags are set. Avoid dipping
	 * into the poll/apoll/double cachelines if we can.
	 */
	if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
		return;

	/*
	 * While we hold the waitqueue lock and the waitqueue is nonempty,
	 * wake_up_pollfree() will wait for us.  However, taking the waitqueue
	 * lock in the first place can race with the waitqueue being freed.
	 *
	 * We solve this as eventpoll does: by taking advantage of the fact that
	 * all users of wake_up_pollfree() will RCU-delay the actual free.  If
	 * we enter rcu_read_lock() and see that the pointer to the queue is
	 * non-NULL, we can then lock it without the memory being freed out from
	 * under us.
	 *
	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
	 * case the caller deletes the entry from the queue, leaving it empty.
	 * In that case, only RCU prevents the queue memory from being freed.
	 */
	rcu_read_lock();
	if (req->flags & REQ_F_SINGLE_POLL)
		io_poll_remove_entry(io_poll_get_single(req));
	if (req->flags & REQ_F_DOUBLE_POLL)
		io_poll_remove_entry(io_poll_get_double(req));
	rcu_read_unlock();
}

enum {
	IOU_POLL_DONE = 0,
	IOU_POLL_NO_ACTION = 1,
	IOU_POLL_REMOVE_POLL_USE_RES = 2,
};

/*
 * All poll tw should go through this. Checks for poll events, manages
 * references, does rewait, etc.
 *
 * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action
 * needs to be taken, which happens on a spurious wakeup or when a multishot
 * CQE has already been served.
 * IOU_POLL_DONE when it's done with the request, in which case the mask is
 * stored in req->cqe.res.
 * IOU_POLL_REMOVE_POLL_USE_RES indicates that the multishot poll should be
 * removed and that the result is stored in req->cqe.
 */
static int io_poll_check_events(struct io_kiocb *req, bool *locked)
{
	struct io_ring_ctx *ctx = req->ctx;
	int v, ret;

	/* req->task == current here, checking PF_EXITING is safe */
	if (unlikely(req->task->flags & PF_EXITING))
		return -ECANCELED;

	do {
		v = atomic_read(&req->poll_refs);

		/* tw handler should be the owner, and so have some references */
		if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
			return IOU_POLL_DONE;
		if (v & IO_POLL_CANCEL_FLAG)
			return -ECANCELED;

		/* the mask was stashed in __io_poll_execute */
		if (!req->cqe.res) {
			struct poll_table_struct pt = { ._key = req->apoll_events };
			req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
		}

		if (unlikely(!req->cqe.res))
			continue;
		if (req->apoll_events & EPOLLONESHOT)
			return IOU_POLL_DONE;

		/* multishot, just fill a CQE and proceed */
		if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
			__poll_t mask = mangle_poll(req->cqe.res &
						    req->apoll_events);

			if (!io_post_aux_cqe(ctx, req->cqe.user_data,
					     mask, IORING_CQE_F_MORE, false)) {
				io_req_set_res(req, mask, 0);
				return IOU_POLL_REMOVE_POLL_USE_RES;
			}
		} else {
			ret = io_poll_issue(req, locked);
			if (ret == IOU_STOP_MULTISHOT)
				return IOU_POLL_REMOVE_POLL_USE_RES;
			if (ret < 0)
				return ret;
		}

		/*
		 * Release all references, retry if someone tried to restart
		 * task_work while we were executing it.
		 */
	} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));

	return IOU_POLL_NO_ACTION;
}

static void io_poll_task_func(struct io_kiocb *req, bool *locked)
{
	int ret;

	ret = io_poll_check_events(req, locked);
	if (ret == IOU_POLL_NO_ACTION)
		return;

	if (ret == IOU_POLL_DONE) {
		struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);

		req->cqe.res = mangle_poll(req->cqe.res & poll->events);
	} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
		req->cqe.res = ret;
		req_set_fail(req);
	}

	io_poll_remove_entries(req);
	io_poll_tw_hash_eject(req, locked);

	io_req_set_res(req, req->cqe.res, 0);
	io_req_task_complete(req, locked);
}

static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
{
	int ret;

	ret = io_poll_check_events(req, locked);
	if (ret == IOU_POLL_NO_ACTION)
		return;

	io_poll_remove_entries(req);
	io_poll_tw_hash_eject(req, locked);

	if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
		io_req_complete_post(req);
	else if (ret == IOU_POLL_DONE)
		io_req_task_submit(req, locked);
	else
		io_req_complete_failed(req, ret);
}

static void __io_poll_execute(struct io_kiocb *req, int mask)
{
	io_req_set_res(req, mask, 0);
	/*
	 * This is useful for poll that is armed on behalf of another
	 * request, and where the wakeup path could be on a different
	 * CPU. We want to avoid pulling in req->apoll->events for that
	 * case.
	 */
	if (req->opcode == IORING_OP_POLL_ADD)
		req->io_task_work.func = io_poll_task_func;
	else
		req->io_task_work.func = io_apoll_task_func;

	trace_io_uring_task_add(req, mask);
	io_req_task_work_add(req);
}

static inline void io_poll_execute(struct io_kiocb *req, int res)
{
	if (io_poll_get_ownership(req))
		__io_poll_execute(req, res);
}

static void io_poll_cancel_req(struct io_kiocb *req)
{
	io_poll_mark_cancelled(req);
	/* kick tw, which should complete the request */
	io_poll_execute(req, 0);
}

#define IO_ASYNC_POLL_COMMON	(EPOLLONESHOT | EPOLLPRI)

static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll)
{
	io_poll_mark_cancelled(req);
	/* we have to kick tw in case it's not already queued */
	io_poll_execute(req, 0);

	/*
	 * If the waitqueue is being freed early but someone already
	 * holds ownership over it, we have to tear down the request as
	 * best we can. That means immediately removing the request from
	 * its waitqueue and preventing all further accesses to the
	 * waitqueue via the request.
	 */
	list_del_init(&poll->wait.entry);

	/*
	 * Careful: this *must* be the last step, since as soon
	 * as poll->head is NULL'ed out, the request can be
	 * completed and freed, since io_poll_remove_entries()
	 * will no longer need to take the waitqueue lock.
	 */
	smp_store_release(&poll->head, NULL);
	return 1;
}

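/*
 * Waitqueue wakeup callback: called under the waitqueue lock, it grabs
 * ownership of the request if possible and punts completion handling to
 * task_work via __io_poll_execute().
 */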
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
			void *key)
{
	struct io_kiocb *req = wqe_to_req(wait);
	struct io_poll *poll = container_of(wait, struct io_poll, wait);
	__poll_t mask = key_to_poll(key);

	if (unlikely(mask & POLLFREE))
		return io_pollfree_wake(req, poll);

	/* for instances that support it check for an event match first */
	if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON)))
		return 0;

	if (io_poll_get_ownership(req)) {
		/* optional, saves extra locking for removal in tw handler */
		if (mask && poll->events & EPOLLONESHOT) {
			list_del_init(&poll->wait.entry);
			poll->head = NULL;
			if (wqe_is_double(wait))
				req->flags &= ~REQ_F_DOUBLE_POLL;
			else
				req->flags &= ~REQ_F_SINGLE_POLL;
		}
		__io_poll_execute(req, mask);
	}
	return 1;
}

/* fails only when polling is already completing via the first poll entry */
static bool io_poll_double_prepare(struct io_kiocb *req)
{
	struct wait_queue_head *head;
	struct io_poll *poll = io_poll_get_single(req);

	/* head is RCU protected, see io_poll_remove_entries() comments */
	rcu_read_lock();
	head = smp_load_acquire(&poll->head);
	/*
	 * poll arm might not hold ownership and so race for req->flags with
	 * io_poll_wake(). There is only one poll entry queued, serialise with
	 * it by taking its head lock. As we're still arming, the tw handler
	 * is not going to be run, so there are no races with it.
	 */
	if (head) {
		spin_lock_irq(&head->lock);
		req->flags |= REQ_F_DOUBLE_POLL;
		if (req->opcode == IORING_OP_POLL_ADD)
			req->flags |= REQ_F_ASYNC_DATA;
		spin_unlock_irq(&head->lock);
	}
	rcu_read_unlock();
	return !!head;
}

static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
			    struct wait_queue_head *head,
			    struct io_poll **poll_ptr)
{
	struct io_kiocb *req = pt->req;
	unsigned long wqe_private = (unsigned long) req;

	/*
	 * The file being polled uses multiple waitqueues for poll handling
	 * (e.g. one for read, one for write). Setup a separate io_poll
	 * if this happens.
	 */
	if (unlikely(pt->nr_entries)) {
		struct io_poll *first = poll;

		/* double add on the same waitqueue head, ignore */
		if (first->head == head)
			return;
		/* already have a 2nd entry, fail a third attempt */
		if (*poll_ptr) {
			if ((*poll_ptr)->head == head)
				return;
			pt->error = -EINVAL;
			return;
		}

		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
		if (!poll) {
			pt->error = -ENOMEM;
			return;
		}

		/* mark as double wq entry */
		wqe_private |= IO_WQE_F_DOUBLE;
		io_init_poll_iocb(poll, first->events, first->wait.func);
		if (!io_poll_double_prepare(req)) {
			/* the request is completing, just back off */
			kfree(poll);
			return;
		}
		*poll_ptr = poll;
	} else {
		/* fine to modify, there is no poll queued to race with us */
		req->flags |= REQ_F_SINGLE_POLL;
	}

	pt->nr_entries++;
	poll->head = head;
	poll->wait.private = (void *) wqe_private;

	if (poll->events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(head, &poll->wait);
	else
		add_wait_queue(head, &poll->wait);
}

static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
			       struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
	struct io_poll *poll = io_kiocb_to_cmd(pt->req, struct io_poll);

	__io_queue_proc(poll, pt, head,
			(struct io_poll **) &pt->req->async_data);
}

static bool io_poll_can_finish_inline(struct io_kiocb *req,
				      struct io_poll_table *pt)
{
	return pt->owning || io_poll_get_ownership(req);
}

/*
 * Returns 0 when it's handed over for polling. The caller owns the request if
 * it returns non-zero, but otherwise should not touch it. Negative values
 * contain an error code. When the result is >0, the polling has completed
 * inline and ipt.result_mask is set to the mask.
 */
static int __io_arm_poll_handler(struct io_kiocb *req,
				 struct io_poll *poll,
				 struct io_poll_table *ipt, __poll_t mask,
				 unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	int v;

	INIT_HLIST_NODE(&req->hash_node);
	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
	io_init_poll_iocb(poll, mask, io_poll_wake);
	poll->file = req->file;
	req->apoll_events = poll->events;

	ipt->pt._key = mask;
	ipt->req = req;
	ipt->error = 0;
	ipt->nr_entries = 0;
	/*
	 * Polling is either completed here or via task_work, so if we're in the
	 * task context we're naturally serialised with tw by virtue of running
	 * the same task. When it's io-wq, take the ownership to prevent tw
	 * from running. However, when we're in the task context, skip taking
	 * it as an optimisation.
	 *
	 * Note: even though the request won't be completed/freed, without
	 * ownership we still can race with io_poll_wake().
	 * io_poll_can_finish_inline() tries to deal with that.
	 */
	ipt->owning = issue_flags & IO_URING_F_UNLOCKED;
	atomic_set(&req->poll_refs, (int)ipt->owning);

	/* io-wq doesn't hold uring_lock */
	if (issue_flags & IO_URING_F_UNLOCKED)
		req->flags &= ~REQ_F_HASH_LOCKED;

	mask = vfs_poll(req->file, &ipt->pt) & poll->events;

	if (unlikely(ipt->error || !ipt->nr_entries)) {
		io_poll_remove_entries(req);

		if (!io_poll_can_finish_inline(req, ipt)) {
			io_poll_mark_cancelled(req);
			return 0;
		} else if (mask && (poll->events & EPOLLET)) {
			ipt->result_mask = mask;
			return 1;
		}
		return ipt->error ?: -EINVAL;
	}

	if (mask &&
	   ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) {
		if (!io_poll_can_finish_inline(req, ipt))
			return 0;
		io_poll_remove_entries(req);
		ipt->result_mask = mask;
		/* no one else has access to the req, forget about the ref */
		return 1;
	}

	if (req->flags & REQ_F_HASH_LOCKED)
		io_poll_req_insert_locked(req);
	else
		io_poll_req_insert(req);

	if (mask && (poll->events & EPOLLET) &&
	    io_poll_can_finish_inline(req, ipt)) {
		__io_poll_execute(req, mask);
		return 0;
	}

	if (ipt->owning) {
		/*
		 * Release ownership. If someone tried to queue a tw while it was
		 * locked, kick it off for them.
		 */
		v = atomic_dec_return(&req->poll_refs);
		if (unlikely(v & IO_POLL_REF_MASK))
			__io_poll_execute(req, 0);
	}
	return 0;
}

static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
			       struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
	struct async_poll *apoll = pt->req->apoll;

	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}

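/*
 * Get an async_poll container for the request: reuse the one already
 * attached if this req was polled before, pull one from the ctx cache when
 * the ring is locked, or fall back to a GFP_ATOMIC allocation.
 */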
static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_cache_entry *entry;
	struct async_poll *apoll;

	if (req->flags & REQ_F_POLLED) {
		apoll = req->apoll;
		kfree(apoll->double_poll);
	} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
		   (entry = io_alloc_cache_get(&ctx->apoll_cache)) != NULL) {
		apoll = container_of(entry, struct async_poll, cache);
	} else {
		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
		if (unlikely(!apoll))
			return NULL;
	}
	apoll->double_poll = NULL;
	req->apoll = apoll;
	return apoll;
}

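/*
 * Try to arm poll driven retry for a request that would otherwise block:
 * derive the poll mask from the opcode definition, allocate or reuse the
 * async_poll container and hand the request to __io_arm_poll_handler().
 */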
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	struct async_poll *apoll;
	struct io_poll_table ipt;
	__poll_t mask = POLLPRI | POLLERR | EPOLLET;
	int ret;

	/*
	 * apoll requests already grab the mutex to complete in the tw handler,
	 * so removal from the mutex-backed hash is free, use it by default.
	 */
	req->flags |= REQ_F_HASH_LOCKED;

	if (!def->pollin && !def->pollout)
		return IO_APOLL_ABORTED;
	if (!file_can_poll(req->file))
		return IO_APOLL_ABORTED;
	if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
		return IO_APOLL_ABORTED;
	if (!(req->flags & REQ_F_APOLL_MULTISHOT))
		mask |= EPOLLONESHOT;

	if (def->pollin) {
		mask |= EPOLLIN | EPOLLRDNORM;

		/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
		if (req->flags & REQ_F_CLEAR_POLLIN)
			mask &= ~EPOLLIN;
	} else {
		mask |= EPOLLOUT | EPOLLWRNORM;
	}
	if (def->poll_exclusive)
		mask |= EPOLLEXCLUSIVE;

	apoll = io_req_alloc_apoll(req, issue_flags);
	if (!apoll)
		return IO_APOLL_ABORTED;
	req->flags |= REQ_F_POLLED;
	ipt.pt._qproc = io_async_queue_proc;

	io_kbuf_recycle(req, issue_flags);

	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags);
	if (ret)
		return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED;
	trace_io_uring_poll_arm(req, mask, apoll->poll.events);
	return IO_APOLL_OK;
}

static __cold bool io_poll_remove_all_table(struct task_struct *tsk,
					    struct io_hash_table *table,
					    bool cancel_all)
{
	unsigned nr_buckets = 1U << table->hash_bits;
	struct hlist_node *tmp;
	struct io_kiocb *req;
	bool found = false;
	int i;

	for (i = 0; i < nr_buckets; i++) {
		struct io_hash_bucket *hb = &table->hbs[i];

		spin_lock(&hb->lock);
		hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) {
			if (io_match_task_safe(req, tsk, cancel_all)) {
				hlist_del_init(&req->hash_node);
				io_poll_cancel_req(req);
				found = true;
			}
		}
		spin_unlock(&hb->lock);
	}
	return found;
}

/*
 * Returns true if we found and killed one or more poll requests
 */
__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
			       bool cancel_all)
	__must_hold(&ctx->uring_lock)
{
	bool ret;

	ret = io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all);
	ret |= io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all);
	return ret;
}

static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
				     struct io_cancel_data *cd,
				     struct io_hash_table *table,
				     struct io_hash_bucket **out_bucket)
{
	struct io_kiocb *req;
	u32 index = hash_long(cd->data, table->hash_bits);
	struct io_hash_bucket *hb = &table->hbs[index];

	*out_bucket = NULL;

	spin_lock(&hb->lock);
	hlist_for_each_entry(req, &hb->list, hash_node) {
		if (cd->data != req->cqe.user_data)
			continue;
		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
			continue;
		if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
			if (cd->seq == req->work.cancel_seq)
				continue;
			req->work.cancel_seq = cd->seq;
		}
		*out_bucket = hb;
		return req;
	}
	spin_unlock(&hb->lock);
	return NULL;
}

static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
					  struct io_cancel_data *cd,
					  struct io_hash_table *table,
					  struct io_hash_bucket **out_bucket)
{
	unsigned nr_buckets = 1U << table->hash_bits;
	struct io_kiocb *req;
	int i;

	*out_bucket = NULL;

	for (i = 0; i < nr_buckets; i++) {
		struct io_hash_bucket *hb = &table->hbs[i];

		spin_lock(&hb->lock);
		hlist_for_each_entry(req, &hb->list, hash_node) {
			if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
			    req->file != cd->file)
				continue;
			if (cd->seq == req->work.cancel_seq)
				continue;
			req->work.cancel_seq = cd->seq;
			*out_bucket = hb;
			return req;
		}
		spin_unlock(&hb->lock);
	}
	return NULL;
}

static int io_poll_disarm(struct io_kiocb *req)
{
	if (!req)
		return -ENOENT;
	if (!io_poll_get_ownership(req))
		return -EALREADY;
	io_poll_remove_entries(req);
	hash_del(&req->hash_node);
	return 0;
}

static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
			    struct io_hash_table *table)
{
	struct io_hash_bucket *bucket;
	struct io_kiocb *req;

	if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY))
		req = io_poll_file_find(ctx, cd, table, &bucket);
	else
		req = io_poll_find(ctx, false, cd, table, &bucket);

	if (req)
		io_poll_cancel_req(req);
	if (bucket)
		spin_unlock(&bucket->lock);
	return req ? 0 : -ENOENT;
}

int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
		   unsigned issue_flags)
{
	int ret;

	ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table);
	if (ret != -ENOENT)
		return ret;

	io_ring_submit_lock(ctx, issue_flags);
	ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked);
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}

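/*
 * Translate the sqe poll mask into the internal epoll flags: single-shot
 * unless IORING_POLL_ADD_MULTI is set, edge-triggered unless
 * IORING_POLL_ADD_LEVEL is set.
 */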
static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
				     unsigned int flags)
{
	u32 events;

	events = READ_ONCE(sqe->poll32_events);
#ifdef __BIG_ENDIAN
	events = swahw32(events);
#endif
	if (!(flags & IORING_POLL_ADD_MULTI))
		events |= EPOLLONESHOT;
	if (!(flags & IORING_POLL_ADD_LEVEL))
		events |= EPOLLET;
	return demangle_poll(events) |
		(events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET));
}

int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_poll_update *upd = io_kiocb_to_cmd(req, struct io_poll_update);
	u32 flags;

	if (sqe->buf_index || sqe->splice_fd_in)
		return -EINVAL;
	flags = READ_ONCE(sqe->len);
	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
		      IORING_POLL_ADD_MULTI))
		return -EINVAL;
	/* meaningless without update */
	if (flags == IORING_POLL_ADD_MULTI)
		return -EINVAL;

	upd->old_user_data = READ_ONCE(sqe->addr);
	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;

	upd->new_user_data = READ_ONCE(sqe->off);
	if (!upd->update_user_data && upd->new_user_data)
		return -EINVAL;
	if (upd->update_events)
		upd->events = io_poll_parse_events(sqe, flags);
	else if (sqe->poll32_events)
		return -EINVAL;

	return 0;
}

int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
	u32 flags;

	if (sqe->buf_index || sqe->off || sqe->addr)
		return -EINVAL;
	flags = READ_ONCE(sqe->len);
	if (flags & ~IORING_POLL_ADD_MULTI)
		return -EINVAL;
	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
		return -EINVAL;

	poll->events = io_poll_parse_events(sqe, flags);
	return 0;
}

int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
	struct io_poll_table ipt;
	int ret;

	ipt.pt._qproc = io_poll_queue_proc;

	/*
	 * If sqpoll or single issuer, there is no contention for ->uring_lock
	 * and we'll end up holding it in tw handlers anyway.
	 */
	if (req->ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_SINGLE_ISSUER))
		req->flags |= REQ_F_HASH_LOCKED;

	ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags);
	if (ret > 0) {
		io_req_set_res(req, ipt.result_mask, 0);
		return IOU_OK;
	}
	return ret ?: IOU_ISSUE_SKIP_COMPLETE;
}

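/*
 * IORING_OP_POLL_REMOVE: cancel or update a previously armed poll request,
 * looking it up in both the plain and the uring_lock protected cancel tables.
 */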
int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update);
	struct io_cancel_data cd = { .data = poll_update->old_user_data, };
	struct io_ring_ctx *ctx = req->ctx;
	struct io_hash_bucket *bucket;
	struct io_kiocb *preq;
	int ret2, ret = 0;
	bool locked;

	preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
	ret2 = io_poll_disarm(preq);
	if (bucket)
		spin_unlock(&bucket->lock);
	if (!ret2)
		goto found;
	if (ret2 != -ENOENT) {
		ret = ret2;
		goto out;
	}

	io_ring_submit_lock(ctx, issue_flags);
	preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket);
	ret2 = io_poll_disarm(preq);
	if (bucket)
		spin_unlock(&bucket->lock);
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret2) {
		ret = ret2;
		goto out;
	}

found:
	if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) {
		ret = -EFAULT;
		goto out;
	}

	if (poll_update->update_events || poll_update->update_user_data) {
		/* only replace the event mask, keep the behavior flags */
		if (poll_update->update_events) {
			struct io_poll *poll = io_kiocb_to_cmd(preq, struct io_poll);

			poll->events &= ~0xffff;
			poll->events |= poll_update->events & 0xffff;
			poll->events |= IO_POLL_UNMASK;
		}
		if (poll_update->update_user_data)
			preq->cqe.user_data = poll_update->new_user_data;

		ret2 = io_poll_add(preq, issue_flags);
		/* successfully updated, don't complete poll request */
		if (!ret2 || ret2 == -EIOCBQUEUED)
			goto out;
	}

	req_set_fail(preq);
	io_req_set_res(preq, -ECANCELED, 0);
	locked = !(issue_flags & IO_URING_F_UNLOCKED);
	io_req_task_complete(preq, &locked);
out:
	if (ret < 0) {
		req_set_fail(req);
		return ret;
	}
	/* complete update request, we're done with it */
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

void io_apoll_cache_free(struct io_cache_entry *entry)
{
	kfree(container_of(entry, struct async_poll, cache));
}