diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 0efe4d784358284cf31e165e37f06f70c06e49f2..00689c12f6abb3f4b9c39f7d95905f843d7570dd 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -58,7 +58,7 @@ struct io_uring_task {
 
 	struct xarray		xa;
 	struct wait_queue_head	wait;
-	atomic_t		in_idle;
+	atomic_t		in_cancel;
 	atomic_t		inflight_tracked;
 	struct percpu_counter	inflight;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 1df68da89f998660d54bc4999430f6219000b8a2..fd1cc35a1c00b67cc02fb8cf1f4421f933c43419 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -719,7 +719,7 @@ static void io_put_task_remote(struct task_struct *task, int nr)
 	struct io_uring_task *tctx = task->io_uring;
 
 	percpu_counter_sub(&tctx->inflight, nr);
-	if (unlikely(atomic_read(&tctx->in_idle)))
+	if (unlikely(atomic_read(&tctx->in_cancel)))
 		wake_up(&tctx->wait);
 	put_task_struct_many(task, nr);
 }
@@ -1258,8 +1258,8 @@ void tctx_task_work(struct callback_head *cb)
 
 	ctx_flush_and_put(ctx, &uring_locked);
 
-	/* relaxed read is enough as only the task itself sets ->in_idle */
-	if (unlikely(atomic_read(&tctx->in_idle)))
+	/* relaxed read is enough as only the task itself sets ->in_cancel */
+	if (unlikely(atomic_read(&tctx->in_cancel)))
 		io_uring_drop_tctx_refs(current);
 
 	trace_io_uring_task_work_run(tctx, count, loops);
@@ -1285,17 +1285,15 @@ static void io_req_local_work_add(struct io_kiocb *req)
 
 	percpu_ref_get(&ctx->refs);
 
-	if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) {
-		percpu_ref_put(&ctx->refs);
-		return;
-	}
+	if (!llist_add(&req->io_task_work.node, &ctx->work_llist))
+		goto put_ref;
+
 	/* needed for the following wake up */
 	smp_mb__after_atomic();
 
-	if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
+	if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) {
 		io_move_task_work_from_local(ctx);
-		percpu_ref_put(&ctx->refs);
-		return;
+		goto put_ref;
 	}
 
 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
@@ -1305,6 +1303,8 @@ static void io_req_local_work_add(struct io_kiocb *req)
 
 	if (READ_ONCE(ctx->cq_waiting))
 		wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
+
+put_ref:
 	percpu_ref_put(&ctx->refs);
 }
 
@@ -1777,7 +1777,7 @@ int io_req_prep_async(struct io_kiocb *req)
 	const struct io_issue_def *def = &io_issue_defs[req->opcode];
 
 	/* assign early for deferred execution for non-fixed file */
-	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
+	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file)
 		req->file = io_file_get_normal(req, req->cqe.fd);
 	if (!cdef->prep_async)
 		return 0;
@@ -2937,12 +2937,12 @@ static __cold void io_tctx_exit_cb(struct callback_head *cb)
 
 	work = container_of(cb, struct io_tctx_exit, task_work);
 	/*
-	 * When @in_idle, we're in cancellation and it's racy to remove the
+	 * When @in_cancel, we're in cancellation and it's racy to remove the
 	 * node. It'll be removed by the end of cancellation, just ignore it.
 	 * tctx can be NULL if the queueing of this task_work raced with
 	 * work cancelation off the exec path.
 	 */
-	if (tctx && !atomic_read(&tctx->in_idle))
+	if (tctx && !atomic_read(&tctx->in_cancel))
 		io_uring_del_tctx_node((unsigned long)work->ctx);
 	complete(&work->completion);
 }
@@ -3210,7 +3210,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 	if (tctx->io_wq)
 		io_wq_exit_start(tctx->io_wq);
 
-	atomic_inc(&tctx->in_idle);
+	atomic_inc(&tctx->in_cancel);
 	do {
 		bool loop = false;
 
@@ -3261,9 +3261,9 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 	if (cancel_all) {
 		/*
 		 * We shouldn't run task_works after cancel, so just leave
-		 * ->in_idle set for normal exit.
+		 * ->in_cancel set for normal exit.
 		 */
-		atomic_dec(&tctx->in_idle);
+		atomic_dec(&tctx->in_cancel);
 		/* for exec all current's requests should be gone, kill tctx */
 		__io_uring_free(current);
 	}
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 4a6401080c1f88a74d76c4ce88bd15fee40f2528..3002dc827195916b160fce8376de009feb86f3c5 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -505,7 +505,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	}
 
 	pages = io_pin_pages(reg.ring_addr,
-			     struct_size(br, bufs, reg.ring_entries),
+			     flex_array_size(br, bufs, reg.ring_entries),
 			     &nr_pages);
 	if (IS_ERR(pages)) {
 		kfree(free_bl);
diff --git a/io_uring/net.c b/io_uring/net.c
index cbd4b725f58c98e5bc5bf88d5707db5c8302e071..b7f190ca528e6e259eb2b072d7a16aaba98848cb 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -567,7 +567,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sr->flags = READ_ONCE(sqe->ioprio);
 	if (sr->flags & ~(RECVMSG_FLAGS))
 		return -EINVAL;
-	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+	sr->msg_flags = READ_ONCE(sqe->msg_flags);
 	if (sr->msg_flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	if (sr->msg_flags & MSG_ERRQUEUE)
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 8339a92b451079b993359fee37aef449fba52d21..795facbd0e9f174ab4af7bbd1444b4eaedeb29da 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -51,6 +51,9 @@ struct io_poll_table {
 
 #define IO_WQE_F_DOUBLE		1
 
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+			void *key);
+
 static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe)
 {
 	unsigned long priv = (unsigned long)wqe->private;
@@ -164,15 +167,14 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
 	}
 }
 
-static void io_init_poll_iocb(struct io_poll *poll, __poll_t events,
-			      wait_queue_func_t wake_func)
+static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
 {
 	poll->head = NULL;
 #define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
 	/* mask in events that we always want/need */
 	poll->events = events | IO_POLL_UNMASK;
 	INIT_LIST_HEAD(&poll->wait.entry);
-	init_waitqueue_func_entry(&poll->wait, wake_func);
+	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 }
 
 static inline void io_poll_remove_entry(struct io_poll *poll)
@@ -508,7 +510,7 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
 
 		/* mark as double wq entry */
 		wqe_private |= IO_WQE_F_DOUBLE;
-		io_init_poll_iocb(poll, first->events, first->wait.func);
+		io_init_poll_iocb(poll, first->events);
 		if (!io_poll_double_prepare(req)) {
 			/* the request is completing, just back off */
 			kfree(poll);
@@ -569,7 +571,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 
 	INIT_HLIST_NODE(&req->hash_node);
 	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
-	io_init_poll_iocb(poll, mask, io_poll_wake);
+	io_init_poll_iocb(poll, mask);
 	poll->file = req->file;
 
 	req->apoll_events = poll->events;
@@ -650,6 +652,14 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
 	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
 }
 
+/*
+ * We can't reliably detect loops in repeated poll triggers and issue
+ * subsequently failing. But rather than fail these immediately, allow a
+ * certain amount of retries before we give up. Given that this condition
+ * should _rarely_ trigger even once, we should be fine with a larger value.
+ */
+#define APOLL_MAX_RETRY	128
+
 static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
 					     unsigned issue_flags)
 {
@@ -665,14 +675,18 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
 		if (entry == NULL)
 			goto alloc_apoll;
 		apoll = container_of(entry, struct async_poll, cache);
+		apoll->poll.retries = APOLL_MAX_RETRY;
 	} else {
 alloc_apoll:
 		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
 		if (unlikely(!apoll))
 			return NULL;
+		apoll->poll.retries = APOLL_MAX_RETRY;
 	}
 	apoll->double_poll = NULL;
 	req->apoll = apoll;
+	if (unlikely(!--apoll->poll.retries))
+		return NULL;
 	return apoll;
 }
 
@@ -694,8 +708,6 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 		return IO_APOLL_ABORTED;
 	if (!file_can_poll(req->file))
 		return IO_APOLL_ABORTED;
-	if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
-		return IO_APOLL_ABORTED;
 	if (!(req->flags & REQ_F_APOLL_MULTISHOT))
 		mask |= EPOLLONESHOT;
 
diff --git a/io_uring/poll.h b/io_uring/poll.h
index 5f3bae50fc81a03e2c79c3d2a15652a67c39a98f..b2393b403a2c21014d5c648cc244a707748cb02a 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -12,6 +12,7 @@ struct io_poll {
 	struct file			*file;
 	struct wait_queue_head		*head;
 	__poll_t			events;
+	int				retries;
 	struct wait_queue_entry		wait;
 };
 
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index a59fc02de5983c4f789e9cdfea3a1376f578ebe6..056f40946ff68ff8d2d72781c5bdc9e3e303cea4 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1162,14 +1162,17 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
 	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
 			      pages, vmas);
 	if (pret == nr_pages) {
+		struct file *file = vmas[0]->vm_file;
+
 		/* don't support file backed memory */
 		for (i = 0; i < nr_pages; i++) {
-			struct vm_area_struct *vma = vmas[i];
-
-			if (vma_is_shmem(vma))
+			if (vmas[i]->vm_file != file) {
+				ret = -EINVAL;
+				break;
+			}
+			if (!file)
 				continue;
-			if (vma->vm_file &&
-			    !is_file_hugepages(vma->vm_file)) {
+			if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
 				ret = -EOPNOTSUPP;
 				break;
 			}
@@ -1207,6 +1210,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	unsigned long off;
 	size_t size;
 	int ret, nr_pages, i;
+	struct folio *folio = NULL;
 
 	*pimu = ctx->dummy_ubuf;
 	if (!iov->iov_base)
@@ -1221,6 +1225,21 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
 
+	/* If it's a huge page, try to coalesce them into a single bvec entry */
+	if (nr_pages > 1) {
+		folio = page_folio(pages[0]);
+		for (i = 1; i < nr_pages; i++) {
+			if (page_folio(pages[i]) != folio) {
+				folio = NULL;
+				break;
+			}
+		}
+		if (folio) {
+			folio_put_refs(folio, nr_pages - 1);
+			nr_pages = 1;
+		}
+	}
+
 	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
 	if (!imu)
 		goto done;
@@ -1233,6 +1252,17 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 
 	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
 	size = iov->iov_len;
+	/* store original address for later verification */
+	imu->ubuf = (unsigned long) iov->iov_base;
+	imu->ubuf_end = imu->ubuf + iov->iov_len;
+	imu->nr_bvecs = nr_pages;
+	*pimu = imu;
+	ret = 0;
+
+	if (folio) {
+		bvec_set_page(&imu->bvec[0], pages[0], size, off);
+		goto done;
+	}
 	for (i = 0; i < nr_pages; i++) {
 		size_t vec_len;
 
@@ -1241,12 +1271,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		off = 0;
 		size -= vec_len;
 	}
-	/* store original address for later verification */
-	imu->ubuf = (unsigned long) iov->iov_base;
-	imu->ubuf_end = imu->ubuf + iov->iov_len;
-	imu->nr_bvecs = nr_pages;
-	*pimu = imu;
-	ret = 0;
 done:
 	if (ret)
 		kvfree(imu);
@@ -1335,7 +1359,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		return -EFAULT;
 
 	/*
-	 * May not be a start of buffer, set size appropriately
+	 * Might not be a start of buffer, set size appropriately
 	 * and advance us to the beginning.
 	 */
 	offset = buf_addr - imu->ubuf;
@@ -1361,7 +1385,15 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		const struct bio_vec *bvec = imu->bvec;
 
 		if (offset <= bvec->bv_len) {
-			iov_iter_advance(iter, offset);
+			/*
+			 * Note, huge pages buffers consists of one large
+			 * bvec entry and should always go this way. The other
+			 * branch doesn't expect non PAGE_SIZE'd chunks.
+			 */
+			iter->bvec = bvec;
+			iter->nr_segs = bvec->bv_len;
+			iter->count -= offset;
+			iter->iov_offset = offset;
 		} else {
 			unsigned long seg_skip;
 
diff --git a/io_uring/slist.h b/io_uring/slist.h
index f27601fa46607ba9ad431f6bc5ca57df24342e18..7c198a40d5f11d799e49441aefc9034cc71169f5 100644
--- a/io_uring/slist.h
+++ b/io_uring/slist.h
@@ -27,28 +27,6 @@ static inline void wq_list_add_after(struct io_wq_work_node *node,
 	list->last = node;
 }
 
-/**
- * wq_list_merge - merge the second list to the first one.
- * @list0: the first list
- * @list1: the second list
- * Return the first node after mergence.
- */
-static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0,
-						    struct io_wq_work_list *list1)
-{
-	struct io_wq_work_node *ret;
-
-	if (!list0->first) {
-		ret = list1->first;
-	} else {
-		ret = list0->first;
-		list0->last->next = list1->first;
-	}
-	INIT_WQ_LIST(list0);
-	INIT_WQ_LIST(list1);
-	return ret;
-}
-
 static inline void wq_list_add_tail(struct io_wq_work_node *node,
 				    struct io_wq_work_list *list)
 {
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 4324b1cf1f6afaf5e6afc740f8e1ee03e2afa719..3a8d1dd97e1b4bfd4f5476d809756b64ae1d5518 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -83,7 +83,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 
 	xa_init(&tctx->xa);
 	init_waitqueue_head(&tctx->wait);
-	atomic_set(&tctx->in_idle, 0);
+	atomic_set(&tctx->in_cancel, 0);
 	atomic_set(&tctx->inflight_tracked, 0);
 	task->io_uring = tctx;
 	init_llist_head(&tctx->task_list);