From 007d7c0ae64d3ed3d006f75b1f1e2ff839bf03dd Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 21 Nov 2019 11:54:28 +0300
Subject: [PATCH] io_uring: drain next sqe instead of shadowing

to #26323578

commit 1b4a51b6d03d21f55effbcf609ba5526d87d9e9d upstream.

There's an issue with the shadow drain logic in that we drop the
completion lock after deciding to defer a request, then re-grab it later
and assume that the state is still the same. In the mean time, someone
else completing a request could have found and issued it. This can cause
a stall in the queue, by having a shadow request inserted that nobody is
going to drain.

Additionally, if we fail allocating the shadow request, we simply ignore
the drain.

Instead of using a shadow request, defer the next request/link instead.
This also has the following advantages:
- removes semi-duplicated code
- doesn't allocate memory for shadows
- works better if only the head marked for drain
- doesn't need complex synchronisation

On the flip side, it removes the shadow->seq ==
last_drain_in_in_link->seq optimization. That shouldn't be a common
case, and can always be added back, if needed.

Fixes: 4fe2c963154c ("io_uring: add support for link with drain")
Cc: Jackie Liu
Reported-by: Jens Axboe
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
Signed-off-by: Joseph Qi
Acked-by: Xiaoguang Wang
---
 fs/io_uring.c | 86 +++++++++++------------------------------------------------
 1 file changed, 18 insertions(+), 68 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3d7b611f2cc1..acefe144eba2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -186,6 +186,7 @@ struct io_ring_ctx {
 		bool			compat;
 		bool			account_mem;
 		bool			cq_overflow_flushed;
+		bool			drain_next;
 
 		/*
 		 * Ring buffer of indices into array of io_uring_sqe, which is
@@ -348,7 +349,7 @@ struct io_kiocb {
 #define REQ_F_LINK		64	/* linked sqes */
 #define REQ_F_LINK_TIMEOUT	128	/* has linked timeout */
 #define REQ_F_FAIL_LINK		256	/* fail rest of links */
-#define REQ_F_SHADOW_DRAIN	512	/* link-drain shadow req */
+#define REQ_F_DRAIN_LINK	512	/* link should be fully drained */
 #define REQ_F_TIMEOUT		1024	/* timeout request */
 #define REQ_F_ISREG		2048	/* regular file */
 #define REQ_F_MUST_PUNT	4096	/* must be punted even for NONBLOCK */
@@ -621,11 +622,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
 	__io_commit_cqring(ctx);
 
 	while ((req = io_get_deferred_req(ctx)) != NULL) {
-		if (req->flags & REQ_F_SHADOW_DRAIN) {
-			/* Just for drain, free it. */
-			__io_free_req(req);
-			continue;
-		}
 		req->flags |= REQ_F_IO_DRAINED;
 		io_queue_async_work(req);
 	}
@@ -2975,6 +2971,12 @@ static void io_queue_sqe(struct io_kiocb *req)
 {
 	int ret;
 
+	if (unlikely(req->ctx->drain_next)) {
+		req->flags |= REQ_F_IO_DRAIN;
+		req->ctx->drain_next = false;
+	}
+	req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
+
 	ret = io_req_defer(req);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
@@ -2987,57 +2989,16 @@ static void io_queue_sqe(struct io_kiocb *req)
 	__io_queue_sqe(req);
 }
 
-static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
+static inline void io_queue_link_head(struct io_kiocb *req)
 {
-	int ret;
-	int need_submit = false;
-	struct io_ring_ctx *ctx = req->ctx;
-
 	if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
-		ret = -ECANCELED;
-		goto err;
-	}
-	if (!shadow) {
+		io_cqring_add_event(req, -ECANCELED);
+		io_double_put_req(req);
+	} else
 		io_queue_sqe(req);
-		return;
-	}
-
-	/*
-	 * Mark the first IO in link list as DRAIN, let all the following
-	 * IOs enter the defer list. all IO needs to be completed before link
-	 * list.
-	 */
-	req->flags |= REQ_F_IO_DRAIN;
-	ret = io_req_defer(req);
-	if (ret) {
-		if (ret != -EIOCBQUEUED) {
-err:
-			io_cqring_add_event(req, ret);
-			if (req->flags & REQ_F_LINK)
-				req->flags |= REQ_F_FAIL_LINK;
-			io_double_put_req(req);
-			if (shadow)
-				__io_free_req(shadow);
-			return;
-		}
-	} else {
-		/*
-		 * If ret == 0 means that all IOs in front of link io are
-		 * running done. let's queue link head.
-		 */
-		need_submit = true;
-	}
-
-	/* Insert shadow req to defer_list, blocking next IOs */
-	spin_lock_irq(&ctx->completion_lock);
-	trace_io_uring_defer(ctx, shadow, true);
-	list_add_tail(&shadow->list, &ctx->defer_list);
-	spin_unlock_irq(&ctx->completion_lock);
-
-	if (need_submit)
-		__io_queue_sqe(req);
 }
 
+
 #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
 
 static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
@@ -3074,6 +3035,9 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 		struct io_kiocb *prev = *link;
 		struct io_uring_sqe *sqe_copy;
 
+		if (s->sqe->flags & IOSQE_IO_DRAIN)
+			(*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
+
 		if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
 			ret = io_timeout_setup(req);
 			/* common setup allows offset being set, we don't */
@@ -3192,7 +3156,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 {
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
-	struct io_kiocb *shadow_req = NULL;
 	int i, submitted = 0;
 	bool mm_fault = false;
 
@@ -3231,18 +3194,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 
 		sqe_flags = req->submit.sqe->flags;
 
-		if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
-			if (!shadow_req) {
-				shadow_req = io_get_req(ctx, NULL);
-				if (unlikely(!shadow_req))
-					goto out;
-				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
-				refcount_dec(&shadow_req->refs);
-			}
-			shadow_req->sequence = req->submit.sequence;
-		}
-
-out:
 		req->submit.ring_file = ring_file;
 		req->submit.ring_fd = ring_fd;
 		req->submit.has_user = *mm != NULL;
@@ -3258,14 +3209,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		 * that's the end of the chain. Submit the previous link.
 		 */
 		if (!(sqe_flags & IOSQE_IO_LINK) && link) {
-			io_queue_link_head(link, shadow_req);
+			io_queue_link_head(link);
 			link = NULL;
-			shadow_req = NULL;
 		}
 	}
 
 	if (link)
-		io_queue_link_head(link, shadow_req);
+		io_queue_link_head(link);
 	if (statep)
 		io_submit_state_end(&state);
 
-- 
GitLab
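
For context on the user-visible behaviour the patch touches, the sketch below is a minimal userspace illustration (not part of the patch): it submits a two-sqe link whose head carries IOSQE_IO_DRAIN, i.e. the "only the head marked for drain" case the commit message mentions, which the kernel now handles by deferring the next request instead of queuing a shadow one. It assumes liburing is installed and uses NOP requests as stand-ins for real I/O.

```c
/*
 * Illustrative sketch only: a drained link head submitted via liburing.
 * Build with: gcc drain_link.c -luring
 */
#include <stdio.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int i, ret;

	ret = io_uring_queue_init(8, &ring, 0);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %d\n", ret);
		return 1;
	}

	/* head of the link: drained, so it waits for all prior sqes */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_DRAIN | IOSQE_IO_LINK);

	/* tail of the link: issued only after the head completes */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		fprintf(stderr, "submit: %d\n", ret);
		return 1;
	}

	/* reap both completions */
	for (i = 0; i < 2; i++) {
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret < 0)
			break;
		printf("cqe %d: res=%d\n", i, cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	return 0;
}
```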