io_uring: drain next sqe instead of shadowing
Author:     Pavel Begunkov <asml.silence@gmail.com>
AuthorDate: Thu, 21 Nov 2019 08:54:28 +0000 (11:54 +0300)
Commit:     Jens Axboe <axboe@kernel.dk>
CommitDate: Tue, 26 Nov 2019 02:56:10 +0000 (19:56 -0700)
There's an issue with the shadow drain logic: we drop the completion
lock after deciding to defer a request, then re-grab it later and assume
the state is still the same. In the meantime, someone else completing a
request could have found and issued it. This can stall the queue, as a
shadow request gets inserted that nobody is ever going to drain.
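
Roughly, the race looks like this (an illustrative timeline, not code
taken verbatim from the old path):

    CPU A: io_queue_link_head()          CPU B: completion side
    ---------------------------          ----------------------
    io_req_defer(head) -> -EIOCBQUEUED
      (head now sits on ctx->defer_list,
       completion_lock is dropped)
                                         io_commit_cqring()
                                           io_get_deferred_req() -> head
                                           io_queue_async_work(head)
    spin_lock_irq(&ctx->completion_lock)
    list_add_tail(&shadow->list,
                  &ctx->defer_list)
      -> the shadow is queued behind a
         request that is already gone,
         and nothing will ever drain it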

Additionally, if we fail to allocate the shadow request, we simply
ignore the drain.

Instead of using a shadow request, defer the next request/link. This
also has the following advantages:

- removes semi-duplicated code
- doesn't allocate memory for shadows
- works better if only the head is marked for drain
- doesn't need complex synchronisation

On the flip side, it removes the shadow->seq ==
last_drain_in_link->seq optimization. That shouldn't be a common
case, and it can always be added back if needed.
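
For illustration, the user-visible submission pattern this handles could
look like the sketch below (liburing-style, with a made-up ring, fd and
iovecs; not part of this patch). The link member carries IOSQE_IO_DRAIN,
so the link head is marked REQ_F_DRAIN_LINK and the next independently
submitted sqe is drained via ctx->drain_next rather than via a shadow
request:

    /* hypothetical example: a linked pair of reads with drain, then a write */
    struct io_uring_sqe *sqe;

    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_readv(sqe, fd, &iov[0], 1, 0);
    sqe->flags |= IOSQE_IO_LINK;            /* head of the link */

    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_readv(sqe, fd, &iov[1], 1, 4096);
    sqe->flags |= IOSQE_IO_DRAIN;           /* drain applies to the whole link */

    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_writev(sqe, fd, &iov[2], 1, 0);
    /* gets REQ_F_IO_DRAIN via drain_next, so it still waits for the
     * link (and everything submitted before it) to complete */

    io_uring_submit(&ring);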

Fixes: 4fe2c963154c ("io_uring: add support for link with drain")
Cc: Jackie Liu <liuyun01@kylinos.cn>
Reported-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 27fefb52910c7f3262fefc710d26b6b05d146277..ca980c5878e97da876faa656ff8b96dea7355b87 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -186,6 +186,7 @@ struct io_ring_ctx {
                bool                    compat;
                bool                    account_mem;
                bool                    cq_overflow_flushed;
+               bool                    drain_next;
 
                /*
                 * Ring buffer of indices into array of io_uring_sqe, which is
@@ -346,7 +347,7 @@ struct io_kiocb {
 #define REQ_F_LINK             64      /* linked sqes */
 #define REQ_F_LINK_TIMEOUT     128     /* has linked timeout */
 #define REQ_F_FAIL_LINK                256     /* fail rest of links */
-#define REQ_F_SHADOW_DRAIN     512     /* link-drain shadow req */
+#define REQ_F_DRAIN_LINK       512     /* link should be fully drained */
 #define REQ_F_TIMEOUT          1024    /* timeout request */
 #define REQ_F_ISREG            2048    /* regular file */
 #define REQ_F_MUST_PUNT                4096    /* must be punted even for NONBLOCK */
@@ -620,11 +621,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
        __io_commit_cqring(ctx);
 
        while ((req = io_get_deferred_req(ctx)) != NULL) {
-               if (req->flags & REQ_F_SHADOW_DRAIN) {
-                       /* Just for drain, free it. */
-                       __io_free_req(req);
-                       continue;
-               }
                req->flags |= REQ_F_IO_DRAINED;
                io_queue_async_work(req);
        }
@@ -2973,6 +2969,12 @@ static void io_queue_sqe(struct io_kiocb *req)
 {
        int ret;
 
+       if (unlikely(req->ctx->drain_next)) {
+               req->flags |= REQ_F_IO_DRAIN;
+               req->ctx->drain_next = false;
+       }
+       req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
+
        ret = io_req_defer(req);
        if (ret) {
                if (ret != -EIOCBQUEUED) {
@@ -2985,57 +2987,16 @@ static void io_queue_sqe(struct io_kiocb *req)
                __io_queue_sqe(req);
 }
 
-static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
+static inline void io_queue_link_head(struct io_kiocb *req)
 {
-       int ret;
-       int need_submit = false;
-       struct io_ring_ctx *ctx = req->ctx;
-
        if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
-               ret = -ECANCELED;
-               goto err;
-       }
-       if (!shadow) {
+               io_cqring_add_event(req, -ECANCELED);
+               io_double_put_req(req);
+       } else
                io_queue_sqe(req);
-               return;
-       }
-
-       /*
-        * Mark the first IO in link list as DRAIN, let all the following
-        * IOs enter the defer list. all IO needs to be completed before link
-        * list.
-        */
-       req->flags |= REQ_F_IO_DRAIN;
-       ret = io_req_defer(req);
-       if (ret) {
-               if (ret != -EIOCBQUEUED) {
-err:
-                       io_cqring_add_event(req, ret);
-                       if (req->flags & REQ_F_LINK)
-                               req->flags |= REQ_F_FAIL_LINK;
-                       io_double_put_req(req);
-                       if (shadow)
-                               __io_free_req(shadow);
-                       return;
-               }
-       } else {
-               /*
-                * If ret == 0 means that all IOs in front of link io are
-                * running done. let's queue link head.
-                */
-               need_submit = true;
-       }
-
-       /* Insert shadow req to defer_list, blocking next IOs */
-       spin_lock_irq(&ctx->completion_lock);
-       trace_io_uring_defer(ctx, shadow, true);
-       list_add_tail(&shadow->list, &ctx->defer_list);
-       spin_unlock_irq(&ctx->completion_lock);
-
-       if (need_submit)
-               __io_queue_sqe(req);
 }
 
+
 #define SQE_VALID_FLAGS        (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
 
 static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
@@ -3072,6 +3033,9 @@ err_req:
                struct io_kiocb *prev = *link;
                struct io_uring_sqe *sqe_copy;
 
+               if (s->sqe->flags & IOSQE_IO_DRAIN)
+                       (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
+
                if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
                        ret = io_timeout_setup(req);
                        /* common setup allows offset being set, we don't */
@@ -3190,7 +3154,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 {
        struct io_submit_state state, *statep = NULL;
        struct io_kiocb *link = NULL;
-       struct io_kiocb *shadow_req = NULL;
        int i, submitted = 0;
        bool mm_fault = false;
 
@@ -3229,18 +3192,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 
                sqe_flags = req->submit.sqe->flags;
 
-               if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
-                       if (!shadow_req) {
-                               shadow_req = io_get_req(ctx, NULL);
-                               if (unlikely(!shadow_req))
-                                       goto out;
-                               shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
-                               refcount_dec(&shadow_req->refs);
-                       }
-                       shadow_req->sequence = req->submit.sequence;
-               }
-
-out:
                req->submit.ring_file = ring_file;
                req->submit.ring_fd = ring_fd;
                req->submit.has_user = *mm != NULL;
@@ -3256,14 +3207,13 @@ out:
                 * that's the end of the chain. Submit the previous link.
                 */
                if (!(sqe_flags & IOSQE_IO_LINK) && link) {
-                       io_queue_link_head(link, shadow_req);
+                       io_queue_link_head(link);
                        link = NULL;
-                       shadow_req = NULL;
                }
        }
 
        if (link)
-               io_queue_link_head(link, shadow_req);
+               io_queue_link_head(link);
        if (statep)
                io_submit_state_end(&state);
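
As a rough walkthrough of the resulting flow (function and flag names
are taken from the hunks above; the SQE sequence is a made-up example):

    /*
     * Submit:  A ->link B (B carries IOSQE_IO_DRAIN), then a standalone C.
     *
     * io_submit_sqe(B):      B is a link member with IOSQE_IO_DRAIN, so the
     *                        head A gets REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN.
     *
     * io_queue_link_head(A): calls io_queue_sqe(A); ctx->drain_next was
     *                        false, so A is drained only through its own
     *                        REQ_F_IO_DRAIN, and drain_next is then set
     *                        because A carries REQ_F_DRAIN_LINK.
     *
     * io_queue_sqe(C):       drain_next is set, so C gets REQ_F_IO_DRAIN and
     *                        drain_next goes back to false (C is not a drain
     *                        link). No shadow request is needed to keep C
     *                        ordered behind the link.
     */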