blk-mq: replace timeout synchronization with a RCU and generation based scheme

author Tejun Heo <tj@kernel.org>

Tue, 9 Jan 2018 16:29:48 +0000 (08:29 -0800)

committer Jens Axboe <axboe@kernel.dk>

Tue, 9 Jan 2018 16:31:15 +0000 (09:31 -0700)
author Tejun Heo <tj@kernel.org>
Tue, 9 Jan 2018 16:29:48 +0000 (08:29 -0800)
committer Jens Axboe <axboe@kernel.dk>
Tue, 9 Jan 2018 16:31:15 +0000 (09:31 -0700)
diff --git a/block/blk-core.c b/block/blk-core.c

index 2e0d041e2dafb71f9dd833b73f9d5c4a9a4f4045..f843ae4f858de3b39da3d5bf5a1d55355b57e723 100644 (file)
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
         rq->start_time = jiffies;
         set_start_time_ns(rq);
         rq->part = NULL;
+       seqcount_init(&rq->gstate_seq);
+       u64_stats_init(&rq->aborted_gstate_sync);
  }
  EXPORT_SYMBOL(blk_rq_init);
  
diff --git a/block/blk-mq.c b/block/blk-mq.c

index f5e57c80a82be7406c1b9a6b4becf7fd52246c9f..156203876c8c894c208bbdf071a872df84f59fc5 100644 (file)
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -483,6 +483,7 @@ void blk_mq_free_request(struct request *rq)
         if (blk_rq_rl(rq))
                 blk_put_rl(blk_rq_rl(rq));
  
+       blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
         clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
         if (rq->tag != -1)
@@ -530,6 +531,8 @@ static void __blk_mq_complete_request(struct request *rq)
         bool shared = false;
         int cpu;
  
+       WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
+
         if (rq->internal_tag != -1)
                 blk_mq_sched_completed_request(rq);
         if (rq->rq_flags & RQF_STATS) {
@@ -573,6 +576,36 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
                 *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
  }
  
+static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
+{
+       unsigned long flags;
+
+       /*
+        * blk_mq_rq_aborted_gstate() is used from the completion path and
+        * can thus be called from irq context.  u64_stats_fetch in the
+        * middle of update on the same CPU leads to lockup.  Disable irq
+        * while updating.
+        */
+       local_irq_save(flags);
+       u64_stats_update_begin(&rq->aborted_gstate_sync);
+       rq->aborted_gstate = gstate;
+       u64_stats_update_end(&rq->aborted_gstate_sync);
+       local_irq_restore(flags);
+}
+
+static u64 blk_mq_rq_aborted_gstate(struct request *rq)
+{
+       unsigned int start;
+       u64 aborted_gstate;
+
+       do {
+               start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
+               aborted_gstate = rq->aborted_gstate;
+       } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
+
+       return aborted_gstate;
+}
+
  /**
   * blk_mq_complete_request - end I/O on a request
   * @rq:                the request being processed
@@ -590,8 +623,20 @@ void blk_mq_complete_request(struct request *rq)
         if (unlikely(blk_should_fake_timeout(q)))
                 return;
  
+       /*
+        * If @rq->aborted_gstate equals the current instance, timeout is
+        * claiming @rq and we lost.  This is synchronized through
+        * hctx_lock().  See blk_mq_timeout_work() for details.
+        *
+        * Completion path never blocks and we can directly use RCU here
+        * instead of hctx_lock() which can be either RCU or SRCU.
+        * However, that would complicate paths which want to synchronize
+        * against us.  Let's stay in sync with the issue path so that
+        * hctx_lock() covers both issue and completion paths.
+        */
         hctx_lock(hctx, &srcu_idx);
-       if (!blk_mark_rq_complete(rq))
+       if (blk_mq_rq_aborted_gstate(rq) != rq->gstate &&
+           !blk_mark_rq_complete(rq))
                 __blk_mq_complete_request(rq);
         hctx_unlock(hctx, srcu_idx);
  }
@@ -617,34 +662,32 @@ void blk_mq_start_request(struct request *rq)
                 wbt_issue(q->rq_wb, &rq->issue_stat);
         }
  
-       blk_add_timer(rq);
-
+       WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
         WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
  
         /*
-        * Mark us as started and clear complete. Complete might have been
-        * set if requeue raced with timeout, which then marked it as
-        * complete. So be sure to clear complete again when we start
-        * the request, otherwise we'll ignore the completion event.
+        * Mark @rq in-flight which also advances the generation number,
+        * and register for timeout.  Protect with a seqcount to allow the
+        * timeout path to read both @rq->gstate and @rq->deadline
+        * coherently.
          *
-        * Ensure that ->deadline is visible before we set STARTED, such that
-        * blk_mq_check_expired() is guaranteed to observe our ->deadline when
-        * it observes STARTED.
+        * This is the only place where a request is marked in-flight.  If
+        * the timeout path reads an in-flight @rq->gstate, the
+        * @rq->deadline it reads together under @rq->gstate_seq is
+        * guaranteed to be the matching one.
          */
-       smp_wmb();
+       preempt_disable();
+       write_seqcount_begin(&rq->gstate_seq);
+
+       blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
+       blk_add_timer(rq);
+
+       write_seqcount_end(&rq->gstate_seq);
+       preempt_enable();
+
         set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-       if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
-               /*
-                * Coherence order guarantees these consecutive stores to a
-                * single variable propagate in the specified order. Thus the
-                * clear_bit() is ordered _after_ the set bit. See
-                * blk_mq_check_expired().
-                *
-                * (the bits must be part of the same byte for this to be
-                * true).
-                */
+       if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
                 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
-       }
  
         if (q->dma_drain_size && blk_rq_bytes(rq)) {
                 /*
@@ -677,6 +720,7 @@ static void __blk_mq_requeue_request(struct request *rq)
         blk_mq_sched_requeue_request(rq);
  
         if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+               blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
                 if (q->dma_drain_size && blk_rq_bytes(rq))
                         rq->nr_phys_segments--;
         }
@@ -774,6 +818,7 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq);
  struct blk_mq_timeout_data {
         unsigned long next;
         unsigned int next_set;
+       unsigned int nr_expired;
  };
  
  void blk_mq_rq_timed_out(struct request *req, bool reserved)
@@ -801,6 +846,12 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
                 __blk_mq_complete_request(req);
                 break;
         case BLK_EH_RESET_TIMER:
+               /*
+                * As nothing prevents a completion from happening while
+                * ->aborted_gstate is set, this may lead to ignored
+                * completions and further spurious timeouts.
+                */
+               blk_mq_rq_update_aborted_gstate(req, 0);
                 blk_add_timer(req);
                 blk_clear_rq_complete(req);
                 break;
@@ -816,50 +867,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
                 struct request *rq, void *priv, bool reserved)
  {
         struct blk_mq_timeout_data *data = priv;
-       unsigned long deadline;
+       unsigned long gstate, deadline;
+       int start;
+
+       might_sleep();
  
         if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
                 return;
  
-       /*
-        * Ensures that if we see STARTED we must also see our
-        * up-to-date deadline, see blk_mq_start_request().
-        */
-       smp_rmb();
-
-       deadline = READ_ONCE(rq->deadline);
+       /* read coherent snapshots of @rq->gstate and @rq->deadline */
+       while (true) {
+               start = read_seqcount_begin(&rq->gstate_seq);
+               gstate = READ_ONCE(rq->gstate);
+               deadline = rq->deadline;
+               if (!read_seqcount_retry(&rq->gstate_seq, start))
+                       break;
+               cond_resched();
+       }
  
-       /*
-        * The rq being checked may have been freed and reallocated
-        * out already here, we avoid this race by checking rq->deadline
-        * and REQ_ATOM_COMPLETE flag together:
-        *
-        * - if rq->deadline is observed as new value because of
-        *   reusing, the rq won't be timed out because of timing.
-        * - if rq->deadline is observed as previous value,
-        *   REQ_ATOM_COMPLETE flag won't be cleared in reuse path
-        *   because we put a barrier between setting rq->deadline
-        *   and clearing the flag in blk_mq_start_request(), so
-        *   this rq won't be timed out too.
-        */
-       if (time_after_eq(jiffies, deadline)) {
-               if (!blk_mark_rq_complete(rq)) {
-                       /*
-                        * Again coherence order ensures that consecutive reads
-                        * from the same variable must be in that order. This
-                        * ensures that if we see COMPLETE clear, we must then
-                        * see STARTED set and we'll ignore this timeout.
-                        *
-                        * (There's also the MB implied by the test_and_clear())
-                        */
-                       blk_mq_rq_timed_out(rq, reserved);
-               }
+       /* if in-flight && overdue, mark for abortion */
+       if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
+           time_after_eq(jiffies, deadline)) {
+               blk_mq_rq_update_aborted_gstate(rq, gstate);
+               data->nr_expired++;
+               hctx->nr_expired++;
         } else if (!data->next_set || time_after(data->next, deadline)) {
                 data->next = deadline;
                 data->next_set = 1;
         }
  }
  
+static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+               struct request *rq, void *priv, bool reserved)
+{
+       /*
+        * We marked @rq->aborted_gstate and waited for RCU.  If there were
+        * completions that we lost to, they would have finished and
+        * updated @rq->gstate by now; otherwise, the completion path is
+        * now guaranteed to see @rq->aborted_gstate and yield.  If
+        * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+        */
+       if (READ_ONCE(rq->gstate) == rq->aborted_gstate &&
+           !blk_mark_rq_complete(rq))
+               blk_mq_rq_timed_out(rq, reserved);
+}
+
  static void blk_mq_timeout_work(struct work_struct *work)
  {
         struct request_queue *q =
@@ -867,7 +919,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
         struct blk_mq_timeout_data data = {
                 .next           = 0,
                 .next_set       = 0,
+               .nr_expired     = 0,
         };
+       struct blk_mq_hw_ctx *hctx;
         int i;
  
         /* A deadlock might occur if a request is stuck requiring a
@@ -886,14 +940,40 @@ static void blk_mq_timeout_work(struct work_struct *work)
         if (!percpu_ref_tryget(&q->q_usage_counter))
                 return;
  
+       /* scan for the expired ones and set their ->aborted_gstate */
         blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
  
+       if (data.nr_expired) {
+               bool has_rcu = false;
+
+               /*
+                * Wait till everyone sees ->aborted_gstate.  The
+                * sequential waits for SRCUs aren't ideal.  If this ever
+                * becomes a problem, we can add per-hw_ctx rcu_head and
+                * wait in parallel.
+                */
+               queue_for_each_hw_ctx(q, hctx, i) {
+                       if (!hctx->nr_expired)
+                               continue;
+
+                       if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+                               has_rcu = true;
+                       else
+                               synchronize_srcu(hctx->queue_rq_srcu);
+
+                       hctx->nr_expired = 0;
+               }
+               if (has_rcu)
+                       synchronize_rcu();
+
+               /* terminate the ones we won */
+               blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+       }
+
         if (data.next_set) {
                 data.next = blk_rq_timeout(round_jiffies_up(data.next));
                 mod_timer(&q->timeout, data.next);
         } else {
-               struct blk_mq_hw_ctx *hctx;
-
                 queue_for_each_hw_ctx(q, hctx, i) {
                         /* the hctx may be unmapped, so check it here */
                         if (blk_mq_hw_queue_mapped(hctx))
@@ -1893,6 +1973,22 @@ static size_t order_to_size(unsigned int order)
         return (size_t)PAGE_SIZE << order;
  }
  
+static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+                              unsigned int hctx_idx, int node)
+{
+       int ret;
+
+       if (set->ops->init_request) {
+               ret = set->ops->init_request(set, rq, hctx_idx, node);
+               if (ret)
+                       return ret;
+       }
+
+       seqcount_init(&rq->gstate_seq);
+       u64_stats_init(&rq->aborted_gstate_sync);
+       return 0;
+}
+
  int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                      unsigned int hctx_idx, unsigned int depth)
  {
@@ -1954,12 +2050,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                         struct request *rq = p;
  
                         tags->static_rqs[i] = rq;
-                       if (set->ops->init_request) {
-                               if (set->ops->init_request(set, rq, hctx_idx,
-                                               node)) {
-                                       tags->static_rqs[i] = NULL;
-                                       goto fail;
-                               }
+                       if (blk_mq_init_request(set, rq, hctx_idx, node)) {
+                               tags->static_rqs[i] = NULL;
+                               goto fail;
                         }
  
                         p += rq_size;
@@ -2099,9 +2192,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
         if (!hctx->fq)
                 goto sched_exit_hctx;
  
-       if (set->ops->init_request &&
-           set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
-                                  node))
+       if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
                 goto free_fq;
  
         if (hctx->flags & BLK_MQ_F_BLOCKING)
@@ -3019,12 +3110,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
  
  static int __init blk_mq_init(void)
  {
-       /*
-        * See comment in block/blk.h rq_atomic_flags enum
-        */
-       BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
-                       (REQ_ATOM_COMPLETE / BITS_PER_BYTE));
-
         cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                 blk_mq_hctx_notify_dead);
         return 0;
diff --git a/block/blk-mq.h b/block/blk-mq.h

index 6c7c3ff5bf627d3e36a8e1bf1feca66ff00ac74d..cf01f6f8c73dc4558a0fab9833faa65b9467d414 100644 (file)
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -27,6 +27,19 @@ struct blk_mq_ctx {
         struct kobject          kobj;
  } ____cacheline_aligned_in_smp;
  
+/*
+ * Bits for request->gstate.  The lower two bits carry MQ_RQ_* state value
+ * and the upper bits the generation number.
+ */
+enum mq_rq_state {
+       MQ_RQ_IDLE              = 0,
+       MQ_RQ_IN_FLIGHT         = 1,
+
+       MQ_RQ_STATE_BITS        = 2,
+       MQ_RQ_STATE_MASK        = (1 << MQ_RQ_STATE_BITS) - 1,
+       MQ_RQ_GEN_INC           = 1 << MQ_RQ_STATE_BITS,
+};
+
  void blk_mq_freeze_queue(struct request_queue *q);
  void blk_mq_free_queue(struct request_queue *q);
  int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -85,6 +98,39 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
  
  void blk_mq_release(struct request_queue *q);
  
+/**
+ * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
+ * @rq: target request.
+ */
+static inline int blk_mq_rq_state(struct request *rq)
+{
+       return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
+}
+
+/**
+ * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
+ * @rq: target request.
+ * @state: new state to set.
+ *
+ * Set @rq's state to @state.  The caller is responsible for ensuring that
+ * there are no other updaters.  A request can transition into IN_FLIGHT
+ * only from IDLE and doing so increments the generation number.
+ */
+static inline void blk_mq_rq_update_state(struct request *rq,
+                                         enum mq_rq_state state)
+{
+       u64 old_val = READ_ONCE(rq->gstate);
+       u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
+
+       if (state == MQ_RQ_IN_FLIGHT) {
+               WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
+               new_val += MQ_RQ_GEN_INC;
+       }
+
+       /* avoid exposing interim values */
+       WRITE_ONCE(rq->gstate, new_val);
+}
+
  static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
                                            unsigned int cpu)
  {
diff --git a/block/blk-timeout.c b/block/blk-timeout.c

index 764ecf9aeb30516fbfb0401fca1ad116b9815866..6427be7ac363759f4331e7592888a45071b08467 100644 (file)
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -208,7 +208,7 @@ void blk_add_timer(struct request *req)
         if (!req->timeout)
                 req->timeout = q->rq_timeout;
  
-       WRITE_ONCE(req->deadline, jiffies + req->timeout);
+       req->deadline = jiffies + req->timeout;
  
         /*
          * Only the non-mq case needs to add the request to a protected list.
diff --git a/block/blk.h b/block/blk.h

index 3f1446937aece26f38ceb66cf4e3d159a23df871..9cb2739edb6af9be99ead3b3459e62b5182ca1ee 100644 (file)
--- a/block/blk.h
+++ b/block/blk.h
@@ -123,12 +123,6 @@ void blk_account_io_done(struct request *req);
   * Internal atomic flags for request handling
   */
  enum rq_atomic_flags {
-       /*
-        * Keep these two bits first - not because we depend on the
-        * value of them, but we do depend on them being in the same
-        * byte of storage to ensure ordering on writes. Keeping them
-        * first will achieve that nicely.
-        */
         REQ_ATOM_COMPLETE = 0,
         REQ_ATOM_STARTED,
  
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h

index 95c9a5c862e2545b26922b3cbb2103200a29a888..460798dbac1fd40c981051cc8a88d4eb8cfdba10 100644 (file)
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -51,6 +51,7 @@ struct blk_mq_hw_ctx {
         unsigned int            queue_num;
  
         atomic_t                nr_active;
+       unsigned int            nr_expired;
  
         struct hlist_node       cpuhp_dead;
         struct kobject          kobj;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h

index 46e606f5b44b3b7e5ec2cab1a476d3f46380ff67..ae563d01b29d7eb1f20b1f442fe0c31b57001b94 100644 (file)
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -27,6 +27,8 @@
  #include <linux/percpu-refcount.h>
  #include <linux/scatterlist.h>
  #include <linux/blkzoned.h>
+#include <linux/seqlock.h>
+#include <linux/u64_stats_sync.h>
  
  struct module;
  struct scsi_ioctl_command;
@@ -230,6 +232,27 @@ struct request {
  
         unsigned short write_hint;
  
+       /*
+        * On blk-mq, the lower bits of ->gstate (generation number and
+        * state) carry the MQ_RQ_* state value and the upper bits the
+        * generation number which is monotonically incremented and used to
+        * distinguish the reuse instances.
+        *
+        * ->gstate_seq allows updates to ->gstate and other fields
+        * (currently ->deadline) during request start to be read
+        * atomically from the timeout path, so that it can operate on a
+        * coherent set of information.
+        */
+       seqcount_t gstate_seq;
+       u64 gstate;
+
+       /*
+        * ->aborted_gstate is used by the timeout to claim a specific
+        * recycle instance of this request.  See blk_mq_timeout_work().
+        */
+       struct u64_stats_sync aborted_gstate_sync;
+       u64 aborted_gstate;
+
         unsigned long deadline;
         struct list_head timeout_list;
author	Tejun Heo <tj@kernel.org>
	Tue, 9 Jan 2018 16:29:48 +0000 (08:29 -0800)
committer	Jens Axboe <axboe@kernel.dk>
	Tue, 9 Jan 2018 16:31:15 +0000 (09:31 -0700)
block/blk-core.c		patch \| blob \| history
block/blk-mq.c		patch \| blob \| history
block/blk-mq.h		patch \| blob \| history
block/blk-timeout.c		patch \| blob \| history
block/blk.h		patch \| blob \| history
include/linux/blk-mq.h		patch \| blob \| history
include/linux/blkdev.h		patch \| blob \| history