[BLOCK] Reimplement elevator switch
author Tejun Heo <htejun@gmail.com>
Fri, 28 Oct 2005 06:29:39 +0000 (08:29 +0200)
committer Jens Axboe <axboe@nelson.home.kernel.dk>
Fri, 28 Oct 2005 06:48:12 +0000 (08:48 +0200)
 This patch reimplements the elevator switch.  It assumes that the
generic dispatch queue patchset is applied.

 * Each request is tagged with the REQ_ELVPRIV flag if it has its
   elevator private data set.
 * Requests which don't have the REQ_ELVPRIV flag set never enter the
   iosched.  They are always inserted directly at the back of the
   dispatch queue.  Of course, elevator_put_req_fn is called only for
   requests which have REQ_ELVPRIV set.
 * The request queue maintains the current number of requests which
   have their elevator data set (elevator_set_req_fn called) in
   q->rq.elvpriv.
 * If a request queue has QUEUE_FLAG_BYPASS set, elevator private data
   is not allocated for new requests (see the sketch after this list).
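
 A condensed sketch of the allocation-side logic (taken from the
ll_rw_blk.c hunks below, with error paths and locking details trimmed):

	/* get_request(): decide whether this request gets iosched data */
	priv = !test_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
	if (priv)
		rl->elvpriv++;		/* counted under q->queue_lock */

	/* blk_alloc_request(): attach elevator private data only if priv */
	if (priv) {
		if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
			mempool_free(rq, q->rq.rq_pool);
			return NULL;
		}
		rq->flags |= REQ_ELVPRIV;
	}

	/* blk_free_request(): only tagged requests go back to the iosched */
	if (rq->flags & REQ_ELVPRIV)
		elv_put_request(q, rq);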

 To switch to another iosched, we set QUEUE_FLAG_BYPASS and wait until
elvpriv drops to zero; then we attach the new iosched and clear
QUEUE_FLAG_BYPASS, as sketched below.  The new implementation is much
simpler, and the main code paths are less cluttered, IMHO.
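
 The switch boils down to the following sequence (a condensed sketch of
the new elevator_switch(); see the elevator.c diff for the complete
version with error handling):

	spin_lock_irq(q->queue_lock);
	set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);

	/* push whatever the old iosched still holds onto the dispatch queue */
	while (q->elevator->ops->elevator_dispatch_fn(q, 1))
		;

	/* wait until no request with elevator private data is left */
	while (q->rq.elvpriv) {
		spin_unlock_irq(q->queue_lock);
		msleep(100);
		spin_lock_irq(q->queue_lock);
	}
	spin_unlock_irq(q->queue_lock);

	/* old iosched is idle now: unregister it and attach the new one */
	elv_unregister_queue(q);
	old_elevator = q->elevator;
	/* ... elevator_attach() + elv_register_queue() as before ... */
	elevator_exit(old_elevator);
	clear_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);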

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
drivers/block/elevator.c
drivers/block/ll_rw_blk.c
include/linux/blkdev.h

drivers/block/elevator.c
index af2388e73f614343a2ec6540085591f49084b0ee..272d939466211a73e0f97ef5f24b5a60e6d5586c 100644
@@ -34,6 +34,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/compiler.h>
+#include <linux/delay.h>
 
 #include <asm/uaccess.h>
 
@@ -131,11 +132,7 @@ static int elevator_attach(request_queue_t *q, struct elevator_type *e,
        eq->ops = &e->ops;
        eq->elevator_type = e;
 
-       INIT_LIST_HEAD(&q->queue_head);
-       q->last_merge = NULL;
        q->elevator = eq;
-       q->end_sector = 0;
-       q->boundary_rq = NULL;
 
        if (eq->ops->elevator_init_fn)
                ret = eq->ops->elevator_init_fn(q, eq);
@@ -184,6 +181,12 @@ int elevator_init(request_queue_t *q, char *name)
        struct elevator_queue *eq;
        int ret = 0;
 
+       INIT_LIST_HEAD(&q->queue_head);
+       q->last_merge = NULL;
+       q->end_sector = 0;
+       q->boundary_rq = NULL;
+       q->max_back_kb = 0;
+
        elevator_setup_default();
 
        if (!name)
@@ -336,23 +339,14 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where,
                        q->end_sector = rq_end_sector(rq);
                        q->boundary_rq = rq;
                }
-       }
+       } else if (!(rq->flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)
+               where = ELEVATOR_INSERT_BACK;
 
        if (plug)
                blk_plug_device(q);
 
        rq->q = q;
 
-       if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) {
-               /*
-                * if drain is set, store the request "locally". when the drain
-                * is finished, the requests will be handed ordered to the io
-                * scheduler
-                */
-               list_add_tail(&rq->queuelist, &q->drain_list);
-               return;
-       }
-
        switch (where) {
        case ELEVATOR_INSERT_FRONT:
                rq->flags |= REQ_SOFTBARRIER;
@@ -659,25 +653,36 @@ EXPORT_SYMBOL_GPL(elv_unregister);
  * switch to new_e io scheduler. be careful not to introduce deadlocks -
  * we don't free the old io scheduler, before we have allocated what we
  * need for the new one. this way we have a chance of going back to the old
- * one, if the new one fails init for some reason. we also do an intermediate
- * switch to noop to ensure safety with stack-allocated requests, since they
- * don't originate from the block layer allocator. noop is safe here, because
- * it never needs to touch the elevator itself for completion events. DRAIN
- * flags will make sure we don't touch it for additions either.
+ * one, if the new one fails init for some reason.
  */
 static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 {
-       elevator_t *e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
-       struct elevator_type *noop_elevator = NULL;
-       elevator_t *old_elevator;
+       elevator_t *old_elevator, *e;
 
+       /*
+        * Allocate new elevator
+        */
+       e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
        if (!e)
                goto error;
 
        /*
-        * first step, drain requests from the block freelist
+        * Turn on BYPASS and drain all requests w/ elevator private data
         */
-       blk_wait_queue_drained(q, 0);
+       spin_lock_irq(q->queue_lock);
+
+       set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+
+       while (q->elevator->ops->elevator_dispatch_fn(q, 1))
+               ;
+
+       while (q->rq.elvpriv) {
+               spin_unlock_irq(q->queue_lock);
+               msleep(100);
+               spin_lock_irq(q->queue_lock);
+       }
+
+       spin_unlock_irq(q->queue_lock);
 
        /*
         * unregister old elevator data
@@ -685,18 +690,6 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
        elv_unregister_queue(q);
        old_elevator = q->elevator;
 
-       /*
-        * next step, switch to noop since it uses no private rq structures
-        * and doesn't allocate any memory for anything. then wait for any
-        * non-fs requests in-flight
-        */
-       noop_elevator = elevator_get("noop");
-       spin_lock_irq(q->queue_lock);
-       elevator_attach(q, noop_elevator, e);
-       spin_unlock_irq(q->queue_lock);
-
-       blk_wait_queue_drained(q, 1);
-
        /*
         * attach and start new elevator
         */
@@ -707,11 +700,10 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
                goto fail_register;
 
        /*
-        * finally exit old elevator and start queue again
+        * finally exit old elevator and turn off BYPASS.
         */
        elevator_exit(old_elevator);
-       blk_finish_queue_drain(q);
-       elevator_put(noop_elevator);
+       clear_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
        return;
 
 fail_register:
@@ -720,13 +712,13 @@ fail_register:
         * one again (along with re-adding the sysfs dir)
         */
        elevator_exit(e);
+       e = NULL;
 fail:
        q->elevator = old_elevator;
        elv_register_queue(q);
-       blk_finish_queue_drain(q);
+       clear_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+       kfree(e);
 error:
-       if (noop_elevator)
-               elevator_put(noop_elevator);
        elevator_put(new_e);
        printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);
 }
drivers/block/ll_rw_blk.c
index d2a66fd309c3df6c323dbb00156450679e623227..f7c9931cb3803e64590d6998e9645ffff153b6e2 100644
@@ -263,8 +263,6 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
        blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
 
        blk_queue_activity_fn(q, NULL, NULL);
-
-       INIT_LIST_HEAD(&q->drain_list);
 }
 
 EXPORT_SYMBOL(blk_queue_make_request);
@@ -1050,6 +1048,7 @@ static char *rq_flags[] = {
        "REQ_STARTED",
        "REQ_DONTPREP",
        "REQ_QUEUED",
+       "REQ_ELVPRIV",
        "REQ_PC",
        "REQ_BLOCK_PC",
        "REQ_SENSE",
@@ -1640,9 +1639,9 @@ static int blk_init_free_list(request_queue_t *q)
 
        rl->count[READ] = rl->count[WRITE] = 0;
        rl->starved[READ] = rl->starved[WRITE] = 0;
+       rl->elvpriv = 0;
        init_waitqueue_head(&rl->wait[READ]);
        init_waitqueue_head(&rl->wait[WRITE]);
-       init_waitqueue_head(&rl->drain);
 
        rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
                                mempool_free_slab, request_cachep, q->node);
@@ -1785,12 +1784,14 @@ EXPORT_SYMBOL(blk_get_queue);
 
 static inline void blk_free_request(request_queue_t *q, struct request *rq)
 {
-       elv_put_request(q, rq);
+       if (rq->flags & REQ_ELVPRIV)
+               elv_put_request(q, rq);
        mempool_free(rq, q->rq.rq_pool);
 }
 
 static inline struct request *
-blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)
+blk_alloc_request(request_queue_t *q, int rw, struct bio *bio,
+                 int priv, int gfp_mask)
 {
        struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
 
@@ -1803,11 +1804,15 @@ blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)
         */
        rq->flags = rw;
 
-       if (!elv_set_request(q, rq, bio, gfp_mask))
-               return rq;
+       if (priv) {
+               if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
+                       mempool_free(rq, q->rq.rq_pool);
+                       return NULL;
+               }
+               rq->flags |= REQ_ELVPRIV;
+       }
 
-       mempool_free(rq, q->rq.rq_pool);
-       return NULL;
+       return rq;
 }
 
 /*
@@ -1863,22 +1868,18 @@ static void __freed_request(request_queue_t *q, int rw)
  * A request has just been released.  Account for it, update the full and
  * congestion status, wake up any waiters.   Called under q->queue_lock.
  */
-static void freed_request(request_queue_t *q, int rw)
+static void freed_request(request_queue_t *q, int rw, int priv)
 {
        struct request_list *rl = &q->rq;
 
        rl->count[rw]--;
+       if (priv)
+               rl->elvpriv--;
 
        __freed_request(q, rw);
 
        if (unlikely(rl->starved[rw ^ 1]))
                __freed_request(q, rw ^ 1);
-
-       if (!rl->count[READ] && !rl->count[WRITE]) {
-               smp_mb();
-               if (unlikely(waitqueue_active(&rl->drain)))
-                       wake_up(&rl->drain);
-       }
 }
 
 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
@@ -1893,9 +1894,7 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
        struct request *rq = NULL;
        struct request_list *rl = &q->rq;
        struct io_context *ioc = current_io_context(GFP_ATOMIC);
-
-       if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)))
-               goto out;
+       int priv;
 
        if (rl->count[rw]+1 >= q->nr_requests) {
                /*
@@ -1940,9 +1939,14 @@ get_rq:
        rl->starved[rw] = 0;
        if (rl->count[rw] >= queue_congestion_on_threshold(q))
                set_queue_congested(q, rw);
+
+       priv = !test_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+       if (priv)
+               rl->elvpriv++;
+
        spin_unlock_irq(q->queue_lock);
 
-       rq = blk_alloc_request(q, rw, bio, gfp_mask);
+       rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);
        if (!rq) {
                /*
                 * Allocation failed presumably due to memory. Undo anything
@@ -1952,7 +1956,7 @@ get_rq:
                 * wait queue, but this is pretty rare.
                 */
                spin_lock_irq(q->queue_lock);
-               freed_request(q, rw);
+               freed_request(q, rw, priv);
 
                /*
                 * in the very unlikely event that allocation failed and no
@@ -2470,11 +2474,12 @@ static void __blk_put_request(request_queue_t *q, struct request *req)
         */
        if (rl) {
                int rw = rq_data_dir(req);
+               int priv = req->flags & REQ_ELVPRIV;
 
                BUG_ON(!list_empty(&req->queuelist));
 
                blk_free_request(q, req);
-               freed_request(q, rw);
+               freed_request(q, rw, priv);
        }
 }
 
@@ -2802,97 +2807,6 @@ static inline void blk_partition_remap(struct bio *bio)
        }
 }
 
-void blk_finish_queue_drain(request_queue_t *q)
-{
-       struct request_list *rl = &q->rq;
-       struct request *rq;
-       int requeued = 0;
-
-       spin_lock_irq(q->queue_lock);
-       clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
-
-       while (!list_empty(&q->drain_list)) {
-               rq = list_entry_rq(q->drain_list.next);
-
-               list_del_init(&rq->queuelist);
-               elv_requeue_request(q, rq);
-               requeued++;
-       }
-
-       if (requeued)
-               q->request_fn(q);
-
-       spin_unlock_irq(q->queue_lock);
-
-       wake_up(&rl->wait[0]);
-       wake_up(&rl->wait[1]);
-       wake_up(&rl->drain);
-}
-
-static int wait_drain(request_queue_t *q, struct request_list *rl, int dispatch)
-{
-       int wait = rl->count[READ] + rl->count[WRITE];
-
-       if (dispatch)
-               wait += !list_empty(&q->queue_head);
-
-       return wait;
-}
-
-/*
- * We rely on the fact that only requests allocated through blk_alloc_request()
- * have io scheduler private data structures associated with them. Any other
- * type of request (allocated on stack or through kmalloc()) should not go
- * to the io scheduler core, but be attached to the queue head instead.
- */
-void blk_wait_queue_drained(request_queue_t *q, int wait_dispatch)
-{
-       struct request_list *rl = &q->rq;
-       DEFINE_WAIT(wait);
-
-       spin_lock_irq(q->queue_lock);
-       set_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
-
-       while (wait_drain(q, rl, wait_dispatch)) {
-               prepare_to_wait(&rl->drain, &wait, TASK_UNINTERRUPTIBLE);
-
-               if (wait_drain(q, rl, wait_dispatch)) {
-                       __generic_unplug_device(q);
-                       spin_unlock_irq(q->queue_lock);
-                       io_schedule();
-                       spin_lock_irq(q->queue_lock);
-               }
-
-               finish_wait(&rl->drain, &wait);
-       }
-
-       spin_unlock_irq(q->queue_lock);
-}
-
-/*
- * block waiting for the io scheduler being started again.
- */
-static inline void block_wait_queue_running(request_queue_t *q)
-{
-       DEFINE_WAIT(wait);
-
-       while (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) {
-               struct request_list *rl = &q->rq;
-
-               prepare_to_wait_exclusive(&rl->drain, &wait,
-                               TASK_UNINTERRUPTIBLE);
-
-               /*
-                * re-check the condition. avoids using prepare_to_wait()
-                * in the fast path (queue is running)
-                */
-               if (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))
-                       io_schedule();
-
-               finish_wait(&rl->drain, &wait);
-       }
-}
-
 static void handle_bad_sector(struct bio *bio)
 {
        char b[BDEVNAME_SIZE];
@@ -2988,8 +2902,6 @@ end_io:
                if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
                        goto end_io;
 
-               block_wait_queue_running(q);
-
                /*
                 * If this device has partitions, remap block n
                 * of partition p to block n+start(p) of the disk.
include/linux/blkdev.h
index 159dbcd2eb5915e461ccdef889f5b00ef791cfd9..6186d5e2110fed06a5315a3692b75fdf21504b02 100644
@@ -107,9 +107,9 @@ typedef void (rq_end_io_fn)(struct request *);
 struct request_list {
        int count[2];
        int starved[2];
+       int elvpriv;
        mempool_t *rq_pool;
        wait_queue_head_t wait[2];
-       wait_queue_head_t drain;
 };
 
 #define BLK_MAX_CDB    16
@@ -211,6 +211,7 @@ enum rq_flag_bits {
        __REQ_STARTED,          /* drive already may have started this one */
        __REQ_DONTPREP,         /* don't call prep for this one */
        __REQ_QUEUED,           /* uses queueing */
+       __REQ_ELVPRIV,          /* elevator private data attached */
        /*
         * for ATA/ATAPI devices
         */
@@ -244,6 +245,7 @@ enum rq_flag_bits {
 #define REQ_STARTED    (1 << __REQ_STARTED)
 #define REQ_DONTPREP   (1 << __REQ_DONTPREP)
 #define REQ_QUEUED     (1 << __REQ_QUEUED)
+#define REQ_ELVPRIV    (1 << __REQ_ELVPRIV)
 #define REQ_PC         (1 << __REQ_PC)
 #define REQ_BLOCK_PC   (1 << __REQ_BLOCK_PC)
 #define REQ_SENSE      (1 << __REQ_SENSE)
@@ -413,8 +415,6 @@ struct request_queue
        unsigned int            sg_reserved_size;
        int                     node;
 
-       struct list_head        drain_list;
-
        /*
         * reserved for flush operations
         */
@@ -442,7 +442,7 @@ enum {
 #define QUEUE_FLAG_DEAD                5       /* queue being torn down */
 #define QUEUE_FLAG_REENTER     6       /* Re-entrancy avoidance */
 #define QUEUE_FLAG_PLUGGED     7       /* queue is plugged */
-#define QUEUE_FLAG_DRAIN       8       /* draining queue for sched switch */
+#define QUEUE_FLAG_BYPASS      8       /* don't use elevator, just do FIFO */
 #define QUEUE_FLAG_FLUSH       9       /* doing barrier flush sequence */
 
 #define blk_queue_plugged(q)   test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
@@ -668,8 +668,6 @@ extern void blk_dump_rq_flags(struct request *, char *);
 extern void generic_unplug_device(request_queue_t *);
 extern void __generic_unplug_device(request_queue_t *);
 extern long nr_blockdev_pages(void);
-extern void blk_wait_queue_drained(request_queue_t *, int);
-extern void blk_finish_queue_drain(request_queue_t *);
 
 int blk_get_queue(request_queue_t *);
 request_queue_t *blk_alloc_queue(int gfp_mask);