md: fix and update workqueue usage
author: Tejun Heo <tj@kernel.org>
Fri, 15 Oct 2010 13:36:08 +0000 (15:36 +0200)
committer: NeilBrown <neilb@suse.de>
Thu, 28 Oct 2010 06:32:29 +0000 (17:32 +1100)
Workqueue usage in md has two problems.

* Flush can be used during, or depended upon by, memory reclaim, but md
  uses the system workqueue for flush_work, which may lead to deadlock.

* md depends on flush_scheduled_work() to achieve exclusion against
  completion of removal of previous instances.  flush_scheduled_work()
  may incur an unexpected amount of delay and is scheduled to be removed.

This patch adds two workqueues to md - md_wq and md_misc_wq.  The
former is guaranteed to make forward progress under memory pressure
and serves flush_work.  The latter serves as the flush domain for
other works.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
drivers/md/md.c

index 2399168b631560e03d6307c1d8eed1b180888cd0..0b6fa2a1882acb4eff4f01ebb7cb2b386dc04995 100644 (file)
@@ -67,6 +67,8 @@ static DEFINE_SPINLOCK(pers_lock);
 static void md_print_devices(void);
 
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+static struct workqueue_struct *md_wq;
+static struct workqueue_struct *md_misc_wq;
 
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 
@@ -298,7 +300,7 @@ static void md_end_flush(struct bio *bio, int err)
 
        if (atomic_dec_and_test(&mddev->flush_pending)) {
                /* The pre-request flush has finished */
-               schedule_work(&mddev->flush_work);
+               queue_work(md_wq, &mddev->flush_work);
        }
        bio_put(bio);
 }
@@ -367,7 +369,7 @@ void md_flush_request(mddev_t *mddev, struct bio *bio)
        submit_flushes(mddev);
 
        if (atomic_dec_and_test(&mddev->flush_pending))
-               schedule_work(&mddev->flush_work);
+               queue_work(md_wq, &mddev->flush_work);
 }
 EXPORT_SYMBOL(md_flush_request);
 
@@ -434,14 +436,13 @@ static void mddev_put(mddev_t *mddev)
                 * so destroy it */
                list_del(&mddev->all_mddevs);
                if (mddev->gendisk) {
-                       /* we did a probe so need to clean up.
-                        * Call schedule_work inside the spinlock
-                        * so that flush_scheduled_work() after
-                        * mddev_find will succeed in waiting for the
-                        * work to be done.
+                       /* We did a probe so need to clean up.  Call
+                        * queue_work inside the spinlock so that
+                        * flush_workqueue() after mddev_find will
+                        * succeed in waiting for the work to be done.
                         */
                        INIT_WORK(&mddev->del_work, mddev_delayed_delete);
-                       schedule_work(&mddev->del_work);
+                       queue_work(md_misc_wq, &mddev->del_work);
                } else
                        kfree(mddev);
        }
@@ -1848,7 +1849,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
        synchronize_rcu();
        INIT_WORK(&rdev->del_work, md_delayed_delete);
        kobject_get(&rdev->kobj);
-       schedule_work(&rdev->del_work);
+       queue_work(md_misc_wq, &rdev->del_work);
 }
 
 /*
@@ -4192,10 +4193,10 @@ static int md_alloc(dev_t dev, char *name)
        shift = partitioned ? MdpMinorShift : 0;
        unit = MINOR(mddev->unit) >> shift;
 
-       /* wait for any previous instance if this device
-        * to be completed removed (mddev_delayed_delete).
+       /* wait for any previous instance of this device to be
+        * completely removed (mddev_delayed_delete).
         */
-       flush_scheduled_work();
+       flush_workqueue(md_misc_wq);
 
        mutex_lock(&disks_mutex);
        error = -EEXIST;
@@ -5891,7 +5892,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
                 */
                mddev_put(mddev);
                /* Wait until bdev->bd_disk is definitely gone */
-               flush_scheduled_work();
+               flush_workqueue(md_misc_wq);
                /* Then retry the open from the top */
                return -ERESTARTSYS;
        }
@@ -6047,7 +6048,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
        if (mddev->event_work.func)
-               schedule_work(&mddev->event_work);
+               queue_work(md_misc_wq, &mddev->event_work);
        md_new_event_inintr(mddev);
 }
 
@@ -7207,12 +7208,23 @@ static void md_geninit(void)
 
 static int __init md_init(void)
 {
-       if (register_blkdev(MD_MAJOR, "md"))
-               return -1;
-       if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
-               unregister_blkdev(MD_MAJOR, "md");
-               return -1;
-       }
+       int ret = -ENOMEM;
+
+       md_wq = alloc_workqueue("md", WQ_RESCUER, 0);
+       if (!md_wq)
+               goto err_wq;
+
+       md_misc_wq = alloc_workqueue("md_misc", 0, 0);
+       if (!md_misc_wq)
+               goto err_misc_wq;
+
+       if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
+               goto err_md;
+
+       if ((ret = register_blkdev(0, "mdp")) < 0)
+               goto err_mdp;
+       mdp_major = ret;
+
        blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
                            md_probe, NULL, NULL);
        blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
@@ -7223,8 +7235,16 @@ static int __init md_init(void)
 
        md_geninit();
        return 0;
-}
 
+err_mdp:
+       unregister_blkdev(MD_MAJOR, "md");
+err_md:
+       destroy_workqueue(md_misc_wq);
+err_misc_wq:
+       destroy_workqueue(md_wq);
+err_wq:
+       return ret;
+}
 
 #ifndef MODULE
 
@@ -7311,6 +7331,8 @@ static __exit void md_exit(void)
                export_array(mddev);
                mddev->hold_active = 0;
        }
+       destroy_workqueue(md_misc_wq);
+       destroy_workqueue(md_wq);
 }
 
 subsys_initcall(md_init);