md/raid1: fix potential data inconsistency issue with write behind device
authorGuoqing Jiang <gqjiang@suse.com>
Wed, 19 Jun 2019 09:30:46 +0000 (17:30 +0800)
committerSong Liu <songliubraving@fb.com>
Thu, 20 Jun 2019 23:35:59 +0000 (16:35 -0700)
For write-behind mode, we think write IO is complete once it has
reached all the non-writemostly devices. It works fine for single
queue devices.

But for multiqueue device, if there are lots of IOs come from upper
layer, then the write-behind device could issue those IOs to different
queues, depends on the each queue's delay, so there is no guarantee
that those IOs can arrive in order.

To address the issue, we need to check the collision among write
behind IOs, we can only continue without collision, otherwise wait
for the completion of previous collisioned IO.

And WBCollision is introduced for multiqueue device which is worked
under write-behind mode.

But this patch doesn't handle below cases which could have the data
inconsistency issue as well, these cases will be handled in later
patches.

1. modify max_write_behind by write backlog node.
2. add or remove array's bitmap dynamically.
3. the change of member disk.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
drivers/md/md.c
drivers/md/md.h
drivers/md/raid1.c

index 1f37a1adc926692a4916f03e0c47484cf63a24c6..9a9762c83cc866882e72318a1b2b11d124e84fa1 100644 (file)
@@ -124,6 +124,19 @@ static inline int speed_max(struct mddev *mddev)
                mddev->sync_speed_max : sysctl_speed_limit_max;
 }
 
+static int rdev_init_wb(struct md_rdev *rdev)
+{
+       if (rdev->bdev->bd_queue->nr_hw_queues == 1)
+               return 0;
+
+       spin_lock_init(&rdev->wb_list_lock);
+       INIT_LIST_HEAD(&rdev->wb_list);
+       init_waitqueue_head(&rdev->wb_io_wait);
+       set_bit(WBCollisionCheck, &rdev->flags);
+
+       return 1;
+}
+
 static struct ctl_table_header *raid_table_header;
 
 static struct ctl_table raid_table[] = {
@@ -5597,6 +5610,32 @@ int md_run(struct mddev *mddev)
                md_bitmap_destroy(mddev);
                goto abort;
        }
+
+       if (mddev->bitmap_info.max_write_behind > 0) {
+               bool creat_pool = false;
+
+               rdev_for_each(rdev, mddev) {
+                       if (test_bit(WriteMostly, &rdev->flags) &&
+                           rdev_init_wb(rdev))
+                               creat_pool = true;
+               }
+               if (creat_pool && mddev->wb_info_pool == NULL) {
+                       mddev->wb_info_pool =
+                               mempool_create_kmalloc_pool(NR_WB_INFOS,
+                                                   sizeof(struct wb_info));
+                       if (!mddev->wb_info_pool) {
+                               err = -ENOMEM;
+                               mddev_detach(mddev);
+                               if (mddev->private)
+                                       pers->free(mddev, mddev->private);
+                               mddev->private = NULL;
+                               module_put(pers->owner);
+                               md_bitmap_destroy(mddev);
+                               goto abort;
+                       }
+               }
+       }
+
        if (mddev->queue) {
                bool nonrot = true;
 
@@ -5825,6 +5864,8 @@ static void __md_stop_writes(struct mddev *mddev)
                        mddev->in_sync = 1;
                md_update_sb(mddev, 1);
        }
+       mempool_destroy(mddev->wb_info_pool);
+       mddev->wb_info_pool = NULL;
 }
 
 void md_stop_writes(struct mddev *mddev)
index 7c930c0911932f5c594195f4d471ab76b0a73df8..d449d514cff9886471b6530dcb08454472f77403 100644 (file)
@@ -109,6 +109,14 @@ struct md_rdev {
                                           * for reporting to userspace and storing
                                           * in superblock.
                                           */
+
+       /*
+        * The members for check collision of write behind IOs.
+        */
+       struct list_head wb_list;
+       spinlock_t wb_list_lock;
+       wait_queue_head_t wb_io_wait;
+
        struct work_struct del_work;    /* used for delayed sysfs removal */
 
        struct kernfs_node *sysfs_state; /* handle for 'state'
@@ -193,6 +201,10 @@ enum flag_bits {
                                 * it didn't fail, so don't use FailFast
                                 * any more for metadata
                                 */
+       WBCollisionCheck,       /*
+                                * multiqueue device should check if there
+                                * is collision between write behind bios.
+                                */
 };
 
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -245,6 +257,14 @@ enum mddev_sb_flags {
        MD_SB_NEED_REWRITE,     /* metadata write needs to be repeated */
 };
 
+#define NR_WB_INFOS    8
+/* record current range of write behind IOs */
+struct wb_info {
+       sector_t lo;
+       sector_t hi;
+       struct list_head list;
+};
+
 struct mddev {
        void                            *private;
        struct md_personality           *pers;
@@ -461,6 +481,7 @@ struct mddev {
                                          */
        struct work_struct flush_work;
        struct work_struct event_work;  /* used by dm to report failure event */
+       mempool_t *wb_info_pool;
        void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
        struct md_cluster_info          *cluster_info;
        unsigned int                    good_device_nr; /* good device num within cluster raid */
index a7860b5f33f2dc66c8e66393f00b1e8781a9b920..3d44da6637973963954c19560d5f38685c5949fa 100644 (file)
@@ -50,6 +50,57 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
 
 #include "raid1-10.c"
 
+static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
+{
+       struct wb_info *wi, *temp_wi;
+       unsigned long flags;
+       int ret = 0;
+       struct mddev *mddev = rdev->mddev;
+
+       wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO);
+
+       spin_lock_irqsave(&rdev->wb_list_lock, flags);
+       list_for_each_entry(temp_wi, &rdev->wb_list, list) {
+               /* collision happened */
+               if (hi > temp_wi->lo && lo < temp_wi->hi) {
+                       ret = -EBUSY;
+                       break;
+               }
+       }
+
+       if (!ret) {
+               wi->lo = lo;
+               wi->hi = hi;
+               list_add(&wi->list, &rdev->wb_list);
+       } else
+               mempool_free(wi, mddev->wb_info_pool);
+       spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
+
+       return ret;
+}
+
+static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
+{
+       struct wb_info *wi;
+       unsigned long flags;
+       int found = 0;
+       struct mddev *mddev = rdev->mddev;
+
+       spin_lock_irqsave(&rdev->wb_list_lock, flags);
+       list_for_each_entry(wi, &rdev->wb_list, list)
+               if (hi == wi->hi && lo == wi->lo) {
+                       list_del(&wi->list);
+                       mempool_free(wi, mddev->wb_info_pool);
+                       found = 1;
+                       break;
+               }
+
+       if (!found)
+               WARN_ON("The write behind IO is not recorded\n");
+       spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
+       wake_up(&rdev->wb_io_wait);
+}
+
 /*
  * for resync bio, r1bio pointer can be retrieved from the per-bio
  * 'struct resync_pages'.
@@ -446,6 +497,12 @@ static void raid1_end_write_request(struct bio *bio)
        }
 
        if (behind) {
+               if (test_bit(WBCollisionCheck, &rdev->flags)) {
+                       sector_t lo = r1_bio->sector;
+                       sector_t hi = r1_bio->sector + r1_bio->sectors;
+
+                       remove_wb(rdev, lo, hi);
+               }
                if (test_bit(WriteMostly, &rdev->flags))
                        atomic_dec(&r1_bio->behind_remaining);
 
@@ -1443,7 +1500,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                        mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
 
                if (r1_bio->behind_master_bio) {
-                       if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
+                       struct md_rdev *rdev = conf->mirrors[i].rdev;
+
+                       if (test_bit(WBCollisionCheck, &rdev->flags)) {
+                               sector_t lo = r1_bio->sector;
+                               sector_t hi = r1_bio->sector + r1_bio->sectors;
+
+                               wait_event(rdev->wb_io_wait,
+                                          check_and_add_wb(rdev, lo, hi) == 0);
+                       }
+                       if (test_bit(WriteMostly, &rdev->flags))
                                atomic_inc(&r1_bio->behind_remaining);
                }