dm zoned: properly handle backing device failure
authorDmitry Fomichev <dmitry.fomichev@wdc.com>
Sat, 10 Aug 2019 21:43:11 +0000 (14:43 -0700)
committerMike Snitzer <snitzer@redhat.com>
Thu, 15 Aug 2019 19:57:42 +0000 (15:57 -0400)
dm-zoned is observed to lock up or livelock in case of hardware
failure or some misconfiguration of the backing zoned device.

This patch adds a new dm-zoned target function that checks the status of
the backing device. If the request queue of the backing device is found
to be in dying state or the SCSI backing device enters offline state,
the health check code sets a dm-zoned target flag prompting all further
incoming I/O to be rejected. In order to detect backing device failures
timely, this new function is called in the request mapping path, at the
beginning of every reclaim run and before performing any metadata I/O.

The proper way out of this situation is to do

dmsetup remove <dm-zoned target>

and recreate the target when the problem with the backing device
is resolved.

Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target")
Cc: stable@vger.kernel.org
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
drivers/md/dm-zoned-metadata.c
drivers/md/dm-zoned-reclaim.c
drivers/md/dm-zoned-target.c
drivers/md/dm-zoned.h

index 935d9be5af3967a4a671d9c1c215aa9b84d0753f..2882897aece20891c9f7bb22840e36504321b19b 100644 (file)
@@ -402,15 +402,18 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
        sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
        struct bio *bio;
 
+       if (dmz_bdev_is_dying(zmd->dev))
+               return ERR_PTR(-EIO);
+
        /* Get a new block and a BIO to read it */
        mblk = dmz_alloc_mblock(zmd, mblk_no);
        if (!mblk)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
 
        bio = bio_alloc(GFP_NOIO, 1);
        if (!bio) {
                dmz_free_mblock(zmd, mblk);
-               return NULL;
+               return ERR_PTR(-ENOMEM);
        }
 
        spin_lock(&zmd->mblk_lock);
@@ -541,8 +544,8 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
        if (!mblk) {
                /* Cache miss: read the block from disk */
                mblk = dmz_get_mblock_slow(zmd, mblk_no);
-               if (!mblk)
-                       return ERR_PTR(-ENOMEM);
+               if (IS_ERR(mblk))
+                       return mblk;
        }
 
        /* Wait for on-going read I/O and check for error */
@@ -570,16 +573,19 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
 /*
  * Issue a metadata block write BIO.
  */
-static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
-                            unsigned int set)
+static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
+                           unsigned int set)
 {
        sector_t block = zmd->sb[set].block + mblk->no;
        struct bio *bio;
 
+       if (dmz_bdev_is_dying(zmd->dev))
+               return -EIO;
+
        bio = bio_alloc(GFP_NOIO, 1);
        if (!bio) {
                set_bit(DMZ_META_ERROR, &mblk->state);
-               return;
+               return -ENOMEM;
        }
 
        set_bit(DMZ_META_WRITING, &mblk->state);
@@ -591,6 +597,8 @@ static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
        bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
        bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
        submit_bio(bio);
+
+       return 0;
 }
 
 /*
@@ -602,6 +610,9 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
        struct bio *bio;
        int ret;
 
+       if (dmz_bdev_is_dying(zmd->dev))
+               return -EIO;
+
        bio = bio_alloc(GFP_NOIO, 1);
        if (!bio)
                return -ENOMEM;
@@ -659,22 +670,29 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
 {
        struct dmz_mblock *mblk;
        struct blk_plug plug;
-       int ret = 0;
+       int ret = 0, nr_mblks_submitted = 0;
 
        /* Issue writes */
        blk_start_plug(&plug);
-       list_for_each_entry(mblk, write_list, link)
-               dmz_write_mblock(zmd, mblk, set);
+       list_for_each_entry(mblk, write_list, link) {
+               ret = dmz_write_mblock(zmd, mblk, set);
+               if (ret)
+                       break;
+               nr_mblks_submitted++;
+       }
        blk_finish_plug(&plug);
 
        /* Wait for completion */
        list_for_each_entry(mblk, write_list, link) {
+               if (!nr_mblks_submitted)
+                       break;
                wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
                               TASK_UNINTERRUPTIBLE);
                if (test_bit(DMZ_META_ERROR, &mblk->state)) {
                        clear_bit(DMZ_META_ERROR, &mblk->state);
                        ret = -EIO;
                }
+               nr_mblks_submitted--;
        }
 
        /* Flush drive cache (this will also sync data) */
@@ -736,6 +754,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
         */
        dmz_lock_flush(zmd);
 
+       if (dmz_bdev_is_dying(zmd->dev)) {
+               ret = -EIO;
+               goto out;
+       }
+
        /* Get dirty blocks */
        spin_lock(&zmd->mblk_lock);
        list_splice_init(&zmd->mblk_dirty_list, &write_list);
@@ -1631,6 +1654,10 @@ again:
                /* Alloate a random zone */
                dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
                if (!dzone) {
+                       if (dmz_bdev_is_dying(zmd->dev)) {
+                               dzone = ERR_PTR(-EIO);
+                               goto out;
+                       }
                        dmz_wait_for_free_zones(zmd);
                        goto again;
                }
@@ -1728,6 +1755,10 @@ again:
        /* Alloate a random zone */
        bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
        if (!bzone) {
+               if (dmz_bdev_is_dying(zmd->dev)) {
+                       bzone = ERR_PTR(-EIO);
+                       goto out;
+               }
                dmz_wait_for_free_zones(zmd);
                goto again;
        }
index e381354dc13689621efc844bc41212337d7a8572..9470b8f77a337bb22e639ea8e45defc12cb59cb2 100644 (file)
@@ -37,7 +37,7 @@ enum {
 /*
  * Number of seconds of target BIO inactivity to consider the target idle.
  */
-#define DMZ_IDLE_PERIOD                (10UL * HZ)
+#define DMZ_IDLE_PERIOD                        (10UL * HZ)
 
 /*
  * Percentage of unmapped (free) random zones below which reclaim starts
@@ -134,6 +134,9 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
                set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
 
        while (block < end_block) {
+               if (dev->flags & DMZ_BDEV_DYING)
+                       return -EIO;
+
                /* Get a valid region from the source zone */
                ret = dmz_first_valid_block(zmd, src_zone, &block);
                if (ret <= 0)
@@ -451,6 +454,9 @@ static void dmz_reclaim_work(struct work_struct *work)
        unsigned int p_unmap_rnd;
        int ret;
 
+       if (dmz_bdev_is_dying(zrc->dev))
+               return;
+
        if (!dmz_should_reclaim(zrc)) {
                mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
                return;
@@ -480,8 +486,16 @@ static void dmz_reclaim_work(struct work_struct *work)
                      p_unmap_rnd, nr_unmap_rnd, nr_rnd);
 
        ret = dmz_do_reclaim(zrc);
-       if (ret)
+       if (ret) {
                dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
+               if (ret == -EIO)
+                       /*
+                        * LLD might be performing some error handling sequence
+                        * at the underlying device. To not interfere, do not
+                        * attempt to schedule the next reclaim run immediately.
+                        */
+                       return;
+       }
 
        dmz_schedule_reclaim(zrc);
 }
index 944db71ed3d76cb0e26640a2e79e7a68ec571159..ff3fd011796ed757ecda391e106c12e27d0669dc 100644 (file)
@@ -133,6 +133,8 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
 
        refcount_inc(&bioctx->ref);
        generic_make_request(clone);
+       if (clone->bi_status == BLK_STS_IOERR)
+               return -EIO;
 
        if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
                zone->wp_block += nr_blocks;
@@ -277,8 +279,8 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz,
 
        /* Get the buffer zone. One will be allocated if needed */
        bzone = dmz_get_chunk_buffer(zmd, zone);
-       if (!bzone)
-               return -ENOSPC;
+       if (IS_ERR(bzone))
+               return PTR_ERR(bzone);
 
        if (dmz_is_readonly(bzone))
                return -EROFS;
@@ -389,6 +391,11 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
 
        dmz_lock_metadata(zmd);
 
+       if (dmz->dev->flags & DMZ_BDEV_DYING) {
+               ret = -EIO;
+               goto out;
+       }
+
        /*
         * Get the data zone mapping the chunk. There may be no
         * mapping for read and discard. If a mapping is obtained,
@@ -493,6 +500,8 @@ static void dmz_flush_work(struct work_struct *work)
 
        /* Flush dirty metadata blocks */
        ret = dmz_flush_metadata(dmz->metadata);
+       if (ret)
+               dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret);
 
        /* Process queued flush requests */
        while (1) {
@@ -556,6 +565,32 @@ out:
        return ret;
 }
 
+/*
+ * Check the backing device availability. If it's on the way out,
+ * start failing I/O. Reclaim and metadata components also call this
+ * function to cleanly abort operation in the event of such failure.
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
+{
+       struct gendisk *disk;
+
+       if (!(dmz_dev->flags & DMZ_BDEV_DYING)) {
+               disk = dmz_dev->bdev->bd_disk;
+               if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
+                       dmz_dev_warn(dmz_dev, "Backing device queue dying");
+                       dmz_dev->flags |= DMZ_BDEV_DYING;
+               } else if (disk->fops->check_events) {
+                       if (disk->fops->check_events(disk, 0) &
+                                       DISK_EVENT_MEDIA_CHANGE) {
+                               dmz_dev_warn(dmz_dev, "Backing device offline");
+                               dmz_dev->flags |= DMZ_BDEV_DYING;
+                       }
+               }
+       }
+
+       return dmz_dev->flags & DMZ_BDEV_DYING;
+}
+
 /*
  * Process a new BIO.
  */
@@ -569,6 +604,9 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
        sector_t chunk_sector;
        int ret;
 
+       if (dmz_bdev_is_dying(dmz->dev))
+               return DM_MAPIO_KILL;
+
        dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
                      bio_op(bio), (unsigned long long)sector, nr_sectors,
                      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
@@ -865,6 +903,9 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
 {
        struct dmz_target *dmz = ti->private;
 
+       if (dmz_bdev_is_dying(dmz->dev))
+               return -ENODEV;
+
        *bdev = dmz->dev->bdev;
 
        return 0;
index ed8de49c9a08263e8ca25ff2979abb2958b4cde0..93a64529f21902968a705c6ea88a292ad6161d24 100644 (file)
@@ -56,6 +56,8 @@ struct dmz_dev {
 
        unsigned int            nr_zones;
 
+       unsigned int            flags;
+
        sector_t                zone_nr_sectors;
        unsigned int            zone_nr_sectors_shift;
 
@@ -67,6 +69,9 @@ struct dmz_dev {
                                 (dev)->zone_nr_sectors_shift)
 #define dmz_chunk_block(dev, b)        ((b) & ((dev)->zone_nr_blocks - 1))
 
+/* Device flags. */
+#define DMZ_BDEV_DYING         (1 << 0)
+
 /*
  * Zone descriptor.
  */
@@ -245,4 +250,9 @@ void dmz_resume_reclaim(struct dmz_reclaim *zrc);
 void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc);
 void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
 
+/*
+ * Functions defined in dm-zoned-target.c
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
+
 #endif /* DM_ZONED_H */