dm raid: add raid4/5/6 journaling support

author Heinz Mauelshagen <heinzm@redhat.com>

Wed, 30 Nov 2016 21:31:05 +0000 (22:31 +0100)

committer Mike Snitzer <snitzer@redhat.com>

Wed, 25 Jan 2017 11:49:06 +0000 (12:49 +0100)
author Heinz Mauelshagen <heinzm@redhat.com>
Wed, 30 Nov 2016 21:31:05 +0000 (22:31 +0100)
committer Mike Snitzer <snitzer@redhat.com>
Wed, 25 Jan 2017 11:49:06 +0000 (12:49 +0100)
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt

index c84cdd220f9d975b1df41478af335a6497207cca..0d199353e4776b60e40062cd3f57f98be6166cb7 100644 (file)
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -161,6 +161,15 @@ The target is named "raid" and it accepts the following parameters:
                 the RAID type (i.e. the allocation algorithm) as well, e.g.
                 changing from raid5_ls to raid5_n.
  
+       [journal_dev <dev>]
+               This option adds a journal device to raid4/5/6 raid sets and
+               uses it to close the 'write hole' caused by the non-atomic updates
+               to the component devices which can cause data loss during recovery.
+               The journal device is used as writethrough thus causing writes to
+               be throttled versus non-journaled raid4/5/6 sets.
+               Takeover/reshape is not possible with a raid4/5/6 journal device;
+               it has to be deconfigured before requesting these.
+
  <#raid_devs>: The number of devices composing the array.
         Each device consists of two entries.  The first is the device
         containing the metadata (if any); the second is the one containing the
@@ -245,6 +254,9 @@ recovery.  Here is a fuller description of the individual fields:
         <data_offset>   The current data offset to the start of the user data on
                         each component device of a raid set (see the respective
                         raid parameter to support out-of-place reshaping).
+       <journal_char>  'A' - active raid4/5/6 journal device.
+                       'D' - dead journal device.
+                       '-' - no journal device.
  
  
  Message Interface
@@ -318,3 +330,4 @@ Version History
         fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and
         'D' on the status line.  If '- -' is passed into the constructor, emit
         '- -' on the table line and '-' as the status line health character.
+1.10.0  Add support for raid4/5/6 journal device
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c

index d7e652a22a66d1a609c97f1a7b3614227cb10d98..e52c493212d05a0a20a748d161b636bbccd2e933 100644 (file)
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -24,6 +24,11 @@
   */
  #define        MIN_FREE_RESHAPE_SPACE to_sector(4*4096)
  
+/*
+ * Minimum journal space 4 MiB in sectors.
+ */
+#define        MIN_RAID456_JOURNAL_SPACE (4*2048)
+
  static bool devices_handle_discard_safely = false;
  
  /*
@@ -73,6 +78,9 @@ struct raid_dev {
  #define __CTR_FLAG_DATA_OFFSET         13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
  #define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
  
+/* New for v1.10.0 */
+#define __CTR_FLAG_JOURNAL_DEV         15 /* 2 */ /* Only with raid4/5/6! */
+
  /*
   * Flags for rs->ctr_flags field.
   */
@@ -91,6 +99,7 @@ struct raid_dev {
  #define CTR_FLAG_DELTA_DISKS           (1 << __CTR_FLAG_DELTA_DISKS)
  #define CTR_FLAG_DATA_OFFSET           (1 << __CTR_FLAG_DATA_OFFSET)
  #define CTR_FLAG_RAID10_USE_NEAR_SETS  (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
+#define CTR_FLAG_JOURNAL_DEV           (1 << __CTR_FLAG_JOURNAL_DEV)
  
  /*
   * Definitions of various constructor flags to
@@ -163,7 +172,8 @@ struct raid_dev {
                                  CTR_FLAG_STRIPE_CACHE | \
                                  CTR_FLAG_REGION_SIZE | \
                                  CTR_FLAG_DELTA_DISKS | \
-                                CTR_FLAG_DATA_OFFSET)
+                                CTR_FLAG_DATA_OFFSET | \
+                                CTR_FLAG_JOURNAL_DEV)
  
  #define RAID6_VALID_FLAGS      (CTR_FLAG_SYNC | \
                                  CTR_FLAG_REBUILD | \
@@ -173,7 +183,8 @@ struct raid_dev {
                                  CTR_FLAG_STRIPE_CACHE | \
                                  CTR_FLAG_REGION_SIZE | \
                                  CTR_FLAG_DELTA_DISKS | \
-                                CTR_FLAG_DATA_OFFSET)
+                                CTR_FLAG_DATA_OFFSET | \
+                                CTR_FLAG_JOURNAL_DEV)
  /* ...valid options definitions per raid level */
  
  /*
@@ -222,6 +233,12 @@ struct raid_set {
         struct raid_type *raid_type;
         struct dm_target_callbacks callbacks;
  
+       /* Optional raid4/5/6 journal device */
+       struct journal_dev {
+               struct dm_dev *dev;
+               struct md_rdev rdev;
+       } journal_dev;
+
         struct raid_dev dev[0];
  };
  
@@ -306,6 +323,7 @@ static struct arg_name_flag {
         { CTR_FLAG_DATA_OFFSET, "data_offset"},
         { CTR_FLAG_DELTA_DISKS, "delta_disks"},
         { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
+       { CTR_FLAG_JOURNAL_DEV, "journal_dev" },
  };
  
  /* Return argument name string for given @flag */
@@ -627,7 +645,8 @@ static void rs_set_capacity(struct raid_set *rs)
          * is unintended in case of out-of-place reshaping
          */
         rdev_for_each(rdev, mddev)
-               rdev->sectors = mddev->dev_sectors;
+               if (!test_bit(Journal, &rdev->flags))
+                       rdev->sectors = mddev->dev_sectors;
  
         set_capacity(gendisk, mddev->array_sectors);
         revalidate_disk(gendisk);
@@ -713,6 +732,11 @@ static void raid_set_free(struct raid_set *rs)
  {
         int i;
  
+       if (rs->journal_dev.dev) {
+               md_rdev_clear(&rs->journal_dev.rdev);
+               dm_put_device(rs->ti, rs->journal_dev.dev);
+       }
+
         for (i = 0; i < rs->raid_disks; i++) {
                 if (rs->dev[i].meta_dev)
                         dm_put_device(rs->ti, rs->dev[i].meta_dev);
@@ -760,10 +784,11 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
                 rs->dev[i].data_dev = NULL;
  
                 /*
-                * There are no offsets, since there is a separate device
-                * for data and metadata.
+                * There are no offsets initially.
+                * Out of place reshape will set them accordingly.
                  */
                 rs->dev[i].rdev.data_offset = 0;
+               rs->dev[i].rdev.new_data_offset = 0;
                 rs->dev[i].rdev.mddev = &rs->md;
  
                 arg = dm_shift_arg(as);
@@ -821,6 +846,9 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
                         rebuild++;
         }
  
+       if (rs->journal_dev.dev)
+               list_add_tail(&rs->journal_dev.rdev.same_set, &rs->md.disks);
+
         if (metadata_available) {
                 rs->md.external = 0;
                 rs->md.persistent = 1;
@@ -1026,6 +1054,8 @@ too_many:
   *    [max_write_behind <sectors>]     See '-write-behind=' (man mdadm)
   *    [stripe_cache <sectors>]         Stripe cache size for higher RAIDs
   *    [region_size <sectors>]          Defines granularity of bitmap
+ *    [journal_dev <dev>]              raid4/5/6 journaling deviice
+ *                                     (i.e. write hole closing log)
   *
   * RAID10-only options:
   *    [raid10_copies <# copies>]       Number of copies.  (Default: 2)
@@ -1133,7 +1163,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
                 /*
                  * Parameters that take a string value are checked here.
                  */
-
+               /* "raid10_format {near|offset|far} */
                 if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
                         if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
                                 rs->ti->error = "Only one 'raid10_format' argument pair allowed";
@@ -1151,6 +1181,41 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
                         continue;
                 }
  
+               /* "journal_dev dev" */
+               if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
+                       int r;
+                       struct md_rdev *jdev;
+
+                       if (test_and_set_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+                               rs->ti->error = "Only one raid4/5/6 set journaling device allowed";
+                               return -EINVAL;
+                       }
+                       if (!rt_is_raid456(rt)) {
+                               rs->ti->error = "'journal_dev' is an invalid parameter for this RAID type";
+                               return -EINVAL;
+                       }
+                       r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
+                                         &rs->journal_dev.dev);
+                       if (r) {
+                               rs->ti->error = "raid4/5/6 journal device lookup failure";
+                               return r;
+                       }
+                       jdev = &rs->journal_dev.rdev;
+                       md_rdev_init(jdev);
+                       jdev->mddev = &rs->md;
+                       jdev->bdev = rs->journal_dev.dev->bdev;
+                       jdev->sectors = to_sector(i_size_read(jdev->bdev->bd_inode));
+                       if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) {
+                               rs->ti->error = "No space for raid4/5/6 journal";
+                               return -ENOSPC;
+                       }
+                       set_bit(Journal, &jdev->flags);
+                       continue;
+               }
+
+               /*
+                * Parameters with number values from here on.
+                */
                 if (kstrtoint(arg, 10, &value) < 0) {
                         rs->ti->error = "Bad numerical argument given in raid params";
                         return -EINVAL;
@@ -1436,7 +1501,8 @@ static sector_t __rdev_sectors(struct raid_set *rs)
         for (i = 0; i < rs->md.raid_disks; i++) {
                 struct md_rdev *rdev = &rs->dev[i].rdev;
  
-               if (rdev->bdev && rdev->sectors)
+               if (!test_bit(Journal, &rdev->flags) &&
+                   rdev->bdev && rdev->sectors)
                         return rdev->sectors;
         }
  
@@ -1486,7 +1552,8 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
                 array_sectors = (data_stripes + delta_disks) * dev_sectors;
  
         rdev_for_each(rdev, mddev)
-               rdev->sectors = dev_sectors;
+               if (!test_bit(Journal, &rdev->flags))
+                       rdev->sectors = dev_sectors;
  
         mddev->array_sectors = array_sectors;
         mddev->dev_sectors = dev_sectors;
@@ -2164,6 +2231,9 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
          */
         d = 0;
         rdev_for_each(r, mddev) {
+               if (test_bit(Journal, &rdev->flags))
+                       continue;
+
                 if (test_bit(FirstUse, &r->flags))
                         new_devs++;
  
@@ -2219,7 +2289,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
          */
         sb_retrieve_failed_devices(sb, failed_devices);
         rdev_for_each(r, mddev) {
-               if (!r->sb_page)
+               if (test_bit(Journal, &rdev->flags) ||
+                   !r->sb_page)
                         continue;
                 sb2 = page_address(r->sb_page);
                 sb2->failed_devices = 0;
@@ -2339,6 +2410,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
  
         freshest = NULL;
         rdev_for_each(rdev, mddev) {
+               if (test_bit(Journal, &rdev->flags))
+                       continue;
+
                 /*
                  * Skipping super_load due to CTR_FLAG_SYNC will cause
                  * the array to undergo initialization again as
@@ -2402,7 +2476,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
                 return -EINVAL;
  
         rdev_for_each(rdev, mddev)
-               if ((rdev != freshest) && super_validate(rs, rdev))
+               if (!test_bit(Journal, &rdev->flags) &&
+                   rdev != freshest &&
+                   super_validate(rs, rdev))
                         return -EINVAL;
         return 0;
  }
@@ -2489,10 +2565,12 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
                 return -ENOSPC;
         }
  out:
-       /* Adjust data offsets on all rdevs */
+       /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
         rdev_for_each(rdev, &rs->md) {
-               rdev->data_offset = data_offset;
-               rdev->new_data_offset = new_data_offset;
+               if (!test_bit(Journal, &rdev->flags)) {
+                       rdev->data_offset = data_offset;
+                       rdev->new_data_offset = new_data_offset;
+               }
         }
  
         return 0;
@@ -2505,8 +2583,10 @@ static void __reorder_raid_disk_indexes(struct raid_set *rs)
         struct md_rdev *rdev;
  
         rdev_for_each(rdev, &rs->md) {
-               rdev->raid_disk = i++;
-               rdev->saved_raid_disk = rdev->new_raid_disk = -1;
+               if (!test_bit(Journal, &rdev->flags)) {
+                       rdev->raid_disk = i++;
+                       rdev->saved_raid_disk = rdev->new_raid_disk = -1;
+               }
         }
  }
  
@@ -2903,6 +2983,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                         goto bad;
                 }
  
+               /* We can't takeover a journaled raid4/5/6 */
+               if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+                       ti->error = "Can't takeover a journaled raid4/5/6 set";
+                       r = -EPERM;
+                       goto bad;
+               }
+
                 /*
                  * If a takeover is needed, userspace sets any additional
                  * devices to rebuild and we can check for a valid request here.
@@ -2924,6 +3011,18 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                 rs_setup_recovery(rs, MaxSector);
                 rs_set_new(rs);
         } else if (rs_reshape_requested(rs)) {
+               /*
+                * No need to check for 'ongoing' takeover here, because takeover
+                * is an instant operation as oposed to an ongoing reshape.
+                */
+
+               /* We can't reshape a journaled raid4/5/6 */
+               if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+                       ti->error = "Can't reshape a journaled raid4/5/6 set";
+                       r = -EPERM;
+                       goto bad;
+               }
+
                 /*
                   * We can only prepare for a reshape here, because the
                   * raid set needs to run to provide the repective reshape
@@ -3072,13 +3171,13 @@ static const char *decipher_sync_action(struct mddev *mddev)
  }
  
  /*
- * Return status string @rdev
+ * Return status string for @rdev
   *
   * Status characters:
   *
- *  'D' = Dead/Failed device
+ *  'D' = Dead/Failed raid set component or raid4/5/6 journal device
   *  'a' = Alive but not in-sync
- *  'A' = Alive and in-sync
+ *  'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device
   *  '-' = Non-existing device (i.e. uspace passed '- -' into the ctr)
   */
  static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
@@ -3087,6 +3186,8 @@ static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
                 return "-";
         else if (test_bit(Faulty, &rdev->flags))
                 return "D";
+       else if (test_bit(Journal, &rdev->flags))
+               return "A";
         else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
                 return "a";
         else
@@ -3155,7 +3256,8 @@ static sector_t rs_get_progress(struct raid_set *rs,
                          * being initialized.
                          */
                         rdev_for_each(rdev, mddev)
-                               if (!test_bit(In_sync, &rdev->flags))
+                               if (!test_bit(Journal, &rdev->flags) &&
+                                   !test_bit(In_sync, &rdev->flags))
                                         *array_in_sync = true;
  #if 0
                         r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */
@@ -3255,6 +3357,12 @@ static void raid_status(struct dm_target *ti, status_type_t type,
                  * so retrieving it from the first raid disk is sufficient.
                  */
                 DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset);
+
+               /*
+                * v1.10.0+:
+                */
+               DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
+                             __raid_dev_status(&rs->journal_dev.rdev, 0) : "-");
                 break;
  
         case STATUSTYPE_TABLE:
@@ -3268,7 +3376,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
                 raid_param_cnt += rebuild_disks * 2 +
                                   write_mostly_params +
                                   hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
-                                 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
+                                 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
+                                 (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0);
                 /* Emit table line */
                 DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
                 if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
@@ -3315,6 +3424,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
                 if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
                         DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
                                          mddev->sync_speed_min);
+               if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
+                       DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
+                                       __get_dev_name(rs->journal_dev.dev));
                 DMEMIT(" %d", rs->raid_disks);
                 for (i = 0; i < rs->raid_disks; i++)
                         DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
@@ -3432,6 +3544,10 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
  
         for (i = 0; i < mddev->raid_disks; i++) {
                 r = &rs->dev[i].rdev;
+               /* HM FIXME: enhance journal device recovery processing */
+               if (test_bit(Journal, &r->flags))
+                       continue;
+
                 if (test_bit(Faulty, &r->flags) && r->sb_page &&
                     sync_page_io(r, 0, r->sb_size, r->sb_page,
                                  REQ_OP_READ, 0, true)) {
@@ -3480,6 +3596,9 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
                 uint64_t failed_devices[DISKS_ARRAY_ELEMS];
  
                 rdev_for_each(r, &rs->md) {
+                       if (test_bit(Journal, &r->flags))
+                               continue;
+
                         sb = page_address(r->sb_page);
                         sb_retrieve_failed_devices(sb, failed_devices);
  
@@ -3658,7 +3777,7 @@ static void raid_resume(struct dm_target *ti)
  
  static struct target_type raid_target = {
         .name = "raid",
-       .version = {1, 9, 2},
+       .version = {1, 10, 0},
         .module = THIS_MODULE,
         .ctr = raid_ctr,
         .dtr = raid_dtr,
author	Heinz Mauelshagen <heinzm@redhat.com>
	Wed, 30 Nov 2016 21:31:05 +0000 (22:31 +0100)
committer	Mike Snitzer <snitzer@redhat.com>
	Wed, 25 Jan 2017 11:49:06 +0000 (12:49 +0100)
Documentation/device-mapper/dm-raid.txt		patch \| blob \| history
drivers/md/dm-raid.c		patch \| blob \| history