md/raid5: activate raid6 rmw feature

author Markus Stockhausen <stockhausen@collogia.de>

Mon, 15 Dec 2014 01:57:05 +0000 (12:57 +1100)

committer NeilBrown <neilb@suse.de>

Tue, 21 Apr 2015 22:00:42 +0000 (08:00 +1000)
author Markus Stockhausen <stockhausen@collogia.de>
Mon, 15 Dec 2014 01:57:05 +0000 (12:57 +1100)
committer NeilBrown <neilb@suse.de>
Tue, 21 Apr 2015 22:00:42 +0000 (08:00 +1000)
diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c

index d05327caf69dbc18532478b122ea9834e67f1fd9..5d355e0c263339b5bd179ad61aad63c9b7efb3a3 100644 (file)
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -124,6 +124,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
  {
         void **srcs;
         int i;
+       int start = -1, stop = disks - 3;
  
         if (submit->scribble)
                 srcs = submit->scribble;
@@ -134,10 +135,21 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
                 if (blocks[i] == NULL) {
                         BUG_ON(i > disks - 3); /* P or Q can't be zero */
                         srcs[i] = (void*)raid6_empty_zero_page;
-               } else
+               } else {
                         srcs[i] = page_address(blocks[i]) + offset;
+                       if (i < disks - 2) {
+                               stop = i;
+                               if (start == -1)
+                                       start = i;
+                       }
+               }
         }
-       raid6_call.gen_syndrome(disks, len, srcs);
+       if (submit->flags & ASYNC_TX_PQ_XOR_DST) {
+               BUG_ON(!raid6_call.xor_syndrome);
+               if (start >= 0)
+                       raid6_call.xor_syndrome(disks, start, stop, len, srcs);
+       } else
+               raid6_call.gen_syndrome(disks, len, srcs);
         async_tx_sync_epilog(submit);
  }
  
@@ -178,7 +190,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
         if (device)
                 unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO);
  
-       if (unmap &&
+       /* XORing P/Q is only implemented in software */
+       if (unmap && !(submit->flags & ASYNC_TX_PQ_XOR_DST) &&
             (src_cnt <= dma_maxpq(device, 0) ||
              dma_maxpq(device, DMA_PREP_CONTINUE) > 0) &&
             is_dma_pq_aligned(device, offset, 0, len)) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index 3ae097d50b515c2e9a0ae003a7a2c58f4dba1ca4..c82ce1fd8723d070fdfee347ccee274928b021ab 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1317,7 +1317,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
   * destination buffer is recorded in srcs[count] and the Q destination
   * is recorded in srcs[count+1]].
   */
-static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
+static int set_syndrome_sources(struct page **srcs,
+                               struct stripe_head *sh,
+                               int srctype)
  {
         int disks = sh->disks;
         int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
@@ -1332,8 +1334,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
         i = d0_idx;
         do {
                 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+               struct r5dev *dev = &sh->dev[i];
  
-               srcs[slot] = sh->dev[i].page;
+               if (i == sh->qd_idx || i == sh->pd_idx ||
+                   (srctype == SYNDROME_SRC_ALL) ||
+                   (srctype == SYNDROME_SRC_WANT_DRAIN &&
+                    test_bit(R5_Wantdrain, &dev->flags)) ||
+                   (srctype == SYNDROME_SRC_WRITTEN &&
+                    dev->written))
+                       srcs[slot] = sh->dev[i].page;
                 i = raid6_next_disk(i, disks);
         } while (i != d0_idx);
  
@@ -1373,7 +1382,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
         atomic_inc(&sh->count);
  
         if (target == qd_idx) {
-               count = set_syndrome_sources(blocks, sh);
+               count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
                 blocks[count] = NULL; /* regenerating p is not necessary */
                 BUG_ON(blocks[count+1] != dest); /* q should already be set */
                 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
@@ -1481,7 +1490,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
                         tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
                                        &submit);
  
-                       count = set_syndrome_sources(blocks, sh);
+                       count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
                         init_async_submit(&submit, ASYNC_TX_FENCE, tx,
                                           ops_complete_compute, sh,
                                           to_addr_conv(sh, percpu, 0));
@@ -1515,8 +1524,8 @@ static void ops_complete_prexor(void *stripe_head_ref)
  }
  
  static struct dma_async_tx_descriptor *
-ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
-              struct dma_async_tx_descriptor *tx)
+ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
+               struct dma_async_tx_descriptor *tx)
  {
         int disks = sh->disks;
         struct page **xor_srcs = to_addr_page(percpu, 0);
@@ -1544,6 +1553,26 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
         return tx;
  }
  
+static struct dma_async_tx_descriptor * /* RAID6 prexor step of a read-modify-write */
+ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
+               struct dma_async_tx_descriptor *tx)
+{
+       struct page **blocks = to_addr_page(percpu, 0);
+       int count; /* number of source slots; P sits at blocks[count], Q at blocks[count+1] */
+       struct async_submit_ctl submit;
+       /* Invoked by raid_run_ops() for STRIPE_OP_PREXOR when level is 6 or above. */
+       pr_debug("%s: stripe %llu\n", __func__,
+               (unsigned long long)sh->sector);
+       /* Select only P, Q and the devices flagged R5_Wantdrain as sources. */
+       count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
+       /* PQ_XOR_DST: XOR the computed syndrome into P/Q instead of overwriting them. */
+       init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
+                         ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
+       tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit); /* +2 covers the P and Q slots */
+       /* Hand the descriptor back so the caller can chain the next operation on it. */
+       return tx;
+}
+
  static struct dma_async_tx_descriptor *
  ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
  {
@@ -1746,6 +1775,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
         int count, i, j = 0;
         struct stripe_head *head_sh = sh;
         int last_stripe;
+       int synflags;
+       unsigned long txflags;
  
         pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
  
@@ -1765,14 +1796,23 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
  
  again:
         blocks = to_addr_page(percpu, j);
-       count = set_syndrome_sources(blocks, sh);
+
+       if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+               synflags = SYNDROME_SRC_WRITTEN;
+               txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
+       } else {
+               synflags = SYNDROME_SRC_ALL;
+               txflags = ASYNC_TX_ACK;
+       }
+
+       count = set_syndrome_sources(blocks, sh, synflags);
         last_stripe = !head_sh->batch_head ||
                 list_first_entry(&sh->batch_list,
                                  struct stripe_head, batch_list) == head_sh;
  
         if (last_stripe) {
                 atomic_inc(&head_sh->count);
-               init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+               init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
                                   head_sh, to_addr_conv(sh, percpu, j));
         } else
                 init_async_submit(&submit, 0, tx, NULL, NULL,
@@ -1843,7 +1883,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
                 (unsigned long long)sh->sector, checkp);
  
         BUG_ON(sh->batch_head);
-       count = set_syndrome_sources(srcs, sh);
+       count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
         if (!checkp)
                 srcs[count] = NULL;
  
@@ -1884,8 +1924,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
                         async_tx_ack(tx);
         }
  
-       if (test_bit(STRIPE_OP_PREXOR, &ops_request))
-               tx = ops_run_prexor(sh, percpu, tx);
+       if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
+               if (level < 6)
+                       tx = ops_run_prexor5(sh, percpu, tx);
+               else
+                       tx = ops_run_prexor6(sh, percpu, tx);
+       }
  
         if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
                 tx = ops_run_biodrain(sh, tx);
@@ -2770,7 +2814,7 @@ static void
  schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                          int rcw, int expand)
  {
-       int i, pd_idx = sh->pd_idx, disks = sh->disks;
+       int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
         struct r5conf *conf = sh->raid_conf;
         int level = conf->level;
  
@@ -2806,13 +2850,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                         if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
                                 atomic_inc(&conf->pending_full_writes);
         } else {
-               BUG_ON(level == 6);
                 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
                         test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
+               BUG_ON(level == 6 &&
+                       (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
+                          test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
  
                 for (i = disks; i--; ) {
                         struct r5dev *dev = &sh->dev[i];
-                       if (i == pd_idx)
+                       if (i == pd_idx || i == qd_idx)
                                 continue;
  
                         if (dev->towrite &&
@@ -3476,28 +3522,27 @@ static void handle_stripe_dirtying(struct r5conf *conf,
         int rmw = 0, rcw = 0, i;
         sector_t recovery_cp = conf->mddev->recovery_cp;
  
-       /* RAID6 requires 'rcw' in current implementation.
-        * Otherwise, check whether resync is now happening or should start.
+       /* Check whether resync is now happening or should start.
          * If yes, then the array is dirty (after unclean shutdown or
          * initial creation), so parity in some stripes might be inconsistent.
          * In this case, we need to always do reconstruct-write, to ensure
          * that in case of drive failure or read-error correction, we
          * generate correct data from the parity.
          */
-       if (conf->max_degraded == 2 ||
+       if (conf->rmw_level == PARITY_DISABLE_RMW ||
             (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
              s->failed == 0)) {
                 /* Calculate the real rcw later - for now make it
                  * look like rcw is cheaper
                  */
                 rcw = 1; rmw = 2;
-               pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
-                        conf->max_degraded, (unsigned long long)recovery_cp,
+               pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
+                        conf->rmw_level, (unsigned long long)recovery_cp,
                          (unsigned long long)sh->sector);
         } else for (i = disks; i--; ) {
                 /* would I have to read this buffer for read_modify_write */
                 struct r5dev *dev = &sh->dev[i];
-               if ((dev->towrite || i == sh->pd_idx) &&
+               if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
                     !test_bit(R5_LOCKED, &dev->flags) &&
                     !(test_bit(R5_UPTODATE, &dev->flags) ||
                       test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3507,7 +3552,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                                 rmw += 2*disks;  /* cannot read it */
                 }
                 /* Would I have to read this buffer for reconstruct_write */
-               if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+               if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+                   i != sh->pd_idx && i != sh->qd_idx &&
                     !test_bit(R5_LOCKED, &dev->flags) &&
                     !(test_bit(R5_UPTODATE, &dev->flags) ||
                     test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3520,7 +3566,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
         pr_debug("for sector %llu, rmw=%d rcw=%d\n",
                 (unsigned long long)sh->sector, rmw, rcw);
         set_bit(STRIPE_HANDLE, &sh->state);
-       if (rmw < rcw && rmw > 0) {
+       if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
                 /* prefer read-modify-write, but need to get some data */
                 if (conf->mddev->queue)
                         blk_add_trace_msg(conf->mddev->queue,
@@ -3528,7 +3574,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                                           (unsigned long long)sh->sector, rmw);
                 for (i = disks; i--; ) {
                         struct r5dev *dev = &sh->dev[i];
-                       if ((dev->towrite || i == sh->pd_idx) &&
+                       if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
                             !test_bit(R5_LOCKED, &dev->flags) &&
                             !(test_bit(R5_UPTODATE, &dev->flags) ||
                             test_bit(R5_Wantcompute, &dev->flags)) &&
@@ -3547,7 +3593,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                         }
                 }
         }
-       if (rcw <= rmw && rcw > 0) {
+       if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
                 /* want reconstruct write, but need to get some data */
                 int qread =0;
                 rcw = 0;
@@ -6344,10 +6390,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
         }
  
         conf->level = mddev->new_level;
-       if (conf->level == 6)
+       if (conf->level == 6) {
                 conf->max_degraded = 2;
-       else
+               if (raid6_call.xor_syndrome)
+                       conf->rmw_level = PARITY_ENABLE_RMW;
+               else
+                       conf->rmw_level = PARITY_DISABLE_RMW;
+       } else {
                 conf->max_degraded = 1;
+               conf->rmw_level = PARITY_ENABLE_RMW;
+       }
         conf->algorithm = mddev->new_layout;
         conf->reshape_progress = mddev->reshape_position;
         if (conf->reshape_progress != MaxSector) {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h

index ee65ed844d3f32545d1637822acad5d0a744aadf..57fef9ba36fa7087483c77f5366d8b53df5e2794 100644 (file)
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -355,6 +355,23 @@ enum {
         STRIPE_OP_RECONSTRUCT,
         STRIPE_OP_CHECK,
  };
+
+/*
+ * RAID parity calculation preferences
+ */
+enum {
+       PARITY_DISABLE_RMW = 0, /* handle_stripe_dirtying() always forces reconstruct-write */
+       PARITY_ENABLE_RMW, /* read-modify-write allowed; chosen when rmw and rcw costs tie */
+};
+
+/*
+ * Pages requested from set_syndrome_sources()
+ */
+enum {
+       SYNDROME_SRC_ALL, /* use the page of every device */
+       SYNDROME_SRC_WANT_DRAIN, /* P, Q and devices with R5_Wantdrain set */
+       SYNDROME_SRC_WRITTEN, /* P, Q and devices whose dev->written is non-NULL */
+};
  /*
   * Plugging:
   *
@@ -411,7 +428,7 @@ struct r5conf {
         spinlock_t              hash_locks[NR_STRIPE_HASH_LOCKS];
         struct mddev            *mddev;
         int                     chunk_sectors;
-       int                     level, algorithm;
+       int                     level, algorithm, rmw_level;
         int                     max_degraded;
         int                     raid_disks;
         int                     max_nr_stripes;
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h

index 179b38ffd351323c89c91e90f5ef370e04ebddc7..388574ea38ed9d4aa1d644442672ad883feaaef3 100644 (file)
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -60,12 +60,15 @@ struct dma_chan_ref {
   * dependency chain
   * @ASYNC_TX_FENCE: specify that the next operation in the dependency
   * chain uses this operation's result as an input
+ * @ASYNC_TX_PQ_XOR_DST: do not overwrite the syndrome but XOR it with the
+ * input data. Required for rmw case.
   */
  enum async_tx_flags {
         ASYNC_TX_XOR_ZERO_DST    = (1 << 0),
         ASYNC_TX_XOR_DROP_DST    = (1 << 1),
         ASYNC_TX_ACK             = (1 << 2),
         ASYNC_TX_FENCE           = (1 << 3),
+       ASYNC_TX_PQ_XOR_DST      = (1 << 4), /* software only: async_gen_syndrome() skips DMA offload when set */
  };
  
  /**
author	Markus Stockhausen <stockhausen@collogia.de>
	Mon, 15 Dec 2014 01:57:05 +0000 (12:57 +1100)
committer	NeilBrown <neilb@suse.de>
	Tue, 21 Apr 2015 22:00:42 +0000 (08:00 +1000)
crypto/async_tx/async_pq.c		patch | blob | history
drivers/md/raid5.c		patch | blob | history
drivers/md/raid5.h		patch | blob | history
include/linux/async_tx.h		patch | blob | history