dm raid1: fix EIO after log failure

author Jonathan Brassow <jbrassow@redhat.com>

Fri, 8 Feb 2008 02:11:35 +0000 (02:11 +0000)

committer Alasdair G Kergon <agk@redhat.com>

Fri, 8 Feb 2008 02:11:35 +0000 (02:11 +0000)
author Jonathan Brassow <jbrassow@redhat.com>
Fri, 8 Feb 2008 02:11:35 +0000 (02:11 +0000)
committer Alasdair G Kergon <agk@redhat.com>
Fri, 8 Feb 2008 02:11:35 +0000 (02:11 +0000)
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c

index 9978b9f07fe99db1e9d53dc2605c1d915f5d2206..ec6d675bf766ce036881b4daf1391bda0dc19895 100644 (file)
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -146,6 +146,7 @@ struct mirror_set {
         region_t nr_regions;
         int in_sync;
         int log_failure;
+       atomic_t suspend;
  
         atomic_t default_mirror;        /* Default mirror */
  
@@ -372,6 +373,16 @@ static void complete_resync_work(struct region *reg, int success)
         struct region_hash *rh = reg->rh;
  
         rh->log->type->set_region_sync(rh->log, reg->key, success);
+
+       /*
+        * Dispatch the bios before we call 'wake_up_all'.
+        * This is important because if we are suspending,
+        * we want to know that recovery is complete and
+        * the work queue is flushed.  If we wake_up_all
+        * before we dispatch_bios (queue bios and call wake()),
+        * then we risk suspending before the work queue
+        * has been properly flushed.
+        */
         dispatch_bios(rh->ms, &reg->delayed_bios);
         if (atomic_dec_and_test(&rh->recovery_in_flight))
                 wake_up_all(&_kmirrord_recovery_stopped);
@@ -1069,11 +1080,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
         /*
          * Dispatch io.
          */
-       if (unlikely(ms->log_failure))
+       if (unlikely(ms->log_failure)) {
+               spin_lock_irq(&ms->lock);
+               bio_list_merge(&ms->failures, &sync);
+               spin_unlock_irq(&ms->lock);
+       } else
                 while ((bio = bio_list_pop(&sync)))
-                       bio_endio(bio, -EIO);
-       else while ((bio = bio_list_pop(&sync)))
-               do_write(ms, bio);
+                       do_write(ms, bio);
  
         while ((bio = bio_list_pop(&recover)))
                 rh_delay(&ms->rh, bio);
@@ -1091,8 +1104,46 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
         if (!failures->head)
                 return;
  
-       while ((bio = bio_list_pop(failures)))
-               __bio_mark_nosync(ms, bio, bio->bi_size, 0);
+       if (!ms->log_failure) {
+               while ((bio = bio_list_pop(failures)))
+                       __bio_mark_nosync(ms, bio, bio->bi_size, 0);
+               return;
+       }
+
+       /*
+        * If the log has failed, unattempted writes are being
+        * put on the failures list.  We can't issue those writes
+        * until a log has been marked, so we must store them.
+        *
+        * If a 'noflush' suspend is in progress, we can requeue
+        * the I/O's to the core.  This gives userspace a chance
+        * to reconfigure the mirror, at which point the core
+        * will reissue the writes.  If the 'noflush' flag is
+        * not set, we have no choice but to return errors.
+        *
+        * Some writes on the failures list may have been
+        * submitted before the log failure and represent a
+        * failure to write to one of the devices.  It is ok
+        * for us to treat them the same and requeue them
+        * as well.
+        */
+       if (dm_noflush_suspending(ms->ti)) {
+               while ((bio = bio_list_pop(failures)))
+                       bio_endio(bio, DM_ENDIO_REQUEUE);
+               return;
+       }
+
+       if (atomic_read(&ms->suspend)) {
+               while ((bio = bio_list_pop(failures)))
+                       bio_endio(bio, -EIO);
+               return;
+       }
+
+       spin_lock_irq(&ms->lock);
+       bio_list_merge(&ms->failures, failures);
+       spin_unlock_irq(&ms->lock);
+
+       wake(ms);
  }
  
  static void trigger_event(struct work_struct *work)
@@ -1176,6 +1227,8 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
         ms->nr_mirrors = nr_mirrors;
         ms->nr_regions = dm_sector_div_up(ti->len, region_size);
         ms->in_sync = 0;
+       ms->log_failure = 0;
+       atomic_set(&ms->suspend, 0);
         atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
  
         ms->io_client = dm_io_client_create(DM_IO_PAGES);
@@ -1511,26 +1564,51 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
         return 0;
  }
  
-static void mirror_postsuspend(struct dm_target *ti)
+static void mirror_presuspend(struct dm_target *ti)
  {
         struct mirror_set *ms = (struct mirror_set *) ti->private;
         struct dirty_log *log = ms->rh.log;
  
+       atomic_set(&ms->suspend, 1);
+
+       /*
+        * We must finish up all the work that we've
+        * generated (i.e. recovery work).
+        */
         rh_stop_recovery(&ms->rh);
  
-       /* Wait for all I/O we generated to complete */
         wait_event(_kmirrord_recovery_stopped,
                    !atomic_read(&ms->rh.recovery_in_flight));
  
+       if (log->type->presuspend && log->type->presuspend(log))
+               /* FIXME: need better error handling */
+               DMWARN("log presuspend failed");
+
+       /*
+        * Now that recovery is complete/stopped and the
+        * delayed bios are queued, we need to wait for
+        * the worker thread to complete.  This way,
+        * we know that all of our I/O has been pushed.
+        */
+       flush_workqueue(ms->kmirrord_wq);
+}
+
+static void mirror_postsuspend(struct dm_target *ti)
+{
+       struct mirror_set *ms = ti->private;
+       struct dirty_log *log = ms->rh.log;
+
         if (log->type->postsuspend && log->type->postsuspend(log))
                 /* FIXME: need better error handling */
-               DMWARN("log suspend failed");
+               DMWARN("log postsuspend failed");
  }
  
  static void mirror_resume(struct dm_target *ti)
  {
-       struct mirror_set *ms = (struct mirror_set *) ti->private;
+       struct mirror_set *ms = ti->private;
         struct dirty_log *log = ms->rh.log;
+
+       atomic_set(&ms->suspend, 0);
         if (log->type->resume && log->type->resume(log))
                 /* FIXME: need better error handling */
                 DMWARN("log resume failed");
@@ -1564,7 +1642,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
                 DMEMIT("%d", ms->nr_mirrors);
                 for (m = 0; m < ms->nr_mirrors; m++)
                         DMEMIT(" %s %llu", ms->mirror[m].dev->name,
-                               (unsigned long long)ms->mirror[m].offset);
+                              (unsigned long long)ms->mirror[m].offset);
  
                 if (ms->features & DM_RAID1_HANDLE_ERRORS)
                         DMEMIT(" 1 handle_errors");
@@ -1581,6 +1659,7 @@ static struct target_type mirror_target = {
         .dtr     = mirror_dtr,
         .map     = mirror_map,
         .end_io  = mirror_end_io,
+       .presuspend = mirror_presuspend,
         .postsuspend = mirror_postsuspend,
         .resume  = mirror_resume,
         .status  = mirror_status,
author	Jonathan Brassow <jbrassow@redhat.com>
	Fri, 8 Feb 2008 02:11:35 +0000 (02:11 +0000)
committer	Alasdair G Kergon <agk@redhat.com>
	Fri, 8 Feb 2008 02:11:35 +0000 (02:11 +0000)