Btrfs: fix barrier flushes
authorChris Mason <chris.mason@oracle.com>
Fri, 18 Nov 2011 20:07:51 +0000 (15:07 -0500)
committerChris Mason <chris.mason@oracle.com>
Sun, 20 Nov 2011 12:21:14 +0000 (07:21 -0500)
When btrfs is writing the super blocks, it send barrier flushes to make
sure writeback caching drives get all the metadata on disk in the
right order.

But, we have two bugs in the way these are sent down.  When doing
full commits (not via the tree log), we are sending the barrier down
before the last super when it should be going down before the first.

In multi-device setups, we should be waiting for the barriers to
complete on all devices before writing any of the supers.

Both of these bugs can cause corruptions on power failures.  We fix it
with some new code to send down empty barriers to all devices before
writing the first super.

Alexandre Oliva found the multi-device bug.  Arne Jansen did the async
barrier loop.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Reported-by: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
fs/btrfs/disk-io.c
fs/btrfs/volumes.h

index b6a5c0dd0dd8eac7e3e05731773c39771aa9e87a..48d30138237fe7d62add429a9f4ecdaff4ee123e 100644 (file)
@@ -2573,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device,
        int errors = 0;
        u32 crc;
        u64 bytenr;
-       int last_barrier = 0;
 
        if (max_mirrors == 0)
                max_mirrors = BTRFS_SUPER_MIRROR_MAX;
 
-       /* make sure only the last submit_bh does a barrier */
-       if (do_barriers) {
-               for (i = 0; i < max_mirrors; i++) {
-                       bytenr = btrfs_sb_offset(i);
-                       if (bytenr + BTRFS_SUPER_INFO_SIZE >=
-                           device->total_bytes)
-                               break;
-                       last_barrier = i;
-               }
-       }
-
        for (i = 0; i < max_mirrors; i++) {
                bytenr = btrfs_sb_offset(i);
                if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2634,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device,
                        bh->b_end_io = btrfs_end_buffer_write_sync;
                }
 
-               if (i == last_barrier && do_barriers)
-                       ret = submit_bh(WRITE_FLUSH_FUA, bh);
-               else
-                       ret = submit_bh(WRITE_SYNC, bh);
-
+               /*
+                * we fua the first super.  The others we allow
+                * to go down lazy.
+                */
+               ret = submit_bh(WRITE_FUA, bh);
                if (ret)
                        errors++;
        }
        return errors < i ? 0 : -1;
 }
 
+/*
+ * endio for the write_dev_flush, this will wake anyone waiting
+ * for the barrier when it is done
+ */
+static void btrfs_end_empty_barrier(struct bio *bio, int err)
+{
+       if (err) {
+               if (err == -EOPNOTSUPP)
+                       set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+               clear_bit(BIO_UPTODATE, &bio->bi_flags);
+       }
+       if (bio->bi_private)
+               complete(bio->bi_private);
+       bio_put(bio);
+}
+
+/*
+ * trigger flushes for one the devices.  If you pass wait == 0, the flushes are
+ * sent down.  With wait == 1, it waits for the previous flush.
+ *
+ * any device where the flush fails with eopnotsupp are flagged as not-barrier
+ * capable
+ */
+static int write_dev_flush(struct btrfs_device *device, int wait)
+{
+       struct bio *bio;
+       int ret = 0;
+
+       if (device->nobarriers)
+               return 0;
+
+       if (wait) {
+               bio = device->flush_bio;
+               if (!bio)
+                       return 0;
+
+               wait_for_completion(&device->flush_wait);
+
+               if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+                       printk("btrfs: disabling barriers on dev %s\n",
+                              device->name);
+                       device->nobarriers = 1;
+               }
+               if (!bio_flagged(bio, BIO_UPTODATE)) {
+                       ret = -EIO;
+               }
+
+               /* drop the reference from the wait == 0 run */
+               bio_put(bio);
+               device->flush_bio = NULL;
+
+               return ret;
+       }
+
+       /*
+        * one reference for us, and we leave it for the
+        * caller
+        */
+       device->flush_bio = NULL;;
+       bio = bio_alloc(GFP_NOFS, 0);
+       if (!bio)
+               return -ENOMEM;
+
+       bio->bi_end_io = btrfs_end_empty_barrier;
+       bio->bi_bdev = device->bdev;
+       init_completion(&device->flush_wait);
+       bio->bi_private = &device->flush_wait;
+       device->flush_bio = bio;
+
+       bio_get(bio);
+       submit_bio(WRITE_FLUSH, bio);
+
+       return 0;
+}
+
+/*
+ * send an empty flush down to each device in parallel,
+ * then wait for them
+ */
+static int barrier_all_devices(struct btrfs_fs_info *info)
+{
+       struct list_head *head;
+       struct btrfs_device *dev;
+       int errors = 0;
+       int ret;
+
+       /* send down all the barriers */
+       head = &info->fs_devices->devices;
+       list_for_each_entry_rcu(dev, head, dev_list) {
+               if (!dev->bdev) {
+                       errors++;
+                       continue;
+               }
+               if (!dev->in_fs_metadata || !dev->writeable)
+                       continue;
+
+               ret = write_dev_flush(dev, 0);
+               if (ret)
+                       errors++;
+       }
+
+       /* wait for all the barriers */
+       list_for_each_entry_rcu(dev, head, dev_list) {
+               if (!dev->bdev) {
+                       errors++;
+                       continue;
+               }
+               if (!dev->in_fs_metadata || !dev->writeable)
+                       continue;
+
+               ret = write_dev_flush(dev, 1);
+               if (ret)
+                       errors++;
+       }
+       if (errors)
+               return -EIO;
+       return 0;
+}
+
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
        struct list_head *head;
@@ -2666,6 +2773,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        head = &root->fs_info->fs_devices->devices;
+
+       if (do_barriers)
+               barrier_all_devices(root->fs_info);
+
        list_for_each_entry_rcu(dev, head, dev_list) {
                if (!dev->bdev) {
                        total_errors++;
index ab5b1c49f3529e9e7e112649b98f22d0a923fb35..78f2d4d4f37fe81317395688a8b090b71e53a612 100644 (file)
@@ -100,6 +100,12 @@ struct btrfs_device {
        struct reada_zone *reada_curr_zone;
        struct radix_tree_root reada_zones;
        struct radix_tree_root reada_extents;
+
+       /* for sending down flush barriers */
+       struct bio *flush_bio;
+       struct completion flush_wait;
+       int nobarriers;
+
 };
 
 struct btrfs_fs_devices {