Btrfs: change core code of btrfs to support the device replace operations
authorStefan Behrens <sbehrens@giantdisaster.de>
Tue, 6 Nov 2012 12:15:27 +0000 (13:15 +0100)
committerJosef Bacik <jbacik@fusionio.com>
Wed, 12 Dec 2012 22:15:42 +0000 (17:15 -0500)
This commit contains all the essential changes to the core code
of Btrfs for support of the device replace procedure.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
fs/btrfs/disk-io.c
fs/btrfs/reada.c
fs/btrfs/scrub.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

index 0e410478ad27e9b896c1e67e832e0172fb375280..76b82506bf9299305537b4d1b021a3187c8bfbd6 100644 (file)
@@ -45,6 +45,7 @@
 #include "inode-map.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
+#include "dev-replace.h"
 
 #ifdef CONFIG_X86
 #include <asm/cpufeature.h>
@@ -2438,7 +2439,11 @@ int open_ctree(struct super_block *sb,
                goto fail_tree_roots;
        }
 
-       btrfs_close_extra_devices(fs_devices);
+       /*
+        * keep the device that is marked to be the target device for the
+        * dev_replace procedure
+        */
+       btrfs_close_extra_devices(fs_info, fs_devices, 0);
 
        if (!fs_devices->latest_bdev) {
                printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2510,6 +2515,14 @@ retry_root_backup:
                goto fail_block_groups;
        }
 
+       ret = btrfs_init_dev_replace(fs_info);
+       if (ret) {
+               pr_err("btrfs: failed to init dev_replace: %d\n", ret);
+               goto fail_block_groups;
+       }
+
+       btrfs_close_extra_devices(fs_info, fs_devices, 1);
+
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
                printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2658,6 +2671,13 @@ retry_root_backup:
                return ret;
        }
 
+       ret = btrfs_resume_dev_replace_async(fs_info);
+       if (ret) {
+               pr_warn("btrfs: failed to resume dev_replace\n");
+               close_ctree(tree_root);
+               return ret;
+       }
+
        return 0;
 
 fail_qgroup:
@@ -3300,6 +3320,8 @@ int close_ctree(struct btrfs_root *root)
        /* pause restriper - we want to resume on mount */
        btrfs_pause_balance(fs_info);
 
+       btrfs_dev_replace_suspend_for_unmount(fs_info);
+
        btrfs_scrub_cancel(fs_info);
 
        /* wait for any defraggers to finish */
index 9f363e17ec74de0dc69a6346b0a21b8858605073..c705a48e676b28d3b76eb7c34184daa051e8f2eb 100644 (file)
@@ -27,6 +27,7 @@
 #include "volumes.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "dev-replace.h"
 
 #undef DEBUG
 
@@ -331,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
        int nzones = 0;
        int i;
        unsigned long index = logical >> PAGE_CACHE_SHIFT;
+       int dev_replace_is_ongoing;
 
        spin_lock(&fs_info->reada_lock);
        re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -392,6 +394,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
        }
 
        /* insert extent in reada_tree + all per-device trees, all or nothing */
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
        spin_lock(&fs_info->reada_lock);
        ret = radix_tree_insert(&fs_info->reada_tree, index, re);
        if (ret == -EEXIST) {
@@ -399,13 +402,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
                BUG_ON(!re_exist);
                re_exist->refcnt++;
                spin_unlock(&fs_info->reada_lock);
+               btrfs_dev_replace_unlock(&fs_info->dev_replace);
                goto error;
        }
        if (ret) {
                spin_unlock(&fs_info->reada_lock);
+               btrfs_dev_replace_unlock(&fs_info->dev_replace);
                goto error;
        }
        prev_dev = NULL;
+       dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
+                       &fs_info->dev_replace);
        for (i = 0; i < nzones; ++i) {
                dev = bbio->stripes[i].dev;
                if (dev == prev_dev) {
@@ -422,6 +429,14 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
                        /* cannot read ahead on missing device */
                        continue;
                }
+               if (dev_replace_is_ongoing &&
+                   dev == fs_info->dev_replace.tgtdev) {
+                       /*
+                        * as this device is selected for reading only as
+                        * a last resort, skip it for read ahead.
+                        */
+                       continue;
+               }
                prev_dev = dev;
                ret = radix_tree_insert(&dev->reada_extents, index, re);
                if (ret) {
@@ -434,10 +449,12 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
                        BUG_ON(fs_info == NULL);
                        radix_tree_delete(&fs_info->reada_tree, index);
                        spin_unlock(&fs_info->reada_lock);
+                       btrfs_dev_replace_unlock(&fs_info->dev_replace);
                        goto error;
                }
        }
        spin_unlock(&fs_info->reada_lock);
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
 
        kfree(bbio);
        return re;
index 61157a26cf2a6774293d50cd0b53c75742080357..30cbf6921c0baa68aae9a4315f19bfcddb6ec568 100644 (file)
@@ -2843,12 +2843,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                return -EIO;
        }
 
-       if (dev->scrub_device) {
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
+       if (dev->scrub_device ||
+           (!is_dev_replace &&
+            btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
+               btrfs_dev_replace_unlock(&fs_info->dev_replace);
                mutex_unlock(&fs_info->scrub_lock);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                scrub_workers_put(fs_info);
                return -EINPROGRESS;
        }
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
        sctx = scrub_setup_ctx(dev, is_dev_replace);
        if (IS_ERR(sctx)) {
                mutex_unlock(&fs_info->scrub_lock);
index ad4380684b9bc3673b48605673d3c7b751d7691b..def4f24b58df3be603bbcc3d2b0a1d8f7e21dfe6 100644 (file)
@@ -55,6 +55,7 @@
 #include "export.h"
 #include "compression.h"
 #include "rcu-string.h"
+#include "dev-replace.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/btrfs.h>
@@ -1225,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                return 0;
 
        if (*flags & MS_RDONLY) {
+               /*
+                * this also happens on 'umount -rf' or on shutdown, when
+                * the filesystem is busy.
+                */
                sb->s_flags |= MS_RDONLY;
 
+               btrfs_dev_replace_suspend_for_unmount(fs_info);
+               btrfs_scrub_cancel(fs_info);
+
                ret = btrfs_commit_super(root);
                if (ret)
                        goto restore;
@@ -1263,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                if (ret)
                        goto restore;
 
+               ret = btrfs_resume_dev_replace_async(fs_info);
+               if (ret) {
+                       pr_warn("btrfs: failed to resume dev_replace\n");
+                       goto restore;
+               }
                sb->s_flags &= ~MS_RDONLY;
        }
 
index 7b297354e7383a6fa5d73dd46d4cc702c4343cbf..bcc6b65be3b03e22f4f4ffc34286a4861f34954a 100644 (file)
@@ -30,6 +30,7 @@
 #include "tree-log.h"
 #include "inode-map.h"
 #include "volumes.h"
+#include "dev-replace.h"
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
@@ -845,7 +846,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
                return ret;
 
        ret = btrfs_run_dev_stats(trans, root->fs_info);
-       BUG_ON(ret);
+       WARN_ON(ret);
+       ret = btrfs_run_dev_replace(trans, root->fs_info);
+       WARN_ON(ret);
 
        ret = btrfs_run_qgroups(trans, root->fs_info);
        BUG_ON(ret);
@@ -868,6 +871,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
        switch_commit_root(fs_info->extent_root);
        up_write(&fs_info->extent_commit_sem);
 
+       btrfs_after_dev_replace_commit(fs_info);
+
        return 0;
 }
 
index 5777e6a9aab17f3c0e9b2d5c620ed4817dd3dac6..a4e0963bf4573ea5bc0bfec40ce51502172e96f3 100644 (file)
@@ -36,6 +36,7 @@
 #include "check-integrity.h"
 #include "rcu-string.h"
 #include "math.h"
+#include "dev-replace.h"
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
@@ -505,7 +506,8 @@ error:
        return ERR_PTR(-ENOMEM);
 }
 
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+                              struct btrfs_fs_devices *fs_devices, int step)
 {
        struct btrfs_device *device, *next;
 
@@ -528,6 +530,21 @@ again:
                        continue;
                }
 
+               if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
+                       /*
+                        * In the first step, keep the device which has
+                        * the correct fsid and the devid that is used
+                        * for the dev_replace procedure.
+                        * In the second step, the dev_replace state is
+                        * read from the device tree and it is known
+                        * whether the procedure is really active or
+                        * not, which means whether this device is
+                        * used or whether it should be removed.
+                        */
+                       if (step == 0 || device->is_tgtdev_for_dev_replace) {
+                               continue;
+                       }
+               }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
@@ -536,7 +553,8 @@ again:
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
-                       fs_devices->rw_devices--;
+                       if (!device->is_tgtdev_for_dev_replace)
+                               fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
@@ -594,7 +612,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
                if (device->bdev)
                        fs_devices->open_devices--;
 
-               if (device->writeable) {
+               if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }
@@ -718,7 +736,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                        fs_devices->rotating = 1;
 
                fs_devices->open_devices++;
-               if (device->writeable) {
+               if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
@@ -1350,16 +1368,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                root->fs_info->avail_system_alloc_bits |
                root->fs_info->avail_metadata_alloc_bits;
 
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
-           root->fs_info->fs_devices->num_devices <= 4) {
+       num_devices = root->fs_info->fs_devices->num_devices;
+       btrfs_dev_replace_lock(&root->fs_info->dev_replace);
+       if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
+               WARN_ON(num_devices < 1);
+               num_devices--;
+       }
+       btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
+
+       if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
                printk(KERN_ERR "btrfs: unable to go below four devices "
                       "on raid10\n");
                ret = -EINVAL;
                goto out;
        }
 
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
-           root->fs_info->fs_devices->num_devices <= 2) {
+       if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
                printk(KERN_ERR "btrfs: unable to go below two "
                       "devices on raid1\n");
                ret = -EINVAL;
@@ -2935,6 +2959,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        u64 allowed;
        int mixed = 0;
        int ret;
+       u64 num_devices;
 
        if (btrfs_fs_closing(fs_info) ||
            atomic_read(&fs_info->balance_pause_req) ||
@@ -2963,10 +2988,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
                }
        }
 
+       num_devices = fs_info->fs_devices->num_devices;
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
+       if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+               BUG_ON(num_devices < 1);
+               num_devices--;
+       }
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
        allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-       if (fs_info->fs_devices->num_devices == 1)
+       if (num_devices == 1)
                allowed |= BTRFS_BLOCK_GROUP_DUP;
-       else if (fs_info->fs_devices->num_devices < 4)
+       else if (num_devices < 4)
                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
        else
                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
@@ -3591,6 +3623,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                devices_info[ndevs].total_avail = total_avail;
                devices_info[ndevs].dev = device;
                ++ndevs;
+               WARN_ON(ndevs > fs_devices->rw_devices);
        }
 
        /*
@@ -4773,6 +4806,7 @@ static void fill_device_from_item(struct extent_buffer *leaf,
        device->io_align = btrfs_device_io_align(leaf, dev_item);
        device->io_width = btrfs_device_io_width(leaf, dev_item);
        device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+       WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
        device->is_tgtdev_for_dev_replace = 0;
 
        ptr = (unsigned long)btrfs_device_uuid(dev_item);
index 58d79375deaf02a6caaa9a04ceccb48c7b358a03..37d0157167b02291bf7921ec537fd7f6b115e131 100644 (file)
@@ -268,7 +268,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+                              struct btrfs_fs_devices *fs_devices, int step);
 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
                                         char *device_path,
                                         struct btrfs_device **device);