Btrfs: introduce per-subvolume delalloc inode list
author Miao Xie <miaox@cn.fujitsu.com>
Wed, 15 May 2013 07:48:22 +0000 (07:48 +0000)
committer Josef Bacik <jbacik@fusionio.com>
Fri, 14 Jun 2013 15:29:40 +0000 (11:29 -0400)
When we create a snapshot, we needn't flush all the delalloc inodes in the
fs; just flushing the delalloc inodes in the source tree is enough. So we
introduce a per-subvolume delalloc inode list.
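
For reference only (not part of the patch): the sketch below is a minimal
userspace model of the bookkeeping this change introduces. The helper names
add_delalloc_inode()/del_delalloc_inode() and the trimmed structs are
illustrative stand-ins for btrfs_add_delalloc_inodes()/btrfs_del_delalloc_inode();
the spinlocks, the BTRFS_INODE_IN_DELALLOC_LIST runtime flag and the
igrab()/iput() handling are omitted. Each subvolume root keeps its own
delalloc inode list, and a root sits on the fs-wide delalloc_roots list only
while that per-root list is non-empty.

    #include <assert.h>
    #include <stdio.h>

    /* tiny stand-in for the kernel's struct list_head */
    struct list_head { struct list_head *prev, *next; };

    static void list_init(struct list_head *h) { h->prev = h->next = h; }
    static int list_empty(const struct list_head *h) { return h->next == h; }
    static void list_add_tail(struct list_head *e, struct list_head *h)
    {
            e->prev = h->prev; e->next = h;
            h->prev->next = e; h->prev = e;
    }
    static void list_del_init(struct list_head *e)
    {
            e->prev->next = e->next; e->next->prev = e->prev;
            list_init(e);
    }

    struct fs_info { struct list_head delalloc_roots; };

    struct root {
            struct fs_info *fs_info;
            struct list_head delalloc_inodes; /* delalloc inodes of this subvolume */
            struct list_head delalloc_root;   /* linkage on fs_info->delalloc_roots */
            unsigned long nr_delalloc_inodes;
    };

    struct inode {
            struct list_head delalloc_inodes; /* linkage on root->delalloc_inodes */
    };

    /* the first delalloc inode of a root also puts the root on the fs-wide list */
    static void add_delalloc_inode(struct root *root, struct inode *inode)
    {
            if (!list_empty(&inode->delalloc_inodes))
                    return; /* already tracked */
            list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
            if (++root->nr_delalloc_inodes == 1)
                    list_add_tail(&root->delalloc_root,
                                  &root->fs_info->delalloc_roots);
    }

    /* removing the last delalloc inode takes the root off the fs-wide list */
    static void del_delalloc_inode(struct root *root, struct inode *inode)
    {
            if (list_empty(&inode->delalloc_inodes))
                    return; /* not tracked */
            list_del_init(&inode->delalloc_inodes);
            if (--root->nr_delalloc_inodes == 0)
                    list_del_init(&root->delalloc_root);
    }

    int main(void)
    {
            struct fs_info fs;
            struct root r = { .fs_info = &fs };
            struct inode a, b;

            list_init(&fs.delalloc_roots);
            list_init(&r.delalloc_inodes);
            list_init(&r.delalloc_root);
            list_init(&a.delalloc_inodes);
            list_init(&b.delalloc_inodes);

            add_delalloc_inode(&r, &a);
            add_delalloc_inode(&r, &b);
            assert(!list_empty(&fs.delalloc_roots)); /* root visible fs-wide */

            del_delalloc_inode(&r, &a);
            del_delalloc_inode(&r, &b);
            assert(list_empty(&fs.delalloc_roots));  /* root dropped when empty */

            printf("per-subvolume delalloc bookkeeping ok\n");
            return 0;
    }

This split makes it possible to flush only the delalloc inodes of a single
subvolume (e.g. a snapshot source), while callers that really do want
everything flushed (device replace, relocation, transaction commit) now
iterate delalloc_roots via btrfs_start_all_delalloc_inodes() and flush each
root in turn.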

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
fs/btrfs/ctree.h
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/inode.c
fs/btrfs/relocation.c
fs/btrfs/transaction.c

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 91a8ca7af77ed65725168e20e597843bc58f6ce2..43c0735339402289e46c5068c78520c78efe14ce 100644
@@ -1449,13 +1449,9 @@ struct btrfs_fs_info {
         */
        struct list_head ordered_extents;
 
-       spinlock_t delalloc_lock;
-       /*
-        * all of the inodes that have delalloc bytes.  It is possible for
-        * this list to be empty even when there is still dirty data=ordered
-        * extents waiting to finish IO.
-        */
-       struct list_head delalloc_inodes;
+       spinlock_t delalloc_root_lock;
+       /* all fs/file tree roots that have delalloc inodes. */
+       struct list_head delalloc_roots;
 
        /*
         * there is a pool of worker threads for checksumming during writes
@@ -1747,6 +1743,16 @@ struct btrfs_root {
 
        spinlock_t root_item_lock;
        atomic_t refs;
+
+       spinlock_t delalloc_lock;
+       /*
+        * all of the inodes that have delalloc bytes.  It is possible for
+        * this list to be empty even when there is still dirty data=ordered
+        * extents waiting to finish IO.
+        */
+       struct list_head delalloc_inodes;
+       struct list_head delalloc_root;
+       u64 nr_delalloc_inodes;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -3550,6 +3556,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+                                   int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
                              struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 65241f32d3f8aec282a80d443c7acc677f68726a..2af312b6fb1fb24da5679c1bc2732dd0accfe969 100644
@@ -470,7 +470,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
         * flush all outstanding I/O and inode extent mappings before the
         * copy operation is declared as being finished
         */
-       ret = btrfs_start_delalloc_inodes(root, 0);
+       ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
        if (ret) {
                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
                return ret;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 90b643e07f3caa8d9146019bc13f8a1988a1d066..2748c7ccdd5154194207b7af0dda5b15fd782226 100644
@@ -1191,6 +1191,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->objectid = objectid;
        root->last_trans = 0;
        root->highest_objectid = 0;
+       root->nr_delalloc_inodes = 0;
        root->name = NULL;
        root->inode_tree = RB_ROOT;
        INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
@@ -1199,10 +1200,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 
        INIT_LIST_HEAD(&root->dirty_list);
        INIT_LIST_HEAD(&root->root_list);
+       INIT_LIST_HEAD(&root->delalloc_inodes);
+       INIT_LIST_HEAD(&root->delalloc_root);
        INIT_LIST_HEAD(&root->logged_list[0]);
        INIT_LIST_HEAD(&root->logged_list[1]);
        spin_lock_init(&root->orphan_lock);
        spin_lock_init(&root->inode_lock);
+       spin_lock_init(&root->delalloc_lock);
        spin_lock_init(&root->accounting_lock);
        spin_lock_init(&root->log_extents_lock[0]);
        spin_lock_init(&root->log_extents_lock[1]);
@@ -2140,9 +2144,9 @@ int open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->trans_list);
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->delayed_iputs);
-       INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+       INIT_LIST_HEAD(&fs_info->delalloc_roots);
        INIT_LIST_HEAD(&fs_info->caching_block_groups);
-       spin_lock_init(&fs_info->delalloc_lock);
+       spin_lock_init(&fs_info->delalloc_root_lock);
        spin_lock_init(&fs_info->trans_lock);
        spin_lock_init(&fs_info->fs_roots_radix_lock);
        spin_lock_init(&fs_info->delayed_iput_lock);
@@ -3803,24 +3807,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
 
        INIT_LIST_HEAD(&splice);
 
-       spin_lock(&root->fs_info->delalloc_lock);
-       list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+       spin_lock(&root->delalloc_lock);
+       list_splice_init(&root->delalloc_inodes, &splice);
 
        while (!list_empty(&splice)) {
-               btrfs_inode = list_entry(splice.next, struct btrfs_inode,
-                                   delalloc_inodes);
+               btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
+                                              delalloc_inodes);
 
                list_del_init(&btrfs_inode->delalloc_inodes);
                clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
                          &btrfs_inode->runtime_flags);
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_unlock(&root->delalloc_lock);
 
                btrfs_invalidate_inodes(btrfs_inode->root);
 
-               spin_lock(&root->fs_info->delalloc_lock);
+               spin_lock(&root->delalloc_lock);
        }
 
-       spin_unlock(&root->fs_info->delalloc_lock);
+       spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *root;
+       struct list_head splice;
+
+       INIT_LIST_HEAD(&splice);
+
+       spin_lock(&fs_info->delalloc_root_lock);
+       list_splice_init(&fs_info->delalloc_roots, &splice);
+       while (!list_empty(&splice)) {
+               root = list_first_entry(&splice, struct btrfs_root,
+                                        delalloc_root);
+               list_del_init(&root->delalloc_root);
+               root = btrfs_grab_fs_root(root);
+               BUG_ON(!root);
+               spin_unlock(&fs_info->delalloc_root_lock);
+
+               btrfs_destroy_delalloc_inodes(root);
+               btrfs_put_fs_root(root);
+
+               spin_lock(&fs_info->delalloc_root_lock);
+       }
+       spin_unlock(&fs_info->delalloc_root_lock);
 }
 
 static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3974,7 +4003,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
                btrfs_destroy_delayed_inodes(root);
                btrfs_assert_delayed_root_empty(root);
 
-               btrfs_destroy_delalloc_inodes(root);
+               btrfs_destroy_all_delalloc_inodes(root->fs_info);
 
                spin_lock(&root->fs_info->trans_lock);
                root->fs_info->running_transaction = NULL;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 04066c2cc7116ec8345b0ddf6e6f61539272a478..f8ff06834e796bfcb1ed63654224c0ae6ecac28f 100644
@@ -3899,7 +3899,7 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
                 * the filesystem is readonly(all dirty pages are written to
                 * the disk).
                 */
-               btrfs_start_delalloc_inodes(root, 0);
+               btrfs_start_all_delalloc_inodes(root->fs_info, 0);
                if (!current->journal_info)
                        btrfs_wait_ordered_extents(root, 0);
        }
@@ -5030,14 +5030,14 @@ static int update_block_group(struct btrfs_root *root,
        int factor;
 
        /* block accounting for super block */
-       spin_lock(&info->delalloc_lock);
+       spin_lock(&info->delalloc_root_lock);
        old_val = btrfs_super_bytes_used(info->super_copy);
        if (alloc)
                old_val += num_bytes;
        else
                old_val -= num_bytes;
        btrfs_set_super_bytes_used(info->super_copy, old_val);
-       spin_unlock(&info->delalloc_lock);
+       spin_unlock(&info->delalloc_root_lock);
 
        while (total) {
                cache = btrfs_lookup_block_group(info, bytenr);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3817c1e49035f82314a644db7b0f7bec9fce5939..18191f193b47b7f6bf60591baf2eed1081357517 100644
@@ -1528,6 +1528,46 @@ static void btrfs_merge_extent_hook(struct inode *inode,
        spin_unlock(&BTRFS_I(inode)->lock);
 }
 
+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+                                     struct inode *inode)
+{
+       spin_lock(&root->delalloc_lock);
+       if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+               list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+                             &root->delalloc_inodes);
+               set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                       &BTRFS_I(inode)->runtime_flags);
+               root->nr_delalloc_inodes++;
+               if (root->nr_delalloc_inodes == 1) {
+                       spin_lock(&root->fs_info->delalloc_root_lock);
+                       BUG_ON(!list_empty(&root->delalloc_root));
+                       list_add_tail(&root->delalloc_root,
+                                     &root->fs_info->delalloc_roots);
+                       spin_unlock(&root->fs_info->delalloc_root_lock);
+               }
+       }
+       spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+                                    struct inode *inode)
+{
+       spin_lock(&root->delalloc_lock);
+       if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+               list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+               clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                         &BTRFS_I(inode)->runtime_flags);
+               root->nr_delalloc_inodes--;
+               if (!root->nr_delalloc_inodes) {
+                       spin_lock(&root->fs_info->delalloc_root_lock);
+                       BUG_ON(list_empty(&root->delalloc_root));
+                       list_del_init(&root->delalloc_root);
+                       spin_unlock(&root->fs_info->delalloc_root_lock);
+               }
+       }
+       spin_unlock(&root->delalloc_lock);
+}
+
 /*
  * extent_io.c set_bit_hook, used to track delayed allocation
  * bytes in this file, and to maintain the list of inodes that
@@ -1560,16 +1600,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
                spin_lock(&BTRFS_I(inode)->lock);
                BTRFS_I(inode)->delalloc_bytes += len;
                if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                        &BTRFS_I(inode)->runtime_flags)) {
-                       spin_lock(&root->fs_info->delalloc_lock);
-                       if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-                               list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
-                                             &root->fs_info->delalloc_inodes);
-                               set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                       &BTRFS_I(inode)->runtime_flags);
-                       }
-                       spin_unlock(&root->fs_info->delalloc_lock);
-               }
+                                        &BTRFS_I(inode)->runtime_flags))
+                       btrfs_add_delalloc_inodes(root, inode);
                spin_unlock(&BTRFS_I(inode)->lock);
        }
 }
@@ -1612,15 +1644,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                BTRFS_I(inode)->delalloc_bytes -= len;
                if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
                    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                            &BTRFS_I(inode)->runtime_flags)) {
-                       spin_lock(&root->fs_info->delalloc_lock);
-                       if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-                               list_del_init(&BTRFS_I(inode)->delalloc_inodes);
-                               clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                         &BTRFS_I(inode)->runtime_flags);
-                       }
-                       spin_unlock(&root->fs_info->delalloc_lock);
-               }
+                            &BTRFS_I(inode)->runtime_flags))
+                       btrfs_del_delalloc_inode(root, inode);
                spin_unlock(&BTRFS_I(inode)->lock);
        }
 }
@@ -8338,7 +8363,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
        struct btrfs_inode *binode;
        struct inode *inode;
@@ -8347,30 +8372,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
        struct list_head splice;
        int ret = 0;
 
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
-
        INIT_LIST_HEAD(&works);
        INIT_LIST_HEAD(&splice);
 
-       spin_lock(&root->fs_info->delalloc_lock);
-       list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+       spin_lock(&root->delalloc_lock);
+       list_splice_init(&root->delalloc_inodes, &splice);
        while (!list_empty(&splice)) {
                binode = list_entry(splice.next, struct btrfs_inode,
                                    delalloc_inodes);
 
-               list_del_init(&binode->delalloc_inodes);
-
+               list_move_tail(&binode->delalloc_inodes,
+                              &root->delalloc_inodes);
                inode = igrab(&binode->vfs_inode);
                if (!inode) {
-                       clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                 &binode->runtime_flags);
+                       cond_resched_lock(&root->delalloc_lock);
                        continue;
                }
-
-               list_add_tail(&binode->delalloc_inodes,
-                             &root->fs_info->delalloc_inodes);
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_unlock(&root->delalloc_lock);
 
                work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
                if (unlikely(!work)) {
@@ -8382,16 +8400,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
                                   &work->work);
 
                cond_resched();
-               spin_lock(&root->fs_info->delalloc_lock);
+               spin_lock(&root->delalloc_lock);
        }
-       spin_unlock(&root->fs_info->delalloc_lock);
+       spin_unlock(&root->delalloc_lock);
 
        list_for_each_entry_safe(work, next, &works, list) {
                list_del_init(&work->list);
                btrfs_wait_and_free_delalloc_work(work);
        }
+       return 0;
+out:
+       list_for_each_entry_safe(work, next, &works, list) {
+               list_del_init(&work->list);
+               btrfs_wait_and_free_delalloc_work(work);
+       }
+
+       if (!list_empty_careful(&splice)) {
+               spin_lock(&root->delalloc_lock);
+               list_splice_tail(&splice, &root->delalloc_inodes);
+               spin_unlock(&root->delalloc_lock);
+       }
+       return ret;
+}
 
-       /* the filemap_flush will queue IO into the worker threads, but
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+       int ret;
+
+       if (root->fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       ret = __start_delalloc_inodes(root, delay_iput);
+       /*
+        * the filemap_flush will queue IO into the worker threads, but
         * we have to make sure the IO is actually started and that
         * ordered extents get created before we return
         */
@@ -8403,17 +8444,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
                    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
        }
        atomic_dec(&root->fs_info->async_submit_draining);
-       return 0;
-out:
-       list_for_each_entry_safe(work, next, &works, list) {
-               list_del_init(&work->list);
-               btrfs_wait_and_free_delalloc_work(work);
+       return ret;
+}
+
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+                                   int delay_iput)
+{
+       struct btrfs_root *root;
+       struct list_head splice;
+       int ret;
+
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       INIT_LIST_HEAD(&splice);
+
+       spin_lock(&fs_info->delalloc_root_lock);
+       list_splice_init(&fs_info->delalloc_roots, &splice);
+       while (!list_empty(&splice)) {
+               root = list_first_entry(&splice, struct btrfs_root,
+                                       delalloc_root);
+               root = btrfs_grab_fs_root(root);
+               BUG_ON(!root);
+               list_move_tail(&root->delalloc_root,
+                              &fs_info->delalloc_roots);
+               spin_unlock(&fs_info->delalloc_root_lock);
+
+               ret = __start_delalloc_inodes(root, delay_iput);
+               btrfs_put_fs_root(root);
+               if (ret)
+                       goto out;
+
+               spin_lock(&fs_info->delalloc_root_lock);
        }
+       spin_unlock(&fs_info->delalloc_root_lock);
 
+       atomic_inc(&fs_info->async_submit_draining);
+       while (atomic_read(&fs_info->nr_async_submits) ||
+             atomic_read(&fs_info->async_delalloc_pages)) {
+               wait_event(fs_info->async_submit_wait,
+                  (atomic_read(&fs_info->nr_async_submits) == 0 &&
+                   atomic_read(&fs_info->async_delalloc_pages) == 0));
+       }
+       atomic_dec(&fs_info->async_submit_draining);
+       return 0;
+out:
        if (!list_empty_careful(&splice)) {
-               spin_lock(&root->fs_info->delalloc_lock);
-               list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_lock(&fs_info->delalloc_root_lock);
+               list_splice_tail(&splice, &fs_info->delalloc_roots);
+               spin_unlock(&fs_info->delalloc_root_lock);
        }
        return ret;
 }
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f46b4cca4fa2ccd8e92ea3d4f788a5a1996e285a..f6e1b54f05d8849d63dc4eaf7cd44b84cec77596 100644
@@ -4159,7 +4159,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
               (unsigned long long)rc->block_group->key.objectid,
               (unsigned long long)rc->block_group->flags);
 
-       ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+       ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
        if (ret < 0) {
                err = ret;
                goto out;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f157752efc473ffeedfcecaa31e5e8760634c378..4b6311181412362074a16fcd10effee037039753 100644
@@ -1502,7 +1502,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
        }
 
        if (flush_on_commit || snap_pending) {
-               ret = btrfs_start_delalloc_inodes(root, 1);
+               ret = btrfs_start_all_delalloc_inodes(root->fs_info, 1);
                if (ret)
                        return ret;
                btrfs_wait_ordered_extents(root, 1);