btrfs: convert snapshot/nocow exclusion to drew lock
author Nikolay Borisov <nborisov@suse.com>
Thu, 30 Jan 2020 12:59:45 +0000 (14:59 +0200)
committer David Sterba <dsterba@suse.com>
Mon, 23 Mar 2020 16:01:44 +0000 (17:01 +0100)
This patch removes all haphazard code implementing nocow writers
exclusion from pending snapshot creation and switches to using the drew
lock to ensure this invariant still holds.

'Readers' are snapshot creators from create_snapshot and 'writers' are
nocow writers from buffered write path or btrfs_setsize. This locking
scheme allows for multiple snapshots to happen while any nocow writers
are blocked, since writes to page cache in the nocow path will make
snapshots inconsistent.

For performance reasons we'd like to have the ability to run multiple
concurrent snapshots, so the lock favors readers in this case. And in case
there aren't pending snapshots (which will be the majority of the cases)
we rely on the percpu writers counter to avoid cacheline contention.

The main gain from using the drew lock is it's now a lot easier to
reason about the guarantees of the locking scheme and whether there is
some silent breakage lurking.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c

index ab8151247b93162cce6141181285fec77642fbb8..db9e872bcc79591886585814657fb6f2bacfa2cd 100644 (file)
@@ -957,11 +957,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
        return sb->s_fs_info;
 }
 
-struct btrfs_subvolume_writers {
-       struct percpu_counter   counter;
-       wait_queue_head_t       wait;
-};
-
 /*
  * The state of btrfs root
  */
@@ -1133,8 +1128,9 @@ struct btrfs_root {
         * root_item_lock.
         */
        int dedupe_in_progress;
-       struct btrfs_subvolume_writers *subv_writers;
-       atomic_t will_be_snapshotted;
+       /* For exclusion of snapshot creation and nocow writes */
+       struct btrfs_drew_lock snapshot_lock;
+
        atomic_t snapshot_force_cow;
 
        /* For qgroup metadata reserved space */
index 770d469e1d9c74323437074c542a950db09bd36a..06819c41e4f4712b59f383694aa13803c481ac42 100644 (file)
@@ -1104,32 +1104,6 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)
        }
 }
 
-static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
-{
-       struct btrfs_subvolume_writers *writers;
-       int ret;
-
-       writers = kmalloc(sizeof(*writers), GFP_NOFS);
-       if (!writers)
-               return ERR_PTR(-ENOMEM);
-
-       ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
-       if (ret < 0) {
-               kfree(writers);
-               return ERR_PTR(ret);
-       }
-
-       init_waitqueue_head(&writers->wait);
-       return writers;
-}
-
-static void
-btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
-{
-       percpu_counter_destroy(&writers->counter);
-       kfree(writers);
-}
-
 static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
                         u64 objectid)
 {
@@ -1178,7 +1152,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
        atomic_set(&root->log_writers, 0);
        atomic_set(&root->log_batch, 0);
        refcount_set(&root->refs, 1);
-       atomic_set(&root->will_be_snapshotted, 0);
        atomic_set(&root->snapshot_force_cow, 0);
        atomic_set(&root->nr_swapfiles, 0);
        root->log_transid = 0;
@@ -1450,7 +1423,7 @@ alloc_fail:
 static int btrfs_init_fs_root(struct btrfs_root *root)
 {
        int ret;
-       struct btrfs_subvolume_writers *writers;
+       unsigned int nofs_flag;
 
        root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
        root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1460,12 +1433,15 @@ static int btrfs_init_fs_root(struct btrfs_root *root)
                goto fail;
        }
 
-       writers = btrfs_alloc_subvolume_writers();
-       if (IS_ERR(writers)) {
-               ret = PTR_ERR(writers);
+       /*
+        * We might be called under a transaction (e.g. indirect backref
+        * resolution) which could deadlock if it triggers memory reclaim
+        */
+       nofs_flag = memalloc_nofs_save();
+       ret = btrfs_drew_lock_init(&root->snapshot_lock);
+       memalloc_nofs_restore(nofs_flag);
+       if (ret)
                goto fail;
-       }
-       root->subv_writers = writers;
 
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
                set_bit(BTRFS_ROOT_REF_COWS, &root->state);
@@ -3961,8 +3937,7 @@ void btrfs_free_fs_root(struct btrfs_root *root)
        WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
        if (root->anon_dev)
                free_anon_bdev(root->anon_dev);
-       if (root->subv_writers)
-               btrfs_free_subvolume_writers(root->subv_writers);
+       btrfs_drew_lock_destroy(&root->snapshot_lock);
        free_extent_buffer(root->node);
        free_extent_buffer(root->commit_root);
        kfree(root->free_ino_ctl);
index 7eef91d6c2b67a8faf71c0e30cbcd0c3b003f278..9dcd70cc3ca32ca95ae548d1f63b74fc3b575c80 100644 (file)
@@ -5740,47 +5740,3 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
                return bg_ret;
        return dev_ret;
 }
-
-/*
- * btrfs_{start,end}_write_no_snapshotting() are similar to
- * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
- * data into the page cache through nocow before the subvolume is snapshoted,
- * but flush the data into disk after the snapshot creation, or to prevent
- * operations while snapshotting is ongoing and that cause the snapshot to be
- * inconsistent (writes followed by expanding truncates for example).
- */
-void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
-{
-       percpu_counter_dec(&root->subv_writers->counter);
-       cond_wake_up(&root->subv_writers->wait);
-}
-
-int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
-{
-       if (atomic_read(&root->will_be_snapshotted))
-               return 0;
-
-       percpu_counter_inc(&root->subv_writers->counter);
-       /*
-        * Make sure counter is updated before we check for snapshot creation.
-        */
-       smp_mb();
-       if (atomic_read(&root->will_be_snapshotted)) {
-               btrfs_end_write_no_snapshotting(root);
-               return 0;
-       }
-       return 1;
-}
-
-void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
-{
-       while (true) {
-               int ret;
-
-               ret = btrfs_start_write_no_snapshotting(root);
-               if (ret)
-                       break;
-               wait_var_event(&root->will_be_snapshotted,
-                              !atomic_read(&root->will_be_snapshotted));
-       }
-}
index fd52ad00b6c8f58d031eddc928086865b8aa98d2..8a974a82be519e3d954a6f939ebc38888d4c65d7 100644 (file)
@@ -1553,8 +1553,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
        u64 num_bytes;
        int ret;
 
-       ret = btrfs_start_write_no_snapshotting(root);
-       if (!ret)
+       if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
                return -EAGAIN;
 
        lockstart = round_down(pos, fs_info->sectorsize);
@@ -1569,7 +1568,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
                        NULL, NULL, NULL);
        if (ret <= 0) {
                ret = 0;
-               btrfs_end_write_no_snapshotting(root);
+               btrfs_drew_write_unlock(&root->snapshot_lock);
        } else {
                *write_bytes = min_t(size_t, *write_bytes ,
                                     num_bytes - pos + lockstart);
@@ -1675,7 +1674,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
                                                data_reserved, pos,
                                                write_bytes);
                        else
-                               btrfs_end_write_no_snapshotting(root);
+                               btrfs_drew_write_unlock(&root->snapshot_lock);
                        break;
                }
 
@@ -1779,7 +1778,7 @@ again:
 
                release_bytes = 0;
                if (only_release_metadata)
-                       btrfs_end_write_no_snapshotting(root);
+                       btrfs_drew_write_unlock(&root->snapshot_lock);
 
                if (only_release_metadata && copied > 0) {
                        lockstart = round_down(pos,
@@ -1808,7 +1807,7 @@ again:
 
        if (release_bytes) {
                if (only_release_metadata) {
-                       btrfs_end_write_no_snapshotting(root);
+                       btrfs_drew_write_unlock(&root->snapshot_lock);
                        btrfs_delalloc_release_metadata(BTRFS_I(inode),
                                        release_bytes, true);
                } else {
index 254f5ea17e40ef819c124bea92f3bc51afec74c2..1e138c83cc6e7246c54dd74d956f43a52e8aca5d 100644 (file)
@@ -4726,16 +4726,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
                 * truncation, it must capture all writes that happened before
                 * this truncation.
                 */
-               btrfs_wait_for_snapshot_creation(root);
+               btrfs_drew_write_lock(&root->snapshot_lock);
                ret = btrfs_cont_expand(inode, oldsize, newsize);
                if (ret) {
-                       btrfs_end_write_no_snapshotting(root);
+                       btrfs_drew_write_unlock(&root->snapshot_lock);
                        return ret;
                }
 
                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans)) {
-                       btrfs_end_write_no_snapshotting(root);
+                       btrfs_drew_write_unlock(&root->snapshot_lock);
                        return PTR_ERR(trans);
                }
 
@@ -4743,7 +4743,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
                btrfs_inode_safe_disk_i_size_write(inode, 0);
                pagecache_isize_extended(inode, oldsize, newsize);
                ret = btrfs_update_inode(trans, root, inode);
-               btrfs_end_write_no_snapshotting(root);
+               btrfs_drew_write_unlock(&root->snapshot_lock);
                btrfs_end_transaction(trans);
        } else {
 
index 56bd3ea7fb67fe7f4c5809aaf9c38405ac8bf464..6ded5e346821b5be8987cc1a03dc67cc5bf3f3c2 100644 (file)
@@ -791,11 +791,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
         * possible. This is to avoid later writeback (running dealloc) to
         * fallback to COW mode and unexpectedly fail with ENOSPC.
         */
-       atomic_inc(&root->will_be_snapshotted);
-       smp_mb__after_atomic();
-       /* wait for no snapshot writes */
-       wait_event(root->subv_writers->wait,
-                  percpu_counter_sum(&root->subv_writers->counter) == 0);
+       btrfs_drew_read_lock(&root->snapshot_lock);
 
        ret = btrfs_start_delalloc_snapshot(root);
        if (ret)
@@ -876,8 +872,8 @@ fail:
 dec_and_free:
        if (snapshot_force_cow)
                atomic_dec(&root->snapshot_force_cow);
-       if (atomic_dec_and_test(&root->will_be_snapshotted))
-               wake_up_var(&root->will_be_snapshotted);
+       btrfs_drew_read_unlock(&root->snapshot_lock);
+
 free_pending:
        kfree(pending_snapshot->root_item);
        btrfs_free_path(pending_snapshot->path);