Btrfs: introduce free_extent_buffer_stale
author    Josef Bacik <josef@redhat.com>
          Fri, 9 Mar 2012 21:01:49 +0000 (16:01 -0500)
committer Chris Mason <chris.mason@oracle.com>
          Mon, 26 Mar 2012 20:51:08 +0000 (16:51 -0400)
Because btrfs copies on write, we can end up with extent buffers that are no
longer needed just sitting around in memory.  Rather than these stale pages
being evicted first, reclaim can end up evicting pages we actually care
about.  So introduce free_extent_buffer_stale for use when we are freeing
tree blocks.  It drops the reference the eb holds for being in the radix
tree as soon as possible, so the eb is freed when its refcount hits 0
instead of waiting to be released by releasepage.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
fs/btrfs/ctree.c
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
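
The change builds on a two-reference model: an extent buffer cached in the
radix tree holds one reference on behalf of the tree itself (recorded by the
new EXTENT_BUFFER_TREE_REF bit), on top of whatever references active users
hold.  free_extent_buffer_stale marks the buffer stale and strips the tree's
reference as soon as the last user drops theirs.  Below is a userspace
sketch of that lifecycle using C11 atomics; the struct and helper names are
invented for illustration, and it omits the under-I/O checks and the
eb->refs_lock serialization the real code needs.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define EB_TREE_REF	(1u << 0)	/* the cache holds a reference */
#define EB_STALE	(1u << 1)	/* block was freed by COW */

struct eb {
	atomic_int  refs;		/* user refs + possibly the tree ref */
	atomic_uint flags;
};

static struct eb *eb_alloc(void)
{
	struct eb *eb = calloc(1, sizeof(*eb));

	atomic_init(&eb->refs, 2);	/* one user ref + one tree ref */
	atomic_init(&eb->flags, EB_TREE_REF);
	return eb;
}

static void eb_put(struct eb *eb)
{
	/* A stale buffer whose only other holder is the cache: steal the
	 * cache's reference too, so the buffer dies now instead of
	 * lingering until reclaim gets around to releasepage. */
	if ((atomic_load(&eb->flags) & EB_STALE) &&
	    atomic_load(&eb->refs) == 2 &&
	    (atomic_fetch_and(&eb->flags, ~EB_TREE_REF) & EB_TREE_REF))
		atomic_fetch_sub(&eb->refs, 1);

	if (atomic_fetch_sub(&eb->refs, 1) == 1) {
		printf("eb freed immediately\n");
		free(eb);
	}
}

static void eb_put_stale(struct eb *eb)
{
	atomic_fetch_or(&eb->flags, EB_STALE);
	eb_put(eb);
}

int main(void)
{
	eb_put_stale(eb_alloc());	/* frees right away */
	return 0;
}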

index 0639a555e16ed1975702ed5509dc9bc1c4dbf490..74c03fb0ca1dd91f0723c54acb18645b1153318e 100644 (file)
@@ -156,10 +156,23 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 {
        struct extent_buffer *eb;
 
-       rcu_read_lock();
-       eb = rcu_dereference(root->node);
-       extent_buffer_get(eb);
-       rcu_read_unlock();
+       while (1) {
+               rcu_read_lock();
+               eb = rcu_dereference(root->node);
+
+               /*
+                * RCU really hurts here: we could free up the root node because
+                * it was COWed but we may not get the new root node yet, so do
+                * the inc_not_zero dance, and if it doesn't work then
+                * synchronize_rcu() and try again.
+                */
+               if (atomic_inc_not_zero(&eb->refs)) {
+                       rcu_read_unlock();
+                       break;
+               }
+               rcu_read_unlock();
+               synchronize_rcu();
+       }
        return eb;
 }
 
@@ -504,7 +517,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
-       free_extent_buffer(buf);
+       free_extent_buffer_stale(buf);
        btrfs_mark_buffer_dirty(cow);
        *cow_ret = cow;
        return 0;
@@ -959,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                root_sub_used(root, mid->len);
                btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
                /* once for the root ptr */
-               free_extent_buffer(mid);
+               free_extent_buffer_stale(mid);
                return 0;
        }
        if (btrfs_header_nritems(mid) >
@@ -1016,7 +1029,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                                ret = wret;
                        root_sub_used(root, right->len);
                        btrfs_free_tree_block(trans, root, right, 0, 1, 0);
-                       free_extent_buffer(right);
+                       free_extent_buffer_stale(right);
                        right = NULL;
                } else {
                        struct btrfs_disk_key right_key;
@@ -1056,7 +1069,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                        ret = wret;
                root_sub_used(root, mid->len);
                btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
-               free_extent_buffer(mid);
+               free_extent_buffer_stale(mid);
                mid = NULL;
        } else {
                /* update the parent key to reflect our changes */
@@ -3781,7 +3794,9 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 
        root_sub_used(root, leaf->len);
 
+       extent_buffer_get(leaf);
        btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
+       free_extent_buffer_stale(leaf);
        return 0;
 }
 /*
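
The btrfs_root_node() loop above closes a race: between rcu_dereference() of
root->node and taking a reference, a concurrent COW can drop the last
reference and free the buffer, so a plain extent_buffer_get() could revive a
dying object.  atomic_inc_not_zero() only takes the reference while the
count is still positive; on failure the reader waits out the grace period
and retries.  A userspace approximation of that primitive with a C11
compare-and-swap loop (the helper name is invented):

#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference only if the object is still live (refcount > 0),
 * mirroring the kernel's atomic_inc_not_zero() used above. */
static bool ref_get_not_zero(atomic_int *refs)
{
	int old = atomic_load(refs);

	while (old != 0) {
		/* On failure, 'old' is reloaded with the current value. */
		if (atomic_compare_exchange_weak(refs, &old, old + 1))
			return true;
	}
	return false;
}
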
index bc88649cffb7bf98fc1c0f9085731ec51e4e9667..0ba055e03eb84bb5065d1ff0c4a6abee41d8bb21 100644 (file)
@@ -923,16 +923,8 @@ static int btree_readpage(struct file *file, struct page *page)
 
 static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 {
-       struct extent_map_tree *map;
-       struct extent_io_tree *tree;
-       int ret;
-
        if (PageWriteback(page) || PageDirty(page))
                return 0;
-
-       tree = &BTRFS_I(page->mapping->host)->io_tree;
-       map = &BTRFS_I(page->mapping->host)->extent_tree;
-
        /*
         * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
         * slab allocation from alloc_extent_state down the callchain where
@@ -940,11 +932,7 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
         */
        gfp_flags &= ~GFP_SLAB_BUG_MASK;
 
-       ret = try_release_extent_state(map, tree, page, gfp_flags);
-       if (!ret)
-               return 0;
-
-       return try_release_extent_buffer(tree, page);
+       return try_release_extent_buffer(page, gfp_flags);
 }
 
 static void btree_invalidatepage(struct page *page, unsigned long offset)
index 9b7e7682fda0aa3a363d760c7c613c378627a1cb..1b831ac4c0798c8b181a1ae891ab436e79d19961 100644 (file)
@@ -5018,10 +5018,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                if (is_data) {
                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
                        BUG_ON(ret);
-               } else {
-                       invalidate_mapping_pages(info->btree_inode->i_mapping,
-                            bytenr >> PAGE_CACHE_SHIFT,
-                            (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
                }
 
                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
@@ -6022,6 +6018,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
        btrfs_tree_lock(buf);
        clean_tree_block(trans, root, buf);
+       clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
        btrfs_set_lock_blocking(buf);
        btrfs_set_buffer_uptodate(buf);
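
Clearing EXTENT_BUFFER_STALE in btrfs_init_new_buffer() matters because a
tree block freed in a transaction can be reallocated at the same bytenr
while its stale eb is still cached; without the clear, the next
free_extent_buffer() on the now-live buffer would wrongly strip the cache's
reference.  In terms of the earlier sketch (hypothetical names):

/* Continuing the earlier sketch: a cached-but-stale buffer that gets
 * reused for a freshly allocated block must be revived, or eb_put()
 * would treat the live buffer as dying. */
static void eb_init_new(struct eb *eb)
{
	atomic_fetch_and(&eb->flags, ~EB_STALE);
}
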
index 0f74262911bee77941708c8709e3a70e0e9f6dc8..0ce14369920cbada24c2b7c90384e15e8babf693 100644 (file)
@@ -3607,6 +3607,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
        list_add(&eb->leak_list, &buffers);
        spin_unlock_irqrestore(&leak_lock, flags);
 #endif
+       spin_lock_init(&eb->refs_lock);
        atomic_set(&eb->refs, 1);
        atomic_set(&eb->pages_reading, 0);
 
@@ -3654,6 +3655,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
                         */
                        if (PagePrivate(page) &&
                            page->private == (unsigned long)eb) {
+                               BUG_ON(PageDirty(page));
+                               BUG_ON(PageWriteback(page));
                                /*
                                 * We need to make sure we haven't been attached
                                 * to a new eb.
@@ -3763,7 +3766,6 @@ again:
                if (!atomic_inc_not_zero(&exists->refs)) {
                        spin_unlock(&tree->buffer_lock);
                        radix_tree_preload_end();
-                       synchronize_rcu();
                        exists = NULL;
                        goto again;
                }
@@ -3772,7 +3774,10 @@ again:
                goto free_eb;
        }
        /* add one reference for the tree */
+       spin_lock(&eb->refs_lock);
        atomic_inc(&eb->refs);
+       set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags);
+       spin_unlock(&eb->refs_lock);
        spin_unlock(&tree->buffer_lock);
        radix_tree_preload_end();
 
@@ -3823,15 +3828,143 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
        return NULL;
 }
 
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+       struct extent_buffer *eb =
+                       container_of(head, struct extent_buffer, rcu_head);
+
+       __free_extent_buffer(eb);
+}
+
+static int extent_buffer_under_io(struct extent_buffer *eb,
+                                 struct page *locked_page)
+{
+       unsigned long num_pages, i;
+
+       num_pages = num_extent_pages(eb->start, eb->len);
+       for (i = 0; i < num_pages; i++) {
+               struct page *page = eb->pages[i];
+               int need_unlock = 0;
+
+               if (!page)
+                       continue;
+
+               if (page != locked_page) {
+                       if (!trylock_page(page))
+                               return 1;
+                       need_unlock = 1;
+               }
+
+               if (PageDirty(page) || PageWriteback(page)) {
+                       if (need_unlock)
+                               unlock_page(page);
+                       return 1;
+               }
+               if (need_unlock)
+                       unlock_page(page);
+       }
+
+       return 0;
+}
+
+/* Expects to have eb->eb_lock already held */
+static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
+{
+       WARN_ON(atomic_read(&eb->refs) == 0);
+       if (atomic_dec_and_test(&eb->refs)) {
+               struct extent_io_tree *tree = eb->tree;
+               int ret;
+
+               spin_unlock(&eb->refs_lock);
+
+               might_sleep_if(mask & __GFP_WAIT);
+               ret = clear_extent_bit(tree, eb->start,
+                                      eb->start + eb->len - 1, -1, 0, 0,
+                                      NULL, mask);
+               if (ret < 0) {
+                       unsigned long num_pages, i;
+
+                       num_pages = num_extent_pages(eb->start, eb->len);
+                       /*
+                        * We failed to clear the state bits, which likely means
+                        * ENOMEM, so just re-up the eb ref and continue; the eb
+                        * will get freed later on via releasepage or something
+                        * else and will be ok.
+                        */
+                       spin_lock(&eb->tree->mapping->private_lock);
+                       spin_lock(&eb->refs_lock);
+                       set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags);
+                       atomic_inc(&eb->refs);
+
+                       /*
+                        * We may have started to reclaim the pages for a newly
+                        * allocated eb, make sure we own all of them again.
+                        */
+                       for (i = 0; i < num_pages; i++) {
+                               struct page *page = eb->pages[i];
+
+                               if (!page) {
+                                       WARN_ON(1);
+                                       continue;
+                               }
+
+                               BUG_ON(!PagePrivate(page));
+                               if (page->private != (unsigned long)eb) {
+                                       ClearPagePrivate(page);
+                                       page_cache_release(page);
+                                       attach_extent_buffer_page(eb, page);
+                               }
+                       }
+                       spin_unlock(&eb->refs_lock);
+                       spin_unlock(&eb->tree->mapping->private_lock);
+                       return;
+               }
+
+               spin_lock(&tree->buffer_lock);
+               radix_tree_delete(&tree->buffer,
+                                 eb->start >> PAGE_CACHE_SHIFT);
+               spin_unlock(&tree->buffer_lock);
+
+               /* Should be safe to release our pages at this point */
+               btrfs_release_extent_buffer_page(eb, 0);
+
+               call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+               return;
+       }
+       spin_unlock(&eb->refs_lock);
+}
+
 void free_extent_buffer(struct extent_buffer *eb)
 {
        if (!eb)
                return;
 
-       if (!atomic_dec_and_test(&eb->refs))
+       spin_lock(&eb->refs_lock);
+       if (atomic_read(&eb->refs) == 2 &&
+           test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
+           !extent_buffer_under_io(eb, NULL) &&
+           test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+               atomic_dec(&eb->refs);
+
+       /*
+        * I know this is terrible, but it's temporary until we stop tracking
+        * the uptodate bits and such for the extent buffers.
+        */
+       release_extent_buffer(eb, GFP_ATOMIC);
+}
+
+void free_extent_buffer_stale(struct extent_buffer *eb)
+{
+       if (!eb)
                return;
 
-       WARN_ON(1);
+       spin_lock(&eb->refs_lock);
+       set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
+
+       if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb, NULL) &&
+           test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+               atomic_dec(&eb->refs);
+       release_extent_buffer(eb, GFP_NOFS);
 }
 
 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
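
Note that extent_buffer_under_io() above uses trylock_page() rather than
lock_page(): its callers hold spinlocks (free_extent_buffer() calls it under
eb->refs_lock, and try_release_extent_buffer() additionally under
mapping->private_lock), so it must not sleep; a contended page lock is
simply reported as "busy".  A self-contained pthread analogue of that
non-blocking scan (all names invented):

#include <pthread.h>
#include <stdbool.h>

/* Scan n pages without blocking, as extent_buffer_under_io() does: a
 * page whose lock can't be taken immediately is treated as busy. */
static bool any_page_busy(pthread_mutex_t *locks, const bool *busy,
			  int n, int already_locked)
{
	for (int i = 0; i < n; i++) {
		bool locked_here = false, b;

		if (i != already_locked) {
			if (pthread_mutex_trylock(&locks[i]) != 0)
				return true;	/* contended: call it busy */
			locked_here = true;
		}
		b = busy[i];
		if (locked_here)
			pthread_mutex_unlock(&locks[i]);
		if (b)
			return true;
	}
	return false;
}
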
@@ -3874,6 +4007,7 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 
        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
        num_pages = num_extent_pages(eb->start, eb->len);
+       WARN_ON(atomic_read(&eb->refs) == 0);
        for (i = 0; i < num_pages; i++)
                __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
        return was_dirty;
@@ -4440,45 +4574,48 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
        }
 }
 
-static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
-{
-       struct extent_buffer *eb =
-                       container_of(head, struct extent_buffer, rcu_head);
-
-       __free_extent_buffer(eb);
-}
-
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
+int try_release_extent_buffer(struct page *page, gfp_t mask)
 {
-       u64 start = page_offset(page);
-       struct extent_buffer *eb = (struct extent_buffer *)page->private;
-       int ret = 1;
+       struct extent_buffer *eb;
 
-       if (!PagePrivate(page) || !eb)
+       /*
+        * We need to make sure nobody is attaching this page to an eb right
+        * now.
+        */
+       spin_lock(&page->mapping->private_lock);
+       if (!PagePrivate(page)) {
+               spin_unlock(&page->mapping->private_lock);
                return 1;
+       }
 
-       spin_lock(&tree->buffer_lock);
-       if (atomic_read(&eb->refs) > 1 ||
-           test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-               ret = 0;
-               goto out;
+       eb = (struct extent_buffer *)page->private;
+       BUG_ON(!eb);
+
+       /*
+        * This is a little awful but should be ok: we need to make sure that
+        * the eb doesn't disappear out from under us while we're looking at
+        * this page.
+        */
+       spin_lock(&eb->refs_lock);
+       if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb, page)) {
+               spin_unlock(&eb->refs_lock);
+               spin_unlock(&page->mapping->private_lock);
+               return 0;
        }
+       spin_unlock(&page->mapping->private_lock);
+
+       if ((mask & GFP_NOFS) == GFP_NOFS)
+               mask = GFP_NOFS;
 
        /*
-        * set @eb->refs to 0 if it is already 1, and then release the @eb.
-        * Or go back.
+        * If tree ref isn't set then we know the ref on this eb is a real ref,
+        * so just return, this page will likely be freed soon anyway.
         */
-       if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
-               ret = 0;
-               goto out;
+       if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+               spin_unlock(&eb->refs_lock);
+               return 0;
        }
-       radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-       btrfs_release_extent_buffer_page(eb, 0);
-out:
-       spin_unlock(&tree->buffer_lock);
+       release_extent_buffer(eb, mask);
 
-       /* at this point we can safely release the extent buffer */
-       if (atomic_read(&eb->refs) == 0)
-               call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
-       return ret;
+       return 1;
 }
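
try_release_extent_buffer() now speaks the same two-reference language:
under mapping->private_lock it verifies that the cache's reference is the
last one and that no page is dirty or under writeback, then atomically
claims EXTENT_BUFFER_TREE_REF so that reference cannot be dropped twice.
The test-and-steal step, in terms of the earlier sketch (locking elided;
the kernel serializes this with eb->refs_lock):

/* Continuing the earlier sketch: reclaim may only free a buffer whose
 * sole remaining reference belongs to the cache, and it must claim
 * that reference atomically before dropping it. */
static bool eb_try_release(struct eb *eb)
{
	if (atomic_load(&eb->refs) != 1)
		return false;		/* still in use */

	/* Steal the cache ref; if another path beat us to it, back off. */
	if (!(atomic_fetch_and(&eb->flags, ~EB_TREE_REF) & EB_TREE_REF))
		return false;

	eb_put(eb);			/* drops the final reference */
	return true;
}
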
index 83e432da2e26cc367b2de45f756ef1bdaf6575b2..60628341f156661374738b73074be1f6a3bb2a91 100644 (file)
@@ -35,6 +35,8 @@
 #define EXTENT_BUFFER_DIRTY 2
 #define EXTENT_BUFFER_CORRUPT 3
 #define EXTENT_BUFFER_READAHEAD 4      /* this got triggered by readahead */
+#define EXTENT_BUFFER_TREE_REF 5
+#define EXTENT_BUFFER_STALE 6
 
 /* these are flags for extent_clear_unlock_delalloc */
 #define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -128,6 +130,7 @@ struct extent_buffer {
        unsigned long map_len;
        unsigned long bflags;
        struct extent_io_tree *tree;
+       spinlock_t refs_lock;
        atomic_t refs;
        atomic_t pages_reading;
        struct list_head leak_list;
@@ -184,7 +187,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 int try_release_extent_mapping(struct extent_map_tree *map,
                               struct extent_io_tree *tree, struct page *page,
                               gfp_t mask);
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
+int try_release_extent_buffer(struct page *page, gfp_t mask);
 int try_release_extent_state(struct extent_map_tree *map,
                             struct extent_io_tree *tree, struct page *page,
                             gfp_t mask);
@@ -261,6 +264,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
                                         u64 start, unsigned long len);
 void free_extent_buffer(struct extent_buffer *eb);
+void free_extent_buffer_stale(struct extent_buffer *eb);
 #define WAIT_NONE      0
 #define WAIT_COMPLETE  1
 #define WAIT_PAGE_LOCK 2