Btrfs: Use async helpers to deal with pages that have been improperly dirtied
author Chris Mason <chris.mason@oracle.com>
Thu, 17 Jul 2008 16:53:51 +0000 (12:53 -0400)
committer Chris Mason <chris.mason@oracle.com>
Thu, 25 Sep 2008 15:04:04 +0000 (11:04 -0400)
Higher layers sometimes call set_page_dirty without asking the filesystem
to help.  This causes many problems for the data=ordered and cow code.
This commit detects pages that haven't been properly set up for IO and
kicks off an async helper to deal with them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file.c
fs/btrfs/inode.c

index ceebc052ddcb8bbc980c8149a3972025791724c9..4ddc8a8f82cd3911eab3c128a5f86016a44d8686 100644 (file)
@@ -546,6 +546,12 @@ struct btrfs_fs_info {
        struct btrfs_workers endio_workers;
        struct btrfs_workers endio_write_workers;
        struct btrfs_workers submit_workers;
+       /*
+        * fixup workers take dirty pages that didn't properly go through
+        * the cow mechanism and make them safe to write.  Pages end up
+        * in that state on the sys_munmap function call path.
+        */
+       struct btrfs_workers fixup_workers;
        struct task_struct *transaction_kthread;
        struct task_struct *cleaner_kthread;
        int thread_pool_size;
index 4a5ebafb935ae01edeb2cbec6fabab3d3d0e3864..66466d125c0521c9dd8f5b2d22faed94166d8ff6 100644 (file)
@@ -1329,11 +1329,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         */
        btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
        btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
+       btrfs_init_workers(&fs_info->fixup_workers, 1);
        btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
        btrfs_init_workers(&fs_info->endio_write_workers,
                           fs_info->thread_pool_size);
        btrfs_start_workers(&fs_info->workers, 1);
        btrfs_start_workers(&fs_info->submit_workers, 1);
+       btrfs_start_workers(&fs_info->fixup_workers, 1);
        btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
        btrfs_start_workers(&fs_info->endio_write_workers,
                            fs_info->thread_pool_size);
@@ -1454,6 +1456,7 @@ fail_tree_root:
 fail_sys_array:
 fail_sb_buffer:
        extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
+       btrfs_stop_workers(&fs_info->fixup_workers);
        btrfs_stop_workers(&fs_info->workers);
        btrfs_stop_workers(&fs_info->endio_workers);
        btrfs_stop_workers(&fs_info->endio_write_workers);
@@ -1710,6 +1713,7 @@ int close_ctree(struct btrfs_root *root)
 
        truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 
+       btrfs_stop_workers(&fs_info->fixup_workers);
        btrfs_stop_workers(&fs_info->workers);
        btrfs_stop_workers(&fs_info->endio_workers);
        btrfs_stop_workers(&fs_info->endio_write_workers);
index 3f82a6e9ca4fa6ddf0dd85554ddfca9077959d59..feff16cb9b40acffbf9a21d1083b71e962f7802f 100644 (file)
@@ -2050,6 +2050,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        lock_extent(tree, start, page_end, GFP_NOFS);
        unlock_start = start;
 
+       if (tree->ops && tree->ops->writepage_start_hook) {
+               ret = tree->ops->writepage_start_hook(page, start, page_end);
+               if (ret == -EAGAIN) {
+                       unlock_extent(tree, start, page_end, GFP_NOFS);
+                       redirty_page_for_writepage(wbc, page);
+                       unlock_page(page);
+                       return 0;
+               }
+       }
+
        end = page_end;
        if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
                printk("found delalloc bits after lock_extent\n");
index 2268a7995896dc1c9ca2937c732fdbaf86c81b2c..23affd27af5e8efc03e3c8dd717a7bdad4719748 100644 (file)
@@ -30,6 +30,7 @@ typedef       int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
                                       struct bio *bio, int mirror_num);
 struct extent_io_ops {
        int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+       int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
        int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
        extent_submit_bio_hook_t *submit_bio_hook;
        int (*merge_bio_hook)(struct page *page, unsigned long offset,
index 12e765f7e0d483c0ccba587d7aaed3fbd3914a60..20928639d173ca5b33e893302e6ba5d0b0aedbe4 100644 (file)
@@ -313,6 +313,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
                for (i = 0; i < num_pages; i++) {
                        struct page *p = pages[i];
                        SetPageUptodate(p);
+                       ClearPageChecked(p);
                        set_page_dirty(p);
                }
        } else {
index c5a62f0b9595ffd5e5e6bd3848b1936ef6d78dcd..47a008c19308ad5cbc03886b95473d05ed48a2a0 100644 (file)
@@ -403,6 +403,87 @@ static int add_pending_csums(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+/*
+ * One queued request to fix up a page that was dirtied without going
+ * through the normal btrfs write path.  Allocated and queued by
+ * btrfs_writepage_start_hook, released by btrfs_writepage_fixup_worker.
+ */
+struct btrfs_writepage_fixup {
+       /* page to fix up; the queuer takes a page reference for the worker */
+       struct page *page;
+       /* embedded work item; container_of() on it recovers this struct */
+       struct btrfs_work work;
+};
+
+/*
+ * Async half of btrfs_writepage_start_hook; see that function for details
+ * on why this is required.
+ *
+ * If the page is still attached to a mapping, still dirty and still
+ * flagged Checked, and no ordered extent is found at the start of the
+ * page, the page's range is marked delalloc and the Checked flag is
+ * cleared.  The page reference taken when the work was queued and the
+ * fixup request itself are released on every path.
+ */
+void btrfs_writepage_fixup_worker(struct btrfs_work *work)
+{
+       struct btrfs_writepage_fixup *fixup;
+       struct btrfs_ordered_extent *ordered;
+       struct page *page;
+       struct inode *inode;
+       u64 page_start;
+       u64 page_end;
+
+       fixup = container_of(work, struct btrfs_writepage_fixup, work);
+       page = fixup->page;
+
+       lock_page(page);
+       /*
+        * nothing to do if the page lost its mapping, is no longer dirty
+        * or no longer carries the Checked flag
+        */
+       if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
+               ClearPageChecked(page);
+               goto out_page;
+       }
+
+       inode = page->mapping->host;
+       page_start = page_offset(page);
+       page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
+
+       lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+       ordered = btrfs_lookup_ordered_extent(inode, page_start);
+       if (ordered) {
+               /*
+                * btrfs_lookup_ordered_extent hands back a referenced
+                * extent (see ordered-data.c); drop that reference so
+                * the extent is not pinned forever
+                */
+               btrfs_put_ordered_extent(ordered);
+               goto out;
+       }
+
+       set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start, page_end,
+                           GFP_NOFS);
+       ClearPageChecked(page);
+out:
+       unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+out_page:
+       unlock_page(page);
+       page_cache_release(page);
+       /* the request was kzalloc'ed by the queuer and is ours to free */
+       kfree(fixup);
+}
+
+/*
+ * There are a few paths in the higher layers of the kernel that directly
+ * set the page dirty bit without asking the filesystem if it is a
+ * good idea.  This causes problems because we want to make sure COW
+ * properly happens and the data=ordered rules are followed.
+ *
+ * In our case any range that doesn't have the EXTENT_ORDERED bit set
+ * hasn't been properly set up for IO.  We kick off an async process
+ * to fix it up.  The async helper sets the delalloc bit on the page's
+ * range (unless it finds an ordered extent for the page) to make it
+ * safe to write the page.
+ *
+ * Returns 0 when the range already has EXTENT_ORDERED set and the write
+ * may proceed.  Returns -EAGAIN when the page has to be skipped for now:
+ * a fixup is already pending (the page is flagged Checked), a fixup was
+ * just queued, or the fixup request could not be allocated.
+ */
+int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+{
+       struct inode *inode = page->mapping->host;
+       struct btrfs_writepage_fixup *fixup;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+
+       if (test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+                          EXTENT_ORDERED, 0))
+               return 0;
+
+       /* a fixup for this page is already queued; don't queue another */
+       if (PageChecked(page))
+               return -EAGAIN;
+
+       fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+       if (!fixup)
+               return -EAGAIN;
+
+       SetPageChecked(page);
+       /* reference for the worker, dropped in btrfs_writepage_fixup_worker */
+       page_cache_get(page);
+       fixup->work.func = btrfs_writepage_fixup_worker;
+       fixup->page = page;
+       btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
+       return -EAGAIN;
+}
+
 int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
                                struct extent_state *state, int uptodate)
 {
@@ -1263,6 +1344,7 @@ again:
                flush_dcache_page(page);
                kunmap(page);
        }
+       ClearPageChecked(page);
        set_page_dirty(page);
        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
@@ -2658,6 +2740,7 @@ again:
                flush_dcache_page(page);
                kunmap(page);
        }
+       ClearPageChecked(page);
        set_page_dirty(page);
        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
@@ -3039,15 +3122,6 @@ out_fail:
 
 static int btrfs_set_page_dirty(struct page *page)
 {
-       struct inode *inode = page->mapping->host;
-       u64 page_start = page_offset(page);
-       u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-
-       if (!test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-                           EXTENT_DELALLOC, 0)) {
-printk("inode %lu page %Lu not delalloc\n", inode->i_ino, page_offset(page));
-WARN_ON(1);
-       }
        return __set_page_dirty_nobuffers(page);
 }
 
@@ -3098,6 +3172,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
        .readpage_io_hook = btrfs_readpage_io_hook,
        .readpage_end_io_hook = btrfs_readpage_end_io_hook,
        .writepage_end_io_hook = btrfs_writepage_end_io_hook,
+       .writepage_start_hook = btrfs_writepage_start_hook,
        .readpage_io_failed_hook = btrfs_io_failed_hook,
        .set_bit_hook = btrfs_set_bit_hook,
        .clear_bit_hook = btrfs_clear_bit_hook,