btrfs: scrub: cleanup the remaining nodatasum fixup code
authorQu Wenruo <wqu@suse.com>
Wed, 11 Jul 2018 05:41:22 +0000 (13:41 +0800)
committerDavid Sterba <dsterba@suse.com>
Mon, 6 Aug 2018 11:12:53 +0000 (13:12 +0200)
Remove the remaining code that misused the page cache pages during
device replace and could cause data corruption for compressed nodatasum
extents. Such files do not normally exist but there's a bug that allows
this combination and the corruption was exposed by device replace fixup
code.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/scrub.c

index 1235ad8dd9d7ae836dee80dda8a2ac6d69ac7ade..c4eb9eca13b8b7daa9bec084b5b6ec30845dd5c4 100644 (file)
@@ -188,15 +188,6 @@ struct scrub_ctx {
        refcount_t              refs;
 };
 
-struct scrub_fixup_nodatasum {
-       struct scrub_ctx        *sctx;
-       struct btrfs_device     *dev;
-       u64                     logical;
-       struct btrfs_root       *root;
-       struct btrfs_work       work;
-       int                     mirror_num;
-};
-
 struct scrub_warning {
        struct btrfs_path       *path;
        u64                     extent_item_size;
@@ -215,8 +206,6 @@ struct full_stripe_lock {
 
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
-static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
-static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
                                     struct scrub_block *sblocks_for_recheck);
@@ -531,60 +520,6 @@ out:
        return ret;
 }
 
-/*
- * used for workers that require transaction commits (i.e., for the
- * NOCOW case)
- */
-static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
-{
-       struct btrfs_fs_info *fs_info = sctx->fs_info;
-
-       refcount_inc(&sctx->refs);
-       /*
-        * increment scrubs_running to prevent cancel requests from
-        * completing as long as a worker is running. we must also
-        * increment scrubs_paused to prevent deadlocking on pause
-        * requests used for transactions commits (as the worker uses a
-        * transaction context). it is safe to regard the worker
-        * as paused for all matters practical. effectively, we only
-        * avoid cancellation requests from completing.
-        */
-       mutex_lock(&fs_info->scrub_lock);
-       atomic_inc(&fs_info->scrubs_running);
-       atomic_inc(&fs_info->scrubs_paused);
-       mutex_unlock(&fs_info->scrub_lock);
-
-       /*
-        * check if @scrubs_running=@scrubs_paused condition
-        * inside wait_event() is not an atomic operation.
-        * which means we may inc/dec @scrub_running/paused
-        * at any time. Let's wake up @scrub_pause_wait as
-        * much as we can to let commit transaction blocked less.
-        */
-       wake_up(&fs_info->scrub_pause_wait);
-
-       atomic_inc(&sctx->workers_pending);
-}
-
-/* used for workers that require transaction commits */
-static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
-{
-       struct btrfs_fs_info *fs_info = sctx->fs_info;
-
-       /*
-        * see scrub_pending_trans_workers_inc() why we're pretending
-        * to be paused in the scrub counters
-        */
-       mutex_lock(&fs_info->scrub_lock);
-       atomic_dec(&fs_info->scrubs_running);
-       atomic_dec(&fs_info->scrubs_paused);
-       mutex_unlock(&fs_info->scrub_lock);
-       atomic_dec(&sctx->workers_pending);
-       wake_up(&fs_info->scrub_pause_wait);
-       wake_up(&sctx->list_wait);
-       scrub_put_ctx(sctx);
-}
-
 static void scrub_free_csums(struct scrub_ctx *sctx)
 {
        while (!list_empty(&sctx->csum_list)) {
@@ -858,194 +793,6 @@ out:
        btrfs_free_path(path);
 }
 
-static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
-{
-       struct page *page = NULL;
-       unsigned long index;
-       struct scrub_fixup_nodatasum *fixup = fixup_ctx;
-       int ret;
-       int corrected = 0;
-       struct btrfs_key key;
-       struct inode *inode = NULL;
-       struct btrfs_fs_info *fs_info;
-       u64 end = offset + PAGE_SIZE - 1;
-       struct btrfs_root *local_root;
-       int srcu_index;
-
-       key.objectid = root;
-       key.type = BTRFS_ROOT_ITEM_KEY;
-       key.offset = (u64)-1;
-
-       fs_info = fixup->root->fs_info;
-       srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
-
-       local_root = btrfs_read_fs_root_no_name(fs_info, &key);
-       if (IS_ERR(local_root)) {
-               srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
-               return PTR_ERR(local_root);
-       }
-
-       key.type = BTRFS_INODE_ITEM_KEY;
-       key.objectid = inum;
-       key.offset = 0;
-       inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
-       srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
-       if (IS_ERR(inode))
-               return PTR_ERR(inode);
-
-       index = offset >> PAGE_SHIFT;
-
-       page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
-       if (!page) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       if (PageUptodate(page)) {
-               if (PageDirty(page)) {
-                       /*
-                        * we need to write the data to the defect sector. the
-                        * data that was in that sector is not in memory,
-                        * because the page was modified. we must not write the
-                        * modified page to that sector.
-                        *
-                        * TODO: what could be done here: wait for the delalloc
-                        *       runner to write out that page (might involve
-                        *       COW) and see whether the sector is still
-                        *       referenced afterwards.
-                        *
-                        * For the meantime, we'll treat this error
-                        * incorrectable, although there is a chance that a
-                        * later scrub will find the bad sector again and that
-                        * there's no dirty page in memory, then.
-                        */
-                       ret = -EIO;
-                       goto out;
-               }
-               ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
-                                       fixup->logical, page,
-                                       offset - page_offset(page),
-                                       fixup->mirror_num);
-               unlock_page(page);
-               corrected = !ret;
-       } else {
-               /*
-                * we need to get good data first. the general readpage path
-                * will call repair_io_failure for us, we just have to make
-                * sure we read the bad mirror.
-                */
-               ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
-                                       EXTENT_DAMAGED);
-               if (ret) {
-                       /* set_extent_bits should give proper error */
-                       WARN_ON(ret > 0);
-                       if (ret > 0)
-                               ret = -EFAULT;
-                       goto out;
-               }
-
-               ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
-                                               btrfs_get_extent,
-                                               fixup->mirror_num);
-               wait_on_page_locked(page);
-
-               corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
-                                               end, EXTENT_DAMAGED, 0, NULL);
-               if (!corrected)
-                       clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
-                                               EXTENT_DAMAGED);
-       }
-
-out:
-       if (page)
-               put_page(page);
-
-       iput(inode);
-
-       if (ret < 0)
-               return ret;
-
-       if (ret == 0 && corrected) {
-               /*
-                * we only need to call readpage for one of the inodes belonging
-                * to this extent. so make iterate_extent_inodes stop
-                */
-               return 1;
-       }
-
-       return -EIO;
-}
-
-static void scrub_fixup_nodatasum(struct btrfs_work *work)
-{
-       struct btrfs_fs_info *fs_info;
-       int ret;
-       struct scrub_fixup_nodatasum *fixup;
-       struct scrub_ctx *sctx;
-       struct btrfs_trans_handle *trans = NULL;
-       struct btrfs_path *path;
-       int uncorrectable = 0;
-
-       fixup = container_of(work, struct scrub_fixup_nodatasum, work);
-       sctx = fixup->sctx;
-       fs_info = fixup->root->fs_info;
-
-       path = btrfs_alloc_path();
-       if (!path) {
-               spin_lock(&sctx->stat_lock);
-               ++sctx->stat.malloc_errors;
-               spin_unlock(&sctx->stat_lock);
-               uncorrectable = 1;
-               goto out;
-       }
-
-       trans = btrfs_join_transaction(fixup->root);
-       if (IS_ERR(trans)) {
-               uncorrectable = 1;
-               goto out;
-       }
-
-       /*
-        * the idea is to trigger a regular read through the standard path. we
-        * read a page from the (failed) logical address by specifying the
-        * corresponding copynum of the failed sector. thus, that readpage is
-        * expected to fail.
-        * that is the point where on-the-fly error correction will kick in
-        * (once it's finished) and rewrite the failed sector if a good copy
-        * can be found.
-        */
-       ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
-                                         scrub_fixup_readpage, fixup, false);
-       if (ret < 0) {
-               uncorrectable = 1;
-               goto out;
-       }
-       WARN_ON(ret != 1);
-
-       spin_lock(&sctx->stat_lock);
-       ++sctx->stat.corrected_errors;
-       spin_unlock(&sctx->stat_lock);
-
-out:
-       if (trans && !IS_ERR(trans))
-               btrfs_end_transaction(trans);
-       if (uncorrectable) {
-               spin_lock(&sctx->stat_lock);
-               ++sctx->stat.uncorrectable_errors;
-               spin_unlock(&sctx->stat_lock);
-               btrfs_dev_replace_stats_inc(
-                       &fs_info->dev_replace.num_uncorrectable_read_errors);
-               btrfs_err_rl_in_rcu(fs_info,
-                   "unable to fixup (nodatasum) error at logical %llu on dev %s",
-                       fixup->logical, rcu_str_deref(fixup->dev->name));
-       }
-
-       btrfs_free_path(path);
-       kfree(fixup);
-
-       scrub_pending_trans_workers_dec(sctx);
-}
-
 static inline void scrub_get_recover(struct scrub_recover *recover)
 {
        refcount_inc(&recover->refs);
@@ -1239,42 +986,6 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                goto out;
        }
 
-       /*
-        * NOTE: Even for nodatasum case, it's still possible that it's a
-        * compressed data extent, thus scrub_fixup_nodatasum(), which write
-        * inode page cache onto disk, could cause serious data corruption.
-        *
-        * So here we could only read from disk, and hope our recovery could
-        * reach disk before the newer write.
-        */
-       if (0 && !is_metadata && !have_csum) {
-               struct scrub_fixup_nodatasum *fixup_nodatasum;
-
-               WARN_ON(sctx->is_dev_replace);
-
-               /*
-                * !is_metadata and !have_csum, this means that the data
-                * might not be COWed, that it might be modified
-                * concurrently. The general strategy to work on the
-                * commit root does not help in the case when COW is not
-                * used.
-                */
-               fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
-               if (!fixup_nodatasum)
-                       goto did_not_correct_error;
-               fixup_nodatasum->sctx = sctx;
-               fixup_nodatasum->dev = dev;
-               fixup_nodatasum->logical = logical;
-               fixup_nodatasum->root = fs_info->extent_root;
-               fixup_nodatasum->mirror_num = failed_mirror_index + 1;
-               scrub_pending_trans_workers_inc(sctx);
-               btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
-                               scrub_fixup_nodatasum, NULL, NULL);
-               btrfs_queue_work(fs_info->scrub_workers,
-                                &fixup_nodatasum->work);
-               goto out;
-       }
-
        /*
         * now build and submit the bios for the other mirrors, check
         * checksums.