btrfs: introduce an evict flushing state
author: Josef Bacik <josef@toxicpanda.com>
Thu, 1 Aug 2019 22:19:37 +0000 (18:19 -0400)
committer: David Sterba <dsterba@suse.com>
Mon, 9 Sep 2019 12:59:11 +0000 (14:59 +0200)
We have this weird space flushing loop inside inode.c for evict where
we'll do the normal LIMIT flush, and then commit the transaction and
hope we get our space.  This is super janky, and in fact there's really
nothing stopping us from using FLUSH_ALL except that we run delayed
iputs, which means we could deadlock.  So introduce a new flush state
for eviction that does the normal priority flushing with all of the
states that are safe for eviction.

The nice side-effect of this is that we'll try harder for evictions.
Previously (for example in generic/269), if you had a bunch of other
operations happening on the fs, you could race with those reservations
when committing the transaction, and eventually miss getting a
reservation for the evict.  With this code we'll have our ticket in
place through the transaction commit, so any pinned bytes will go to our
pending evictions first.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/ctree.h
fs/btrfs/inode.c
fs/btrfs/space-info.c

index 4ad4715a794108db298bf277747358e56e408af2..b161224b5a0be1aed90a130ad3a2b680b296868f 100644 (file)
@@ -2536,6 +2536,7 @@ enum btrfs_reserve_flush_enum {
         * case, use FLUSH LIMIT
         */
        BTRFS_RESERVE_FLUSH_LIMIT,
+       BTRFS_RESERVE_FLUSH_EVICT,
        BTRFS_RESERVE_FLUSH_ALL,
 };
 
index 612c25aac15cd2da7a83e99d9ad6453466f440a5..c4116bc588274878e810b53ba64af99e4d408f43 100644 (file)
@@ -5336,59 +5336,50 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+       struct btrfs_trans_handle *trans;
        u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
-       int failures = 0;
-
-       for (;;) {
-               struct btrfs_trans_handle *trans;
-               int ret;
-
-               ret = btrfs_block_rsv_refill(root, rsv,
-                                            rsv->size + delayed_refs_extra,
-                                            BTRFS_RESERVE_FLUSH_LIMIT);
-
-               if (ret && ++failures > 2) {
-                       btrfs_warn(fs_info,
-                                  "could not allocate space for a delete; will truncate on mount");
-                       return ERR_PTR(-ENOSPC);
-               }
-
-               /*
-                * Evict can generate a large amount of delayed refs without
-                * having a way to add space back since we exhaust our temporary
-                * block rsv.  We aren't allowed to do FLUSH_ALL in this case
-                * because we could deadlock with so many things in the flushing
-                * code, so we have to try and hold some extra space to
-                * compensate for our delayed ref generation.  If we can't get
-                * that space then we need see if we can steal our minimum from
-                * the global reserve.  We will be ratelimited by the amount of
-                * space we have for the delayed refs rsv, so we'll end up
-                * committing and trying again.
-                */
-               trans = btrfs_join_transaction(root);
-               if (IS_ERR(trans) || !ret) {
-                       if (!IS_ERR(trans)) {
-                               trans->block_rsv = &fs_info->trans_block_rsv;
-                               trans->bytes_reserved = delayed_refs_extra;
-                               btrfs_block_rsv_migrate(rsv, trans->block_rsv,
-                                                       delayed_refs_extra, 1);
-                       }
-                       return trans;
-               }
+       int ret;
 
+       /*
+        * Eviction should be taking place at some place safe because of our
+        * delayed iputs.  However the normal flushing code will run delayed
+        * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
+        *
+        * We reserve the delayed_refs_extra here again because we can't use
+        * btrfs_start_transaction(root, 0) for the same deadlocky reason as
+        * above.  We reserve our extra bit here because we generate a ton of
+        * delayed refs activity by truncating.
+        *
+        * If we cannot make our reservation we'll attempt to steal from the
+        * global reserve, because we really want to be able to free up space.
+        */
+       ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
+                                    BTRFS_RESERVE_FLUSH_EVICT);
+       if (ret) {
                /*
                 * Try to steal from the global reserve if there is space for
                 * it.
                 */
-               if (!btrfs_check_space_for_delayed_refs(fs_info) &&
-                   !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0))
-                       return trans;
+               if (btrfs_check_space_for_delayed_refs(fs_info) ||
+                   btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) {
+                       btrfs_warn(fs_info,
+                                  "could not allocate space for delete; will truncate on mount");
+                       return ERR_PTR(-ENOSPC);
+               }
+               delayed_refs_extra = 0;
+       }
 
-               /* If not, commit and try again. */
-               ret = btrfs_commit_transaction(trans);
-               if (ret)
-                       return ERR_PTR(ret);
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans))
+               return trans;
+
+       if (delayed_refs_extra) {
+               trans->block_rsv = &fs_info->trans_block_rsv;
+               trans->bytes_reserved = delayed_refs_extra;
+               btrfs_block_rsv_migrate(rsv, trans->block_rsv,
+                                       delayed_refs_extra, 1);
        }
+       return trans;
 }
 
 void btrfs_evict_inode(struct inode *inode)
index 37ec31199675c6e4b4adf0475d77896b8e17cdff..5f8f65599de15fa4facdb48324d79ae345698cd2 100644 (file)
@@ -848,6 +848,17 @@ static const enum btrfs_flush_state priority_flush_states[] = {
        ALLOC_CHUNK,
 };
 
+static const enum btrfs_flush_state evict_flush_states[] = {
+       FLUSH_DELAYED_ITEMS_NR,
+       FLUSH_DELAYED_ITEMS,
+       FLUSH_DELAYED_REFS_NR,
+       FLUSH_DELAYED_REFS,
+       FLUSH_DELALLOC,
+       FLUSH_DELALLOC_WAIT,
+       ALLOC_CHUNK,
+       COMMIT_TRANS,
+};
+
 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
                                struct btrfs_space_info *space_info,
                                struct reserve_ticket *ticket,
@@ -922,12 +933,24 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
        u64 reclaim_bytes = 0;
        int ret;
 
-       if (flush == BTRFS_RESERVE_FLUSH_ALL)
+       switch (flush) {
+       case BTRFS_RESERVE_FLUSH_ALL:
                wait_reserve_ticket(fs_info, space_info, ticket);
-       else
+               break;
+       case BTRFS_RESERVE_FLUSH_LIMIT:
                priority_reclaim_metadata_space(fs_info, space_info, ticket,
                                                priority_flush_states,
                                                ARRAY_SIZE(priority_flush_states));
+               break;
+       case BTRFS_RESERVE_FLUSH_EVICT:
+               priority_reclaim_metadata_space(fs_info, space_info, ticket,
+                                               evict_flush_states,
+                                               ARRAY_SIZE(evict_flush_states));
+               break;
+       default:
+               ASSERT(0);
+               break;
+       }
 
        spin_lock(&space_info->lock);
        ret = ticket->error;