btrfs: move reserve_metadata_bytes and supporting code to space-info.c
author: Josef Bacik <josef@toxicpanda.com>
Tue, 18 Jun 2019 20:09:25 +0000 (16:09 -0400)
committer: David Sterba <dsterba@suse.com>
Tue, 2 Jul 2019 10:30:53 +0000 (12:30 +0200)
This moves all of the metadata reservation code into space-info.c.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/extent-tree.c
fs/btrfs/space-info.c
fs/btrfs/space-info.h

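diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e0f1ec0ca4a427dcccb5288c148274cb95a1b472..c887f335234174a0249fcda8a5e3ed5ce6636485 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c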
@@ -4346,701 +4346,6 @@ out:
        return ret;
 }
 
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
-                                        unsigned long nr_pages, int nr_items)
-{
-       struct super_block *sb = fs_info->sb;
-
-       if (down_read_trylock(&sb->s_umount)) {
-               writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
-               up_read(&sb->s_umount);
-       } else {
-               /*
-                * We needn't worry the filesystem going from r/w to r/o though
-                * we don't acquire ->s_umount mutex, because the filesystem
-                * should guarantee the delalloc inodes list be empty after
-                * the filesystem is readonly(all dirty pages are written to
-                * the disk).
-                */
-               btrfs_start_delalloc_roots(fs_info, nr_items);
-               if (!current->journal_info)
-                       btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
-       }
-}
-
-static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
-                                       u64 to_reclaim)
-{
-       u64 bytes;
-       u64 nr;
-
-       bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-       nr = div64_u64(to_reclaim, bytes);
-       if (!nr)
-               nr = 1;
-       return nr;
-}
-
-#define EXTENT_SIZE_PER_ITEM   SZ_256K
-
-/*
- * shrink metadata reservation for delalloc
- */
-static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
-                           u64 orig, bool wait_ordered)
-{
-       struct btrfs_space_info *space_info;
-       struct btrfs_trans_handle *trans;
-       u64 delalloc_bytes;
-       u64 dio_bytes;
-       u64 async_pages;
-       u64 items;
-       long time_left;
-       unsigned long nr_pages;
-       int loops;
-
-       /* Calc the number of the pages we need flush for space reservation */
-       items = calc_reclaim_items_nr(fs_info, to_reclaim);
-       to_reclaim = items * EXTENT_SIZE_PER_ITEM;
-
-       trans = (struct btrfs_trans_handle *)current->journal_info;
-       space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
-
-       delalloc_bytes = percpu_counter_sum_positive(
-                                               &fs_info->delalloc_bytes);
-       dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
-       if (delalloc_bytes == 0 && dio_bytes == 0) {
-               if (trans)
-                       return;
-               if (wait_ordered)
-                       btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
-               return;
-       }
-
-       /*
-        * If we are doing more ordered than delalloc we need to just wait on
-        * ordered extents, otherwise we'll waste time trying to flush delalloc
-        * that likely won't give us the space back we need.
-        */
-       if (dio_bytes > delalloc_bytes)
-               wait_ordered = true;
-
-       loops = 0;
-       while ((delalloc_bytes || dio_bytes) && loops < 3) {
-               nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
-
-               /*
-                * Triggers inode writeback for up to nr_pages. This will invoke
-                * ->writepages callback and trigger delalloc filling
-                *  (btrfs_run_delalloc_range()).
-                */
-               btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
-
-               /*
-                * We need to wait for the compressed pages to start before
-                * we continue.
-                */
-               async_pages = atomic_read(&fs_info->async_delalloc_pages);
-               if (!async_pages)
-                       goto skip_async;
-
-               /*
-                * Calculate how many compressed pages we want to be written
-                * before we continue. I.e if there are more async pages than we
-                * require wait_event will wait until nr_pages are written.
-                */
-               if (async_pages <= nr_pages)
-                       async_pages = 0;
-               else
-                       async_pages -= nr_pages;
-
-               wait_event(fs_info->async_submit_wait,
-                          atomic_read(&fs_info->async_delalloc_pages) <=
-                          (int)async_pages);
-skip_async:
-               spin_lock(&space_info->lock);
-               if (list_empty(&space_info->tickets) &&
-                   list_empty(&space_info->priority_tickets)) {
-                       spin_unlock(&space_info->lock);
-                       break;
-               }
-               spin_unlock(&space_info->lock);
-
-               loops++;
-               if (wait_ordered && !trans) {
-                       btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
-               } else {
-                       time_left = schedule_timeout_killable(1);
-                       if (time_left)
-                               break;
-               }
-               delalloc_bytes = percpu_counter_sum_positive(
-                                               &fs_info->delalloc_bytes);
-               dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
-       }
-}
-
-/**
- * maybe_commit_transaction - possibly commit the transaction if its ok to
- * @root - the root we're allocating for
- * @bytes - the number of bytes we want to reserve
- * @force - force the commit
- *
- * This will check to make sure that committing the transaction will actually
- * get us somewhere and then commit the transaction if it does.  Otherwise it
- * will return -ENOSPC.
- */
-static int may_commit_transaction(struct btrfs_fs_info *fs_info,
-                                 struct btrfs_space_info *space_info)
-{
-       struct reserve_ticket *ticket = NULL;
-       struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
-       struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
-       struct btrfs_trans_handle *trans;
-       u64 bytes_needed;
-       u64 reclaim_bytes = 0;
-
-       trans = (struct btrfs_trans_handle *)current->journal_info;
-       if (trans)
-               return -EAGAIN;
-
-       spin_lock(&space_info->lock);
-       if (!list_empty(&space_info->priority_tickets))
-               ticket = list_first_entry(&space_info->priority_tickets,
-                                         struct reserve_ticket, list);
-       else if (!list_empty(&space_info->tickets))
-               ticket = list_first_entry(&space_info->tickets,
-                                         struct reserve_ticket, list);
-       bytes_needed = (ticket) ? ticket->bytes : 0;
-       spin_unlock(&space_info->lock);
-
-       if (!bytes_needed)
-               return 0;
-
-       trans = btrfs_join_transaction(fs_info->extent_root);
-       if (IS_ERR(trans))
-               return PTR_ERR(trans);
-
-       /*
-        * See if there is enough pinned space to make this reservation, or if
-        * we have block groups that are going to be freed, allowing us to
-        * possibly do a chunk allocation the next loop through.
-        */
-       if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
-           __percpu_counter_compare(&space_info->total_bytes_pinned,
-                                    bytes_needed,
-                                    BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
-               goto commit;
-
-       /*
-        * See if there is some space in the delayed insertion reservation for
-        * this reservation.
-        */
-       if (space_info != delayed_rsv->space_info)
-               goto enospc;
-
-       spin_lock(&delayed_rsv->lock);
-       reclaim_bytes += delayed_rsv->reserved;
-       spin_unlock(&delayed_rsv->lock);
-
-       spin_lock(&delayed_refs_rsv->lock);
-       reclaim_bytes += delayed_refs_rsv->reserved;
-       spin_unlock(&delayed_refs_rsv->lock);
-       if (reclaim_bytes >= bytes_needed)
-               goto commit;
-       bytes_needed -= reclaim_bytes;
-
-       if (__percpu_counter_compare(&space_info->total_bytes_pinned,
-                                  bytes_needed,
-                                  BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
-               goto enospc;
-
-commit:
-       return btrfs_commit_transaction(trans);
-enospc:
-       btrfs_end_transaction(trans);
-       return -ENOSPC;
-}
-
-/*
- * Try to flush some data based on policy set by @state. This is only advisory
- * and may fail for various reasons. The caller is supposed to examine the
- * state of @space_info to detect the outcome.
- */
-static void flush_space(struct btrfs_fs_info *fs_info,
-                      struct btrfs_space_info *space_info, u64 num_bytes,
-                      int state)
-{
-       struct btrfs_root *root = fs_info->extent_root;
-       struct btrfs_trans_handle *trans;
-       int nr;
-       int ret = 0;
-
-       switch (state) {
-       case FLUSH_DELAYED_ITEMS_NR:
-       case FLUSH_DELAYED_ITEMS:
-               if (state == FLUSH_DELAYED_ITEMS_NR)
-                       nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
-               else
-                       nr = -1;
-
-               trans = btrfs_join_transaction(root);
-               if (IS_ERR(trans)) {
-                       ret = PTR_ERR(trans);
-                       break;
-               }
-               ret = btrfs_run_delayed_items_nr(trans, nr);
-               btrfs_end_transaction(trans);
-               break;
-       case FLUSH_DELALLOC:
-       case FLUSH_DELALLOC_WAIT:
-               shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
-                               state == FLUSH_DELALLOC_WAIT);
-               break;
-       case FLUSH_DELAYED_REFS_NR:
-       case FLUSH_DELAYED_REFS:
-               trans = btrfs_join_transaction(root);
-               if (IS_ERR(trans)) {
-                       ret = PTR_ERR(trans);
-                       break;
-               }
-               if (state == FLUSH_DELAYED_REFS_NR)
-                       nr = calc_reclaim_items_nr(fs_info, num_bytes);
-               else
-                       nr = 0;
-               btrfs_run_delayed_refs(trans, nr);
-               btrfs_end_transaction(trans);
-               break;
-       case ALLOC_CHUNK:
-       case ALLOC_CHUNK_FORCE:
-               trans = btrfs_join_transaction(root);
-               if (IS_ERR(trans)) {
-                       ret = PTR_ERR(trans);
-                       break;
-               }
-               ret = btrfs_chunk_alloc(trans,
-                               btrfs_metadata_alloc_profile(fs_info),
-                               (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
-                                       CHUNK_ALLOC_FORCE);
-               btrfs_end_transaction(trans);
-               if (ret > 0 || ret == -ENOSPC)
-                       ret = 0;
-               break;
-       case COMMIT_TRANS:
-               /*
-                * If we have pending delayed iputs then we could free up a
-                * bunch of pinned space, so make sure we run the iputs before
-                * we do our pinned bytes check below.
-                */
-               btrfs_run_delayed_iputs(fs_info);
-               btrfs_wait_on_delayed_iputs(fs_info);
-
-               ret = may_commit_transaction(fs_info, space_info);
-               break;
-       default:
-               ret = -ENOSPC;
-               break;
-       }
-
-       trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
-                               ret);
-       return;
-}
-
-static inline u64
-btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
-                                struct btrfs_space_info *space_info,
-                                bool system_chunk)
-{
-       struct reserve_ticket *ticket;
-       u64 used;
-       u64 expected;
-       u64 to_reclaim = 0;
-
-       list_for_each_entry(ticket, &space_info->tickets, list)
-               to_reclaim += ticket->bytes;
-       list_for_each_entry(ticket, &space_info->priority_tickets, list)
-               to_reclaim += ticket->bytes;
-       if (to_reclaim)
-               return to_reclaim;
-
-       to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
-       if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
-                                BTRFS_RESERVE_FLUSH_ALL, system_chunk))
-               return 0;
-
-       used = btrfs_space_info_used(space_info, true);
-
-       if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
-                                BTRFS_RESERVE_FLUSH_ALL, system_chunk))
-               expected = div_factor_fine(space_info->total_bytes, 95);
-       else
-               expected = div_factor_fine(space_info->total_bytes, 90);
-
-       if (used > expected)
-               to_reclaim = used - expected;
-       else
-               to_reclaim = 0;
-       to_reclaim = min(to_reclaim, space_info->bytes_may_use +
-                                    space_info->bytes_reserved);
-       return to_reclaim;
-}
-
-static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
-                                       struct btrfs_space_info *space_info,
-                                       u64 used, bool system_chunk)
-{
-       u64 thresh = div_factor_fine(space_info->total_bytes, 98);
-
-       /* If we're just plain full then async reclaim just slows us down. */
-       if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
-               return 0;
-
-       if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
-                                             system_chunk))
-               return 0;
-
-       return (used >= thresh && !btrfs_fs_closing(fs_info) &&
-               !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
-}
-
-static bool wake_all_tickets(struct list_head *head)
-{
-       struct reserve_ticket *ticket;
-
-       while (!list_empty(head)) {
-               ticket = list_first_entry(head, struct reserve_ticket, list);
-               list_del_init(&ticket->list);
-               ticket->error = -ENOSPC;
-               wake_up(&ticket->wait);
-               if (ticket->bytes != ticket->orig_bytes)
-                       return true;
-       }
-       return false;
-}
-
-/*
- * This is for normal flushers, we can wait all goddamned day if we want to.  We
- * will loop and continuously try to flush as long as we are making progress.
- * We count progress as clearing off tickets each time we have to loop.
- */
-static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
-{
-       struct btrfs_fs_info *fs_info;
-       struct btrfs_space_info *space_info;
-       u64 to_reclaim;
-       int flush_state;
-       int commit_cycles = 0;
-       u64 last_tickets_id;
-
-       fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
-       space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
-
-       spin_lock(&space_info->lock);
-       to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
-                                                     false);
-       if (!to_reclaim) {
-               space_info->flush = 0;
-               spin_unlock(&space_info->lock);
-               return;
-       }
-       last_tickets_id = space_info->tickets_id;
-       spin_unlock(&space_info->lock);
-
-       flush_state = FLUSH_DELAYED_ITEMS_NR;
-       do {
-               flush_space(fs_info, space_info, to_reclaim, flush_state);
-               spin_lock(&space_info->lock);
-               if (list_empty(&space_info->tickets)) {
-                       space_info->flush = 0;
-                       spin_unlock(&space_info->lock);
-                       return;
-               }
-               to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
-                                                             space_info,
-                                                             false);
-               if (last_tickets_id == space_info->tickets_id) {
-                       flush_state++;
-               } else {
-                       last_tickets_id = space_info->tickets_id;
-                       flush_state = FLUSH_DELAYED_ITEMS_NR;
-                       if (commit_cycles)
-                               commit_cycles--;
-               }
-
-               /*
-                * We don't want to force a chunk allocation until we've tried
-                * pretty hard to reclaim space.  Think of the case where we
-                * freed up a bunch of space and so have a lot of pinned space
-                * to reclaim.  We would rather use that than possibly create a
-                * underutilized metadata chunk.  So if this is our first run
-                * through the flushing state machine skip ALLOC_CHUNK_FORCE and
-                * commit the transaction.  If nothing has changed the next go
-                * around then we can force a chunk allocation.
-                */
-               if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
-                       flush_state++;
-
-               if (flush_state > COMMIT_TRANS) {
-                       commit_cycles++;
-                       if (commit_cycles > 2) {
-                               if (wake_all_tickets(&space_info->tickets)) {
-                                       flush_state = FLUSH_DELAYED_ITEMS_NR;
-                                       commit_cycles--;
-                               } else {
-                                       space_info->flush = 0;
-                               }
-                       } else {
-                               flush_state = FLUSH_DELAYED_ITEMS_NR;
-                       }
-               }
-               spin_unlock(&space_info->lock);
-       } while (flush_state <= COMMIT_TRANS);
-}
-
-void btrfs_init_async_reclaim_work(struct work_struct *work)
-{
-       INIT_WORK(work, btrfs_async_reclaim_metadata_space);
-}
-
-static const enum btrfs_flush_state priority_flush_states[] = {
-       FLUSH_DELAYED_ITEMS_NR,
-       FLUSH_DELAYED_ITEMS,
-       ALLOC_CHUNK,
-};
-
-static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
-                                           struct btrfs_space_info *space_info,
-                                           struct reserve_ticket *ticket)
-{
-       u64 to_reclaim;
-       int flush_state;
-
-       spin_lock(&space_info->lock);
-       to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
-                                                     false);
-       if (!to_reclaim) {
-               spin_unlock(&space_info->lock);
-               return;
-       }
-       spin_unlock(&space_info->lock);
-
-       flush_state = 0;
-       do {
-               flush_space(fs_info, space_info, to_reclaim,
-                           priority_flush_states[flush_state]);
-               flush_state++;
-               spin_lock(&space_info->lock);
-               if (ticket->bytes == 0) {
-                       spin_unlock(&space_info->lock);
-                       return;
-               }
-               spin_unlock(&space_info->lock);
-       } while (flush_state < ARRAY_SIZE(priority_flush_states));
-}
-
-static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
-                              struct btrfs_space_info *space_info,
-                              struct reserve_ticket *ticket)
-
-{
-       DEFINE_WAIT(wait);
-       u64 reclaim_bytes = 0;
-       int ret = 0;
-
-       spin_lock(&space_info->lock);
-       while (ticket->bytes > 0 && ticket->error == 0) {
-               ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
-               if (ret) {
-                       ret = -EINTR;
-                       break;
-               }
-               spin_unlock(&space_info->lock);
-
-               schedule();
-
-               finish_wait(&ticket->wait, &wait);
-               spin_lock(&space_info->lock);
-       }
-       if (!ret)
-               ret = ticket->error;
-       if (!list_empty(&ticket->list))
-               list_del_init(&ticket->list);
-       if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
-               reclaim_bytes = ticket->orig_bytes - ticket->bytes;
-       spin_unlock(&space_info->lock);
-
-       if (reclaim_bytes)
-               btrfs_space_info_add_old_bytes(fs_info, space_info,
-                                              reclaim_bytes);
-       return ret;
-}
-
-/**
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
- * @root - the root we're allocating for
- * @space_info - the space info we want to allocate from
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
- *
- * This will reserve orig_bytes number of bytes from the space info associated
- * with the block_rsv.  If there is not enough space it will make an attempt to
- * flush out space to make room.  It will do this by flushing delalloc if
- * possible or committing the transaction.  If flush is 0 then no attempts to
- * regain reservations will be made and this will fail if there is not enough
- * space already.
- */
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
-                                   struct btrfs_space_info *space_info,
-                                   u64 orig_bytes,
-                                   enum btrfs_reserve_flush_enum flush,
-                                   bool system_chunk)
-{
-       struct reserve_ticket ticket;
-       u64 used;
-       u64 reclaim_bytes = 0;
-       int ret = 0;
-
-       ASSERT(orig_bytes);
-       ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
-
-       spin_lock(&space_info->lock);
-       ret = -ENOSPC;
-       used = btrfs_space_info_used(space_info, true);
-
-       /*
-        * If we have enough space then hooray, make our reservation and carry
-        * on.  If not see if we can overcommit, and if we can, hooray carry on.
-        * If not things get more complicated.
-        */
-       if (used + orig_bytes <= space_info->total_bytes) {
-               btrfs_space_info_update_bytes_may_use(fs_info, space_info,
-                                                     orig_bytes);
-               trace_btrfs_space_reservation(fs_info, "space_info",
-                                             space_info->flags, orig_bytes, 1);
-               ret = 0;
-       } else if (btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush,
-                                       system_chunk)) {
-               btrfs_space_info_update_bytes_may_use(fs_info, space_info,
-                                                     orig_bytes);
-               trace_btrfs_space_reservation(fs_info, "space_info",
-                                             space_info->flags, orig_bytes, 1);
-               ret = 0;
-       }
-
-       /*
-        * If we couldn't make a reservation then setup our reservation ticket
-        * and kick the async worker if it's not already running.
-        *
-        * If we are a priority flusher then we just need to add our ticket to
-        * the list and we will do our own flushing further down.
-        */
-       if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
-               ticket.orig_bytes = orig_bytes;
-               ticket.bytes = orig_bytes;
-               ticket.error = 0;
-               init_waitqueue_head(&ticket.wait);
-               if (flush == BTRFS_RESERVE_FLUSH_ALL) {
-                       list_add_tail(&ticket.list, &space_info->tickets);
-                       if (!space_info->flush) {
-                               space_info->flush = 1;
-                               trace_btrfs_trigger_flush(fs_info,
-                                                         space_info->flags,
-                                                         orig_bytes, flush,
-                                                         "enospc");
-                               queue_work(system_unbound_wq,
-                                          &fs_info->async_reclaim_work);
-                       }
-               } else {
-                       list_add_tail(&ticket.list,
-                                     &space_info->priority_tickets);
-               }
-       } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
-               used += orig_bytes;
-               /*
-                * We will do the space reservation dance during log replay,
-                * which means we won't have fs_info->fs_root set, so don't do
-                * the async reclaim as we will panic.
-                */
-               if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
-                   need_do_async_reclaim(fs_info, space_info,
-                                         used, system_chunk) &&
-                   !work_busy(&fs_info->async_reclaim_work)) {
-                       trace_btrfs_trigger_flush(fs_info, space_info->flags,
-                                                 orig_bytes, flush, "preempt");
-                       queue_work(system_unbound_wq,
-                                  &fs_info->async_reclaim_work);
-               }
-       }
-       spin_unlock(&space_info->lock);
-       if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
-               return ret;
-
-       if (flush == BTRFS_RESERVE_FLUSH_ALL)
-               return wait_reserve_ticket(fs_info, space_info, &ticket);
-
-       ret = 0;
-       priority_reclaim_metadata_space(fs_info, space_info, &ticket);
-       spin_lock(&space_info->lock);
-       if (ticket.bytes) {
-               if (ticket.bytes < orig_bytes)
-                       reclaim_bytes = orig_bytes - ticket.bytes;
-               list_del_init(&ticket.list);
-               ret = -ENOSPC;
-       }
-       spin_unlock(&space_info->lock);
-
-       if (reclaim_bytes)
-               btrfs_space_info_add_old_bytes(fs_info, space_info,
-                                              reclaim_bytes);
-       ASSERT(list_empty(&ticket.list));
-       return ret;
-}
-
-/**
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
- * @root - the root we're allocating for
- * @block_rsv - the block_rsv we're allocating for
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
- *
- * This will reserve orig_bytes number of bytes from the space info associated
- * with the block_rsv.  If there is not enough space it will make an attempt to
- * flush out space to make room.  It will do this by flushing delalloc if
- * possible or committing the transaction.  If flush is 0 then no attempts to
- * regain reservations will be made and this will fail if there is not enough
- * space already.
- */
-static int reserve_metadata_bytes(struct btrfs_root *root,
-                                 struct btrfs_block_rsv *block_rsv,
-                                 u64 orig_bytes,
-                                 enum btrfs_reserve_flush_enum flush)
-{
-       struct btrfs_fs_info *fs_info = root->fs_info;
-       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-       int ret;
-       bool system_chunk = (root == fs_info->chunk_root);
-
-       ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
-                                      orig_bytes, flush, system_chunk);
-       if (ret == -ENOSPC &&
-           unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
-               if (block_rsv != global_rsv &&
-                   !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
-                       ret = 0;
-       }
-       if (ret == -ENOSPC) {
-               trace_btrfs_space_reservation(fs_info, "space_info:enospc",
-                                             block_rsv->space_info->flags,
-                                             orig_bytes, 1);
-
-               if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
-                       btrfs_dump_space_info(fs_info, block_rsv->space_info,
-                                             orig_bytes, 0);
-       }
-       return ret;
-}
-
 static struct btrfs_block_rsv *get_block_rsv(
                                        const struct btrfs_trans_handle *trans,
                                        const struct btrfs_root *root)
@@ -5187,8 +4492,8 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
        if (!num_bytes)
                return 0;
 
-       ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
-                                    num_bytes, flush);
+       ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
+                                          num_bytes, flush);
        if (ret)
                return ret;
        block_rsv_add_bytes(block_rsv, num_bytes, 0);
@@ -5314,7 +4619,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
        if (num_bytes == 0)
                return 0;
 
-       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+       ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
        if (!ret)
                block_rsv_add_bytes(block_rsv, num_bytes, true);
 
@@ -5359,7 +4664,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
        if (!ret)
                return 0;
 
-       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+       ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
        if (!ret) {
                block_rsv_add_bytes(block_rsv, num_bytes, false);
                return 0;
@@ -5733,7 +5038,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
        ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
        if (ret)
                goto out_fail;
-       ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
+       ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
        if (ret)
                goto out_qgroup;
 
@@ -8102,8 +7407,8 @@ again:
                                "BTRFS: block rsv returned %d\n", ret);
        }
 try_reserve:
-       ret = reserve_metadata_bytes(root, block_rsv, blocksize,
-                                    BTRFS_RESERVE_NO_FLUSH);
+       ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
+                                          BTRFS_RESERVE_NO_FLUSH);
        if (!ret)
                return block_rsv;
        /*
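diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 41dfb1d4ea86893929ad5b4aa463cd6fcdd36cf1..1ac58d7e7790b59913cd3f68d4a39cee0c0b2a13 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c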
@@ -5,6 +5,9 @@
 #include "sysfs.h"
 #include "volumes.h"
 #include "free-space-cache.h"
+#include "ordered-data.h"
+#include "transaction.h"
+#include "math.h"
 
 u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
                          bool may_use_included)
@@ -401,3 +404,698 @@ again:
                goto again;
        up_read(&info->groups_sem);
 }
+
+/*
+ * Start writeback of dirty data so that reserved space can be released.
+ *
+ * @fs_info:  filesystem whose super block is written back
+ * @nr_pages: page count handed to writeback_inodes_sb_nr()
+ * @nr_items: count handed to btrfs_start_delalloc_roots() and
+ *            btrfs_wait_ordered_roots() on the fallback path
+ *
+ * The generic writeback path is only used when sb->s_umount can be taken for
+ * reading without blocking; otherwise delalloc is started directly.
+ */
+static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
+                                        unsigned long nr_pages, int nr_items)
+{
+       struct super_block *sb = fs_info->sb;
+
+       if (down_read_trylock(&sb->s_umount)) {
+               writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
+               up_read(&sb->s_umount);
+       } else {
+               /*
+                * We needn't worry the filesystem going from r/w to r/o though
+                * we don't acquire ->s_umount mutex, because the filesystem
+                * should guarantee the delalloc inodes list be empty after
+                * the filesystem is readonly(all dirty pages are written to
+                * the disk).
+                */
+               btrfs_start_delalloc_roots(fs_info, nr_items);
+               /* Only wait when the task holds no transaction handle. */
+               if (!current->journal_info)
+                       btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
+       }
+}
+
+/*
+ * Convert a byte count into an item count.
+ *
+ * Divides @to_reclaim by the value btrfs_calc_trans_metadata_size() reports
+ * for a single item and never returns less than 1.
+ */
+static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
+                                       u64 to_reclaim)
+{
+       u64 bytes;
+       u64 nr;
+
+       bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+       nr = div64_u64(to_reclaim, bytes);
+       /* Always ask for at least one item, even for tiny requests. */
+       if (!nr)
+               nr = 1;
+       return nr;
+}
+
+/* Bytes of delalloc that shrink_delalloc() targets per item to reclaim. */
+#define EXTENT_SIZE_PER_ITEM   SZ_256K
+
+/*
+ * shrink metadata reservation for delalloc
+ *
+ * @fs_info:      filesystem being flushed
+ * @to_reclaim:   bytes the caller wants back; converted to an item count and
+ *                then rescaled to items * EXTENT_SIZE_PER_ITEM
+ * @orig:         not referenced anywhere in this function
+ * @wait_ordered: wait for ordered extents between passes; forced on when
+ *                dio_bytes exceeds delalloc_bytes
+ *
+ * Makes at most three writeback passes and stops early once the metadata
+ * space_info has no tickets left or there is nothing more to flush.
+ */
+static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
+                           u64 orig, bool wait_ordered)
+{
+       struct btrfs_space_info *space_info;
+       struct btrfs_trans_handle *trans;
+       u64 delalloc_bytes;
+       u64 dio_bytes;
+       u64 async_pages;
+       u64 items;
+       long time_left;
+       unsigned long nr_pages;
+       int loops;
+
+       /* Calculate how much we need to flush for the space reservation. */
+       items = calc_reclaim_items_nr(fs_info, to_reclaim);
+       to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+
+       /* Non-NULL when the current task already holds a transaction handle. */
+       trans = (struct btrfs_trans_handle *)current->journal_info;
+       space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+       delalloc_bytes = percpu_counter_sum_positive(
+                                               &fs_info->delalloc_bytes);
+       dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
+       if (delalloc_bytes == 0 && dio_bytes == 0) {
+               /* Nothing to write back; never wait while in a transaction. */
+               if (trans)
+                       return;
+               if (wait_ordered)
+                       btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+               return;
+       }
+
+       /*
+        * If we are doing more ordered than delalloc we need to just wait on
+        * ordered extents, otherwise we'll waste time trying to flush delalloc
+        * that likely won't give us the space back we need.
+        */
+       if (dio_bytes > delalloc_bytes)
+               wait_ordered = true;
+
+       loops = 0;
+       while ((delalloc_bytes || dio_bytes) && loops < 3) {
+               nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+
+               /*
+                * Triggers inode writeback for up to nr_pages. This will invoke
+                * ->writepages callback and trigger delalloc filling
+                *  (btrfs_run_delalloc_range()).
+                */
+               btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+
+               /*
+                * We need to wait for the compressed pages to start before
+                * we continue.
+                */
+               async_pages = atomic_read(&fs_info->async_delalloc_pages);
+               if (!async_pages)
+                       goto skip_async;
+
+               /*
+                * Calculate how many compressed pages we want to be written
+                * before we continue. I.e if there are more async pages than we
+                * require wait_event will wait until nr_pages are written.
+                */
+               if (async_pages <= nr_pages)
+                       async_pages = 0;
+               else
+                       async_pages -= nr_pages;
+
+               wait_event(fs_info->async_submit_wait,
+                          atomic_read(&fs_info->async_delalloc_pages) <=
+                          (int)async_pages);
+skip_async:
+               /* Nobody is waiting for space any more, so stop flushing. */
+               spin_lock(&space_info->lock);
+               if (list_empty(&space_info->tickets) &&
+                   list_empty(&space_info->priority_tickets)) {
+                       spin_unlock(&space_info->lock);
+                       break;
+               }
+               spin_unlock(&space_info->lock);
+
+               loops++;
+               if (wait_ordered && !trans) {
+                       btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+               } else {
+                       /*
+                        * A non-zero remainder means the sleep ended before
+                        * the timeout elapsed; give up in that case.
+                        */
+                       time_left = schedule_timeout_killable(1);
+                       if (time_left)
+                               break;
+               }
+               /* Re-sample the counters for the next loop condition check. */
+               delalloc_bytes = percpu_counter_sum_positive(
+                                               &fs_info->delalloc_bytes);
+               dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
+       }
+}
+
+/**
+ * may_commit_transaction - possibly commit the transaction if its ok to
+ * @fs_info - the filesystem we're allocating for
+ * @space_info - the space info whose first waiting ticket we try to satisfy
+ *
+ * This will check to make sure that committing the transaction will actually
+ * get us somewhere and then commit the transaction if it does.  Otherwise it
+ * will return -ENOSPC.
+ *
+ * Returns -EAGAIN without doing anything if the caller already holds a
+ * transaction handle, 0 if the ticket at the head of the lists has no bytes
+ * outstanding (or there is no ticket), the error from
+ * btrfs_join_transaction() if joining fails, and otherwise either the result
+ * of btrfs_commit_transaction() or -ENOSPC.
+ */
+static int may_commit_transaction(struct btrfs_fs_info *fs_info,
+                                 struct btrfs_space_info *space_info)
+{
+       struct reserve_ticket *ticket = NULL;
+       struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
+       struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+       struct btrfs_trans_handle *trans;
+       u64 bytes_needed;
+       u64 reclaim_bytes = 0;
+
+       trans = (struct btrfs_trans_handle *)current->journal_info;
+       if (trans)
+               return -EAGAIN;
+
+       /* Priority tickets are looked at before normal tickets. */
+       spin_lock(&space_info->lock);
+       if (!list_empty(&space_info->priority_tickets))
+               ticket = list_first_entry(&space_info->priority_tickets,
+                                         struct reserve_ticket, list);
+       else if (!list_empty(&space_info->tickets))
+               ticket = list_first_entry(&space_info->tickets,
+                                         struct reserve_ticket, list);
+       bytes_needed = (ticket) ? ticket->bytes : 0;
+       spin_unlock(&space_info->lock);
+
+       if (!bytes_needed)
+               return 0;
+
+       trans = btrfs_join_transaction(fs_info->extent_root);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
+
+       /*
+        * See if there is enough pinned space to make this reservation, or if
+        * we have block groups that are going to be freed, allowing us to
+        * possibly do a chunk allocation the next loop through.
+        */
+       if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
+           __percpu_counter_compare(&space_info->total_bytes_pinned,
+                                    bytes_needed,
+                                    BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
+               goto commit;
+
+       /*
+        * See if there is some space in the delayed insertion reservation for
+        * this reservation.
+        */
+       if (space_info != delayed_rsv->space_info)
+               goto enospc;
+
+       spin_lock(&delayed_rsv->lock);
+       reclaim_bytes += delayed_rsv->reserved;
+       spin_unlock(&delayed_rsv->lock);
+
+       spin_lock(&delayed_refs_rsv->lock);
+       reclaim_bytes += delayed_refs_rsv->reserved;
+       spin_unlock(&delayed_refs_rsv->lock);
+       if (reclaim_bytes >= bytes_needed)
+               goto commit;
+       bytes_needed -= reclaim_bytes;
+
+       /* Pinned space has to cover whatever the two reserves cannot. */
+       if (__percpu_counter_compare(&space_info->total_bytes_pinned,
+                                  bytes_needed,
+                                  BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
+               goto enospc;
+
+commit:
+       return btrfs_commit_transaction(trans);
+enospc:
+       btrfs_end_transaction(trans);
+       return -ENOSPC;
+}
+
+/*
+ * Try to flush some data based on policy set by @state. This is only advisory
+ * and may fail for various reasons. The caller is supposed to examine the
+ * state of @space_info to detect the outcome.
+ *
+ * @fs_info:    filesystem to flush
+ * @space_info: space info being reclaimed for; handed to the commit step and
+ *              its flags are reported in the tracepoint
+ * @num_bytes:  amount the caller wants reclaimed; scales the work for the
+ *              *_NR and delalloc states
+ * @state:      which flush action to take
+ *
+ * The per-state result is only passed to trace_btrfs_flush_space(); nothing is
+ * returned to the caller.
+ */
+static void flush_space(struct btrfs_fs_info *fs_info,
+                      struct btrfs_space_info *space_info, u64 num_bytes,
+                      int state)
+{
+       struct btrfs_root *root = fs_info->extent_root;
+       struct btrfs_trans_handle *trans;
+       int nr;
+       int ret = 0;
+
+       switch (state) {
+       case FLUSH_DELAYED_ITEMS_NR:
+       case FLUSH_DELAYED_ITEMS:
+               /*
+                * NOTE(review): -1 presumably means "no limit" to
+                * btrfs_run_delayed_items_nr() - confirm against its
+                * definition.
+                */
+               if (state == FLUSH_DELAYED_ITEMS_NR)
+                       nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
+               else
+                       nr = -1;
+
+               trans = btrfs_join_transaction(root);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       break;
+               }
+               ret = btrfs_run_delayed_items_nr(trans, nr);
+               btrfs_end_transaction(trans);
+               break;
+       case FLUSH_DELALLOC:
+       case FLUSH_DELALLOC_WAIT:
+               /* Ask for twice the requested amount; only _WAIT waits. */
+               shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
+                               state == FLUSH_DELALLOC_WAIT);
+               break;
+       case FLUSH_DELAYED_REFS_NR:
+       case FLUSH_DELAYED_REFS:
+               trans = btrfs_join_transaction(root);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       break;
+               }
+               /*
+                * NOTE(review): the meaning of nr == 0 is defined by
+                * btrfs_run_delayed_refs(), which is not visible here.
+                * Its return value is not recorded in ret.
+                */
+               if (state == FLUSH_DELAYED_REFS_NR)
+                       nr = calc_reclaim_items_nr(fs_info, num_bytes);
+               else
+                       nr = 0;
+               btrfs_run_delayed_refs(trans, nr);
+               btrfs_end_transaction(trans);
+               break;
+       case ALLOC_CHUNK:
+       case ALLOC_CHUNK_FORCE:
+               trans = btrfs_join_transaction(root);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       break;
+               }
+               ret = btrfs_chunk_alloc(trans,
+                               btrfs_metadata_alloc_profile(fs_info),
+                               (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
+                                       CHUNK_ALLOC_FORCE);
+               btrfs_end_transaction(trans);
+               /* Positive results and -ENOSPC are traced as success. */
+               if (ret > 0 || ret == -ENOSPC)
+                       ret = 0;
+               break;
+       case COMMIT_TRANS:
+               /*
+                * If we have pending delayed iputs then we could free up a
+                * bunch of pinned space, so make sure we run the iputs before
+                * we do our pinned bytes check below.
+                */
+               btrfs_run_delayed_iputs(fs_info);
+               btrfs_wait_on_delayed_iputs(fs_info);
+
+               ret = may_commit_transaction(fs_info, space_info);
+               break;
+       default:
+               ret = -ENOSPC;
+               break;
+       }
+
+       trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
+                               ret);
+       return;
+}
+
+/*
+ * Work out how many bytes a flusher should try to reclaim from @space_info.
+ *
+ * If any tickets are queued the answer is the sum of their outstanding bytes.
+ * Otherwise the result is 0 when a default-sized request could still be
+ * overcommitted, or else the amount by which current usage exceeds a target
+ * derived from total_bytes, capped at bytes_may_use + bytes_reserved.
+ *
+ * The callers in this file invoke it with space_info->lock held.
+ */
+static inline u64
+btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
+                                struct btrfs_space_info *space_info,
+                                bool system_chunk)
+{
+       struct reserve_ticket *ticket;
+       u64 used;
+       u64 expected;
+       u64 to_reclaim = 0;
+
+       list_for_each_entry(ticket, &space_info->tickets, list)
+               to_reclaim += ticket->bytes;
+       list_for_each_entry(ticket, &space_info->priority_tickets, list)
+               to_reclaim += ticket->bytes;
+       if (to_reclaim)
+               return to_reclaim;
+
+       /* No tickets: probe with 1M per online CPU, capped at 16M. */
+       to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
+       if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
+                                BTRFS_RESERVE_FLUSH_ALL, system_chunk))
+               return 0;
+
+       used = btrfs_space_info_used(space_info, true);
+
+       /*
+        * NOTE(review): div_factor_fine(x, 95) presumably yields 95% of x -
+        * confirm.  The looser target applies while 1M can be overcommitted.
+        */
+       if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
+                                BTRFS_RESERVE_FLUSH_ALL, system_chunk))
+               expected = div_factor_fine(space_info->total_bytes, 95);
+       else
+               expected = div_factor_fine(space_info->total_bytes, 90);
+
+       if (used > expected)
+               to_reclaim = used - expected;
+       else
+               to_reclaim = 0;
+       /* Never ask for more than is currently reserved or may be used. */
+       to_reclaim = min(to_reclaim, space_info->bytes_may_use +
+                                    space_info->bytes_reserved);
+       return to_reclaim;
+}
+
+/*
+ * Decide whether the async reclaim worker should be started pre-emptively.
+ *
+ * @used: usage figure supplied by the caller and compared to the threshold
+ *
+ * Returns non-zero only if bytes_used + bytes_reserved is below the threshold,
+ * btrfs_calc_reclaim_metadata_size() reports something to reclaim, @used has
+ * reached the threshold, and the filesystem is neither closing nor being
+ * remounted.
+ */
+static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
+                                       struct btrfs_space_info *space_info,
+                                       u64 used, bool system_chunk)
+{
+       /* NOTE(review): presumably 98% of total_bytes - confirm helper. */
+       u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+
+       /* If we're just plain full then async reclaim just slows us down. */
+       if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
+               return 0;
+
+       if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+                                             system_chunk))
+               return 0;
+
+       return (used >= thresh && !btrfs_fs_closing(fs_info) &&
+               !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
+}
+
+/*
+ * Fail the tickets queued on @head with -ENOSPC and wake their waiters.
+ *
+ * Tickets are removed from the front of the list one at a time.  The walk
+ * stops and returns true right after failing a ticket whose bytes differ from
+ * its orig_bytes, leaving any later tickets queued.  Returns false once the
+ * list has been emptied without meeting such a ticket.
+ */
+static bool wake_all_tickets(struct list_head *head)
+{
+       struct reserve_ticket *ticket;
+
+       while (!list_empty(head)) {
+               ticket = list_first_entry(head, struct reserve_ticket, list);
+               list_del_init(&ticket->list);
+               ticket->error = -ENOSPC;
+               wake_up(&ticket->wait);
+               /* This ticket had been partly filled; report it to caller. */
+               if (ticket->bytes != ticket->orig_bytes)
+                       return true;
+       }
+       return false;
+}
+
+/*
+ * This is for normal flushers, we can wait all goddamned day if we want to.  We
+ * will loop and continuously try to flush as long as we are making progress.
+ * We count progress as clearing off tickets each time we have to loop.
+ *
+ * @work: the async_reclaim_work member embedded in struct btrfs_fs_info
+ *
+ * Progress is detected by comparing space_info->tickets_id with the value
+ * seen on the previous pass.  space_info->flush is cleared on the exits where
+ * nothing is left to reclaim, the ticket list is empty, or the remaining
+ * tickets were all failed.
+ */
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+       struct btrfs_fs_info *fs_info;
+       struct btrfs_space_info *space_info;
+       u64 to_reclaim;
+       int flush_state;
+       int commit_cycles = 0;
+       u64 last_tickets_id;
+
+       fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+       space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+       spin_lock(&space_info->lock);
+       to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+                                                     false);
+       if (!to_reclaim) {
+               space_info->flush = 0;
+               spin_unlock(&space_info->lock);
+               return;
+       }
+       last_tickets_id = space_info->tickets_id;
+       spin_unlock(&space_info->lock);
+
+       flush_state = FLUSH_DELAYED_ITEMS_NR;
+       do {
+               /* flush_space() is called without space_info->lock held. */
+               flush_space(fs_info, space_info, to_reclaim, flush_state);
+               spin_lock(&space_info->lock);
+               /* Only the normal ticket list is checked here. */
+               if (list_empty(&space_info->tickets)) {
+                       space_info->flush = 0;
+                       spin_unlock(&space_info->lock);
+                       return;
+               }
+               to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
+                                                             space_info,
+                                                             false);
+               /* No change in tickets_id: escalate to the next state. */
+               if (last_tickets_id == space_info->tickets_id) {
+                       flush_state++;
+               } else {
+                       last_tickets_id = space_info->tickets_id;
+                       flush_state = FLUSH_DELAYED_ITEMS_NR;
+                       if (commit_cycles)
+                               commit_cycles--;
+               }
+
+               /*
+                * We don't want to force a chunk allocation until we've tried
+                * pretty hard to reclaim space.  Think of the case where we
+                * freed up a bunch of space and so have a lot of pinned space
+                * to reclaim.  We would rather use that than possibly create a
+                * underutilized metadata chunk.  So if this is our first run
+                * through the flushing state machine skip ALLOC_CHUNK_FORCE and
+                * commit the transaction.  If nothing has changed the next go
+                * around then we can force a chunk allocation.
+                */
+               if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
+                       flush_state++;
+
+               /*
+                * Ran past the last state.  Start over, unless more than two
+                * full cycles have gone by, in which case fail the waiters.
+                * If wake_all_tickets() returns false, flush_state stays past
+                * COMMIT_TRANS and the loop ends.
+                */
+               if (flush_state > COMMIT_TRANS) {
+                       commit_cycles++;
+                       if (commit_cycles > 2) {
+                               if (wake_all_tickets(&space_info->tickets)) {
+                                       flush_state = FLUSH_DELAYED_ITEMS_NR;
+                                       commit_cycles--;
+                               } else {
+                                       space_info->flush = 0;
+                               }
+                       } else {
+                               flush_state = FLUSH_DELAYED_ITEMS_NR;
+                       }
+               }
+               spin_unlock(&space_info->lock);
+       } while (flush_state <= COMMIT_TRANS);
+}
+
+/*
+ * Initialise @work so that running it executes
+ * btrfs_async_reclaim_metadata_space().
+ */
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+       INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
+/*
+ * The flush states that priority_reclaim_metadata_space() runs, in order.
+ */
+static const enum btrfs_flush_state priority_flush_states[] = {
+       FLUSH_DELAYED_ITEMS_NR,
+       FLUSH_DELAYED_ITEMS,
+       ALLOC_CHUNK,
+};
+
+/*
+ * Flush on behalf of a priority ticket, in the caller's own context.
+ *
+ * Runs each state in priority_flush_states[] once, in order, and returns as
+ * soon as @ticket has no bytes outstanding.  Does nothing if
+ * btrfs_calc_reclaim_metadata_size() reports nothing to reclaim.  Nothing is
+ * returned; the caller inspects ticket->bytes afterwards.
+ */
+static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
+                                           struct btrfs_space_info *space_info,
+                                           struct reserve_ticket *ticket)
+{
+       u64 to_reclaim;
+       int flush_state;
+
+       spin_lock(&space_info->lock);
+       to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+                                                     false);
+       if (!to_reclaim) {
+               spin_unlock(&space_info->lock);
+               return;
+       }
+       spin_unlock(&space_info->lock);
+
+       /* Here flush_state is an index into priority_flush_states[]. */
+       flush_state = 0;
+       do {
+               flush_space(fs_info, space_info, to_reclaim,
+                           priority_flush_states[flush_state]);
+               flush_state++;
+               spin_lock(&space_info->lock);
+               if (ticket->bytes == 0) {
+                       spin_unlock(&space_info->lock);
+                       return;
+               }
+               spin_unlock(&space_info->lock);
+       } while (flush_state < ARRAY_SIZE(priority_flush_states));
+}
+
+/*
+ * Sleep until @ticket is either fully satisfied or failed.
+ *
+ * Waits killably while ticket->bytes is non-zero and ticket->error is unset,
+ * dropping space_info->lock around each schedule().  On the way out the ticket
+ * is unlinked if still queued, and if it was only partly filled the bytes it
+ * did receive (orig_bytes - bytes) are handed to
+ * btrfs_space_info_add_old_bytes().
+ *
+ * Returns -EINTR if the wait was interrupted, otherwise ticket->error (0 when
+ * the ticket was satisfied).
+ */
+static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
+                              struct btrfs_space_info *space_info,
+                              struct reserve_ticket *ticket)
+
+{
+       DEFINE_WAIT(wait);
+       u64 reclaim_bytes = 0;
+       int ret = 0;
+
+       spin_lock(&space_info->lock);
+       while (ticket->bytes > 0 && ticket->error == 0) {
+               ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
+               if (ret) {
+                       ret = -EINTR;
+                       break;
+               }
+               spin_unlock(&space_info->lock);
+
+               schedule();
+
+               finish_wait(&ticket->wait, &wait);
+               spin_lock(&space_info->lock);
+       }
+       if (!ret)
+               ret = ticket->error;
+       if (!list_empty(&ticket->list))
+               list_del_init(&ticket->list);
+       /* Partly filled ticket: remember what it got so it can be returned. */
+       if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
+               reclaim_bytes = ticket->orig_bytes - ticket->bytes;
+       spin_unlock(&space_info->lock);
+
+       if (reclaim_bytes)
+               btrfs_space_info_add_old_bytes(fs_info, space_info,
+                                              reclaim_bytes);
+       return ret;
+}
+
+/**
+ * __reserve_metadata_bytes - try to reserve bytes from a space info
+ * @fs_info - the filesystem we're allocating for
+ * @space_info - the space info we want to allocate from
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ * @system_chunk - passed through to the overcommit and async reclaim checks
+ *
+ * This will reserve orig_bytes number of bytes from @space_info.  If there is
+ * not enough space it will make an attempt to flush out space to make room.
+ * It will do this by flushing delalloc if possible or committing the
+ * transaction.  If flush is BTRFS_RESERVE_NO_FLUSH then no attempts to regain
+ * reservations will be made and this will fail if there is not enough space
+ * already.
+ *
+ * Returns 0 on success, -ENOSPC if the space could not be reserved, or the
+ * result of wait_reserve_ticket() for BTRFS_RESERVE_FLUSH_ALL requests that
+ * had to queue a ticket.
+ */
+static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
+                                   struct btrfs_space_info *space_info,
+                                   u64 orig_bytes,
+                                   enum btrfs_reserve_flush_enum flush,
+                                   bool system_chunk)
+{
+       /* Lives on this stack frame; must be off every list before return. */
+       struct reserve_ticket ticket;
+       u64 used;
+       u64 reclaim_bytes = 0;
+       int ret = 0;
+
+       ASSERT(orig_bytes);
+       ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+
+       spin_lock(&space_info->lock);
+       ret = -ENOSPC;
+       used = btrfs_space_info_used(space_info, true);
+
+       /*
+        * If we have enough space then hooray, make our reservation and carry
+        * on.  If not see if we can overcommit, and if we can, hooray carry on.
+        * If not things get more complicated.
+        */
+       if (used + orig_bytes <= space_info->total_bytes) {
+               btrfs_space_info_update_bytes_may_use(fs_info, space_info,
+                                                     orig_bytes);
+               trace_btrfs_space_reservation(fs_info, "space_info",
+                                             space_info->flags, orig_bytes, 1);
+               ret = 0;
+       } else if (btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush,
+                                       system_chunk)) {
+               btrfs_space_info_update_bytes_may_use(fs_info, space_info,
+                                                     orig_bytes);
+               trace_btrfs_space_reservation(fs_info, "space_info",
+                                             space_info->flags, orig_bytes, 1);
+               ret = 0;
+       }
+
+       /*
+        * If we couldn't make a reservation then setup our reservation ticket
+        * and kick the async worker if it's not already running.
+        *
+        * If we are a priority flusher then we just need to add our ticket to
+        * the list and we will do our own flushing further down.
+        */
+       if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+               ticket.orig_bytes = orig_bytes;
+               ticket.bytes = orig_bytes;
+               ticket.error = 0;
+               init_waitqueue_head(&ticket.wait);
+               if (flush == BTRFS_RESERVE_FLUSH_ALL) {
+                       list_add_tail(&ticket.list, &space_info->tickets);
+                       if (!space_info->flush) {
+                               space_info->flush = 1;
+                               trace_btrfs_trigger_flush(fs_info,
+                                                         space_info->flags,
+                                                         orig_bytes, flush,
+                                                         "enospc");
+                               queue_work(system_unbound_wq,
+                                          &fs_info->async_reclaim_work);
+                       }
+               } else {
+                       list_add_tail(&ticket.list,
+                                     &space_info->priority_tickets);
+               }
+       } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+               used += orig_bytes;
+               /*
+                * We will do the space reservation dance during log replay,
+                * which means we won't have fs_info->fs_root set, so don't do
+                * the async reclaim as we will panic.
+                */
+               if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
+                   need_do_async_reclaim(fs_info, space_info,
+                                         used, system_chunk) &&
+                   !work_busy(&fs_info->async_reclaim_work)) {
+                       trace_btrfs_trigger_flush(fs_info, space_info->flags,
+                                                 orig_bytes, flush, "preempt");
+                       queue_work(system_unbound_wq,
+                                  &fs_info->async_reclaim_work);
+               }
+       }
+       spin_unlock(&space_info->lock);
+       if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
+               return ret;
+
+       /* Normal flushers sleep until the async worker settles the ticket. */
+       if (flush == BTRFS_RESERVE_FLUSH_ALL)
+               return wait_reserve_ticket(fs_info, space_info, &ticket);
+
+       /* Priority flushers do the flushing themselves. */
+       ret = 0;
+       priority_reclaim_metadata_space(fs_info, space_info, &ticket);
+       spin_lock(&space_info->lock);
+       if (ticket.bytes) {
+               /* Give back whatever part of the request was filled. */
+               if (ticket.bytes < orig_bytes)
+                       reclaim_bytes = orig_bytes - ticket.bytes;
+               list_del_init(&ticket.list);
+               ret = -ENOSPC;
+       }
+       spin_unlock(&space_info->lock);
+
+       if (reclaim_bytes)
+               btrfs_space_info_add_old_bytes(fs_info, space_info,
+                                              reclaim_bytes);
+       ASSERT(list_empty(&ticket.list));
+       return ret;
+}
+
+/**
+ * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ *
+ * This will reserve orig_bytes number of bytes from the space info associated
+ * with the block_rsv.  If there is not enough space it will make an attempt to
+ * flush out space to make room.  It will do this by flushing delalloc if
+ * possible or committing the transaction.  If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ *
+ * If the reservation fails with -ENOSPC while @root is in the
+ * ORPHAN_CLEANUP_STARTED state and @block_rsv is not the global reserve, the
+ * bytes are taken from the global block reserve instead when it can supply
+ * them.  A final -ENOSPC is traced and, with the ENOSPC_DEBUG mount option,
+ * the space info is dumped.
+ */
+int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+                                struct btrfs_block_rsv *block_rsv,
+                                u64 orig_bytes,
+                                enum btrfs_reserve_flush_enum flush)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+       int ret;
+       /* Reservations made for the chunk root are flagged as system chunk. */
+       bool system_chunk = (root == fs_info->chunk_root);
+
+       ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
+                                      orig_bytes, flush, system_chunk);
+       if (ret == -ENOSPC &&
+           unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+               if (block_rsv != global_rsv &&
+                   !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
+                       ret = 0;
+       }
+       if (ret == -ENOSPC) {
+               trace_btrfs_space_reservation(fs_info, "space_info:enospc",
+                                             block_rsv->space_info->flags,
+                                             orig_bytes, 1);
+
+               if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+                       btrfs_dump_space_info(fs_info, block_rsv->space_info,
+                                             orig_bytes, 0);
+       }
+       return ret;
+}
index d758959d19d58a5a0e9e0c065bbe656aa92e4505..620d390cf6d2ad97dd04af7f95b947079dfec4b3 100644 (file)
@@ -129,5 +129,9 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
 void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
                           struct btrfs_space_info *info, u64 bytes,
                           int dump_block_groups);
+int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+                                struct btrfs_block_rsv *block_rsv,
+                                u64 orig_bytes,
+                                enum btrfs_reserve_flush_enum flush);
 
 #endif /* BTRFS_SPACE_INFO_H */