#include "locking.h"
#include "compat.h"
+/*
+ * when auto defrag is enabled we
+ * queue up these defrag structs to remember which
+ * inodes need defragging passes
+ *
+ * Each record is linked through rb_node into the
+ * fs_info->defrag_inodes rbtree, which is ordered by ino only.
+ */
+struct inode_defrag {
+ struct rb_node rb_node;
+ /* objectid of the inode; this is the rbtree sort key */
+ u64 ino;
+ /*
+ * transid where the defrag was added, we search for
+ * extents newer than this
+ */
+ u64 transid;
+
+ /* objectid of the root the inode belongs to, used to look the root up again */
+ u64 root;
+
+ /* last offset we were able to defrag; the next pass starts from here */
+ u64 last_offset;
+
+ /* set once we've wrapped around back to offset zero, so we only wrap once */
+ int cycled;
+};
+
+/*
+ * Insert a record for an inode into the defrag tree.  The caller
+ * must already hold the defrag_inodes lock.
+ *
+ * The tree is ordered by inode number.  When a record for the same
+ * inode number is already queued, that record is kept and merged
+ * with the new one: its transid is lowered to the older of the two
+ * and its last_offset is raised to the larger of the two.  The
+ * record passed in is then freed, so the caller must not touch
+ * @defrag after this returns.
+ *
+ * Always returns 0.
+ */
+static int __btrfs_add_inode_defrag(struct inode *inode,
+				    struct inode_defrag *defrag)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct rb_node **link = &root->fs_info->defrag_inodes.rb_node;
+	struct rb_node *parent = NULL;
+	struct inode_defrag *cur;
+
+	while (*link) {
+		parent = *link;
+		cur = rb_entry(parent, struct inode_defrag, rb_node);
+
+		if (defrag->ino < cur->ino) {
+			link = &parent->rb_left;
+			continue;
+		}
+		if (defrag->ino > cur->ino) {
+			link = &parent->rb_right;
+			continue;
+		}
+
+		/*
+		 * Already queued.  If we're reinserting an entry for
+		 * an old defrag run, make sure the existing record
+		 * covers it, then drop the duplicate.
+		 */
+		if (defrag->transid < cur->transid)
+			cur->transid = defrag->transid;
+		if (defrag->last_offset > cur->last_offset)
+			cur->last_offset = defrag->last_offset;
+		kfree(defrag);
+		return 0;
+	}
+
+	/* not found: hang the new record off the leaf position we ended on */
+	BTRFS_I(inode)->in_defrag = 1;
+	rb_link_node(&defrag->rb_node, parent, link);
+	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
+	return 0;
+}
+
+/*
+ * Queue a defrag record for this inode if auto defrag is enabled.
+ *
+ * Nothing is queued (and 0 is returned) when the AUTO_DEFRAG mount
+ * option is off, when the filesystem is closing, or when the inode is
+ * already marked in_defrag.
+ *
+ * @trans: transaction whose transid is recorded in the new record; may
+ *         be NULL, in which case the root's last_trans is used instead.
+ * @inode: inode to queue.
+ *
+ * Returns 0 on success (including the "nothing to do" cases) or
+ * -ENOMEM if the record could not be allocated.
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+			   struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct inode_defrag *defrag;
+	int ret = 0;
+	u64 transid;
+
+	if (!btrfs_test_opt(root, AUTO_DEFRAG))
+		return 0;
+
+	if (root->fs_info->closing)
+		return 0;
+
+	/* cheap unlocked check; repeated under the lock below */
+	if (BTRFS_I(inode)->in_defrag)
+		return 0;
+
+	if (trans)
+		transid = trans->transid;
+	else
+		transid = root->last_trans;
+
+	/* allocate before taking the spinlock; the rest is zeroed by kzalloc */
+	defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+	if (!defrag)
+		return -ENOMEM;
+
+	defrag->ino = inode->i_ino;
+	defrag->transid = transid;
+	defrag->root = root->root_key.objectid;
+
+	spin_lock(&root->fs_info->defrag_inodes_lock);
+	if (!BTRFS_I(inode)->in_defrag) {
+		/* the tree (or the merge path) now owns the record */
+		ret = __btrfs_add_inode_defrag(inode, defrag);
+	} else {
+		/*
+		 * somebody queued this inode between our unlocked check
+		 * and taking the lock; our record is not needed and
+		 * nobody else will free it
+		 */
+		kfree(defrag);
+	}
+	spin_unlock(&root->fs_info->defrag_inodes_lock);
+	return ret;
+}
+
+/*
+ * Look up the defrag record for inode number @ino.
+ *
+ * Must be called with the defrag_inodes lock held.
+ *
+ * Returns the matching record, or NULL when there is none.  In the
+ * NULL case, if @next is not NULL, *next is set to the rbtree node of
+ * the first record with an inode number larger than @ino, or to NULL
+ * when no such record exists.  *next is left untouched when a match
+ * is returned.
+ */
+struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
+					     struct rb_node **next)
+{
+	struct rb_node *node = info->defrag_inodes.rb_node;
+	struct rb_node *last = NULL;
+	struct inode_defrag *entry;
+
+	while (node) {
+		last = node;
+		entry = rb_entry(node, struct inode_defrag, rb_node);
+
+		if (ino == entry->ino)
+			return entry;
+
+		node = (ino < entry->ino) ? node->rb_left : node->rb_right;
+	}
+
+	if (!next)
+		return NULL;
+
+	/*
+	 * 'last' is the leaf where the search fell off the tree.  Step
+	 * forward from it until we reach a record that sorts after ino.
+	 */
+	while (last) {
+		entry = rb_entry(last, struct inode_defrag, rb_node);
+		if (ino <= entry->ino)
+			break;
+		last = rb_next(last);
+	}
+	*next = last;
+	return NULL;
+}
+
+/*
+ * Run through the list of inodes in the FS that need defragging and
+ * do one batch of defrag work on each of them.
+ *
+ * Records are taken off the defrag_inodes rbtree in increasing inode
+ * number order, wrapping around to the lowest inode number when the
+ * end of the tree is reached, until the tree is empty.  A record is
+ * put back on the tree when its batch was filled completely (there
+ * must be more work to do), or one extra time when the pass did not
+ * start at offset zero so that the start of the file gets covered.
+ *
+ * defrag_inodes_lock protects the rbtree.  It is dropped while the
+ * inode is looked up and defragged, and retaken around every tree
+ * update, including putting a record back on the tree.
+ *
+ * Once fs_info->closing is set, remaining records are only removed
+ * and freed.
+ *
+ * Always returns 0.
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+	struct inode_defrag *defrag;
+	struct btrfs_root *inode_root;
+	struct inode *inode;
+	struct rb_node *n;
+	struct btrfs_key key;
+	struct btrfs_ioctl_defrag_range_args range;
+	u64 first_ino = 0;
+	int num_defrag;
+	int defrag_batch = 1024;
+
+	memset(&range, 0, sizeof(range));
+	range.len = (u64)-1;
+
+	atomic_inc(&fs_info->defrag_running);
+	spin_lock(&fs_info->defrag_inodes_lock);
+	while (1) {
+		n = NULL;
+
+		/* find an inode to defrag */
+		defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
+		if (!defrag) {
+			if (n) {
+				defrag = rb_entry(n, struct inode_defrag,
+						  rb_node);
+			} else if (first_ino) {
+				/* ran off the end of the tree, wrap around */
+				first_ino = 0;
+				continue;
+			} else {
+				/* searched from zero and found nothing */
+				break;
+			}
+		}
+
+		/* remove it from the rbtree */
+		first_ino = defrag->ino + 1;
+		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
+
+		if (fs_info->closing)
+			goto next_free;
+
+		spin_unlock(&fs_info->defrag_inodes_lock);
+
+		/* get the inode */
+		key.objectid = defrag->root;
+		btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+		key.offset = (u64)-1;
+		inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+		if (IS_ERR(inode_root))
+			goto next;
+
+		key.objectid = defrag->ino;
+		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+		key.offset = 0;
+
+		inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+		if (IS_ERR(inode))
+			goto next;
+
+		/* do a chunk of defrag */
+		BTRFS_I(inode)->in_defrag = 0;
+		range.start = defrag->last_offset;
+		num_defrag = btrfs_defrag_file(inode, NULL, &range,
+					       defrag->transid, defrag_batch);
+		/*
+		 * if we filled the whole defrag batch, there
+		 * must be more work to do.  Queue this defrag
+		 * again
+		 */
+		if (num_defrag == defrag_batch) {
+			defrag->last_offset = range.start;
+			/*
+			 * the rbtree may only be changed with the lock
+			 * held, and we dropped it above
+			 */
+			spin_lock(&fs_info->defrag_inodes_lock);
+			__btrfs_add_inode_defrag(inode, defrag);
+			spin_unlock(&fs_info->defrag_inodes_lock);
+			/*
+			 * we don't want to kfree defrag, it is either
+			 * back on the rbtree or was already freed
+			 * because a record for this inode existed
+			 */
+			defrag = NULL;
+		} else if (defrag->last_offset && !defrag->cycled) {
+			/*
+			 * we didn't fill our defrag batch, but
+			 * we didn't start at zero.  Make sure we loop
+			 * around to the start of the file.
+			 */
+			defrag->last_offset = 0;
+			defrag->cycled = 1;
+			spin_lock(&fs_info->defrag_inodes_lock);
+			__btrfs_add_inode_defrag(inode, defrag);
+			spin_unlock(&fs_info->defrag_inodes_lock);
+			defrag = NULL;
+		}
+
+		/* drop our inode reference outside of the spinlock */
+		iput(inode);
+next:
+		spin_lock(&fs_info->defrag_inodes_lock);
+next_free:
+		/* defrag is NULL here when the record was handed back */
+		kfree(defrag);
+	}
+	spin_unlock(&fs_info->defrag_inodes_lock);
+
+	atomic_dec(&fs_info->defrag_running);
+
+	/*
+	 * during unmount, we use the transaction_wait queue to
+	 * wait for the defragger to stop
+	 */
+	wake_up(&fs_info->transaction_wait);
+	return 0;
+}
/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code.
return error;
}
+/*
+ * Decide whether the range starting at @offset is worth kicking off
+ * a defrag for, or whether it is really just waiting for delalloc to
+ * send it down.
+ *
+ * Returns 0 (skip this part of the file) when the cached extent map
+ * covering @offset extends more than @thresh bytes past it, or when
+ * at least @thresh / 2 bytes of delalloc are already counted in the
+ * range starting at @offset.  Returns 1 when the range should be
+ * defragged.
+ */
+static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map *em;
+	u64 end;
+	int half;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+	read_unlock(&em_tree->lock);
+
+	if (em) {
+		end = extent_map_end(em);
+		free_extent_map(em);
+		/* a nice big extent is already mapped here */
+		if (end - offset > thresh)
+			return 0;
+	}
+
+	/* if we already have a nice delalloc here, just stop */
+	half = thresh / 2;
+	end = count_range_bits(io_tree, &offset, offset + half,
+			       half, EXTENT_DELALLOC, 1);
+
+	return (end >= half) ? 0 : 1;
+}
+
+/*
+ * Walk the file extent items of @inode, starting at file offset *@off,
+ * looking for a regular extent that is newer than transid @newer_than,
+ * smaller than @thresh bytes, and that check_defrag_in_cache() says is
+ * worth defragging.
+ *
+ * This is used by the defragging code to find new and small extents.
+ *
+ * Returns 0 and stores the key offset of the extent in *@off when one
+ * is found, -ENOENT when there is none, or -ENOMEM if no path could be
+ * allocated.  *@off is only written on success.
+ */
+static int find_new_extents(struct btrfs_root *root,
+			    struct inode *inode, u64 newer_than,
+			    u64 *off, int thresh)
+{
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *eb;
+	struct btrfs_path *path;
+	struct btrfs_key lo;
+	struct btrfs_key hi;
+	int found = -ENOENT;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/* search window: every key of this inode from EXTENT_DATA/*off on */
+	lo.objectid = inode->i_ino;
+	lo.type = BTRFS_EXTENT_DATA_KEY;
+	lo.offset = *off;
+
+	hi.objectid = inode->i_ino;
+	hi.type = (u8)-1;
+	hi.offset = (u64)-1;
+
+	path->keep_locks = 1;
+
+	for (;;) {
+		if (btrfs_search_forward(root, &lo, &hi, path, 0,
+					 newer_than) != 0)
+			break;
+
+		/* the key we landed on must still be an extent of ours */
+		if (lo.objectid != inode->i_ino ||
+		    lo.type != BTRFS_EXTENT_DATA_KEY)
+			break;
+
+		eb = path->nodes[0];
+		fi = btrfs_item_ptr(eb, path->slots[0],
+				    struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_REG &&
+		    btrfs_file_extent_num_bytes(eb, fi) < thresh &&
+		    check_defrag_in_cache(inode, lo.offset, thresh)) {
+			*off = lo.offset;
+			found = 0;
+			break;
+		}
+
+		/* can't step past the largest possible offset */
+		if (lo.offset == (u64)-1)
+			break;
+
+		/* not a candidate, continue just after this key */
+		lo.offset++;
+		btrfs_release_path(path);
+	}
+
+	btrfs_free_path(path);
+	return found;
+}
+
static int should_defrag_range(struct inode *inode, u64 start, u64 len,
int thresh, u64 *last_len, u64 *skip,
u64 *defrag_end)
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
int ret = 1;
-
- if (thresh == 0)
- thresh = 256 * 1024;
-
/*
* make sure that once we start defragging and extent, we keep on
* defragging it
return ret;
}
-static int btrfs_defrag_file(struct file *file,
- struct btrfs_ioctl_defrag_range_args *range)
+/*
+ * it doesn't do much good to defrag one or two pages
+ * at a time. This pulls in a nice chunk of pages
+ * to COW and defrag.
+ *
+ * It also makes sure the delalloc code has enough
+ * dirty data to avoid making new small extents as part
+ * of the defrag
+ *
+ * It's a good idea to start RA on this range
+ * before calling this.
+ *
+ * pages is scratch space supplied by the caller and must have room
+ * for num_pages entries; start_index is the page cache index of the
+ * first page to work on.
+ *
+ * Delalloc space for num_pages pages is reserved up front. If fewer
+ * pages end up being used the rest of the reservation is released
+ * again, and on the bail-out path all of it is released.
+ *
+ * Every page stored in pages is unlocked and released before this
+ * returns.
+ *
+ * Returns the number of pages that were marked delalloc and dirtied,
+ * 0 when nothing was done (empty file, no usable page at start_index,
+ * or the super block is no longer MS_ACTIVE), or a negative errno
+ * (failure to reserve space, or -EIO when a page could not be brought
+ * up to date).
+ */
+static int cluster_pages_for_defrag(struct inode *inode,
+ struct page **pages,
+ unsigned long start_index,
+ int num_pages)
{
- struct inode *inode = fdentry(file)->d_inode;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ unsigned long file_end;
+ u64 isize = i_size_read(inode);
+ u64 page_start;
+ u64 page_end;
+ int ret;
+ int i;
+ int i_done;
struct btrfs_ordered_extent *ordered;
- struct page *page;
+ struct extent_state *cached_state = NULL;
+
+ if (isize == 0)
+ return 0;
+ file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+
+ ret = btrfs_delalloc_reserve_space(inode,
+ num_pages << PAGE_CACHE_SHIFT);
+ if (ret)
+ return ret;
+again:
+ ret = 0;
+ i_done = 0;
+
+ /* step one, lock all the pages */
+ for (i = 0; i < num_pages; i++) {
+ struct page *page;
+ page = grab_cache_page(inode->i_mapping,
+ start_index + i);
+ if (!page)
+ break;
+
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ ret = -EIO;
+ break;
+ }
+ }
+ isize = i_size_read(inode);
+ file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (!isize || page->index > file_end ||
+ page->mapping != inode->i_mapping) {
+ /* whoops, we blew past eof, skip this page */
+ unlock_page(page);
+ page_cache_release(page);
+ break;
+ }
+ pages[i] = page;
+ i_done++;
+ }
+ if (!i_done || ret)
+ goto out;
+
+ if (!(inode->i_sb->s_flags & MS_ACTIVE))
+ goto out;
+
+ /*
+ * so now we have a nice long stream of locked
+ * and up to date pages, lets wait on them
+ */
+ for (i = 0; i < i_done; i++)
+ wait_on_page_writeback(pages[i]);
+
+ page_start = page_offset(pages[0]);
+ page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree,
+ page_start, page_end - 1, 0, &cached_state,
+ GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1);
+ if (ordered &&
+ ordered->file_offset + ordered->len > page_start &&
+ ordered->file_offset < page_end) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ page_start, page_end - 1,
+ &cached_state, GFP_NOFS);
+ for (i = 0; i < i_done; i++) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ btrfs_wait_ordered_range(inode, page_start,
+ page_end - page_start);
+ goto again;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
+ page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
+ GFP_NOFS);
+
+ if (i_done != num_pages) {
+ atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+ btrfs_delalloc_release_space(inode,
+ (num_pages - i_done) << PAGE_CACHE_SHIFT);
+ }
+
+
+ btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
+ &cached_state);
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ page_start, page_end - 1, &cached_state,
+ GFP_NOFS);
+
+ for (i = 0; i < i_done; i++) {
+ clear_page_dirty_for_io(pages[i]);
+ ClearPageChecked(pages[i]);
+ set_page_extent_mapped(pages[i]);
+ set_page_dirty(pages[i]);
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ return i_done;
+out:
+ for (i = 0; i < i_done; i++) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT);
+ return ret;
+
+}
+
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_to_defrag)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_super_block *disk_super;
+ struct file_ra_state *ra = NULL;
unsigned long last_index;
- unsigned long ra_pages = root->fs_info->bdi.ra_pages;
- unsigned long total_read = 0;
u64 features;
- u64 page_start;
- u64 page_end;
u64 last_len = 0;
u64 skip = 0;
u64 defrag_end = 0;
+ u64 newer_off = range->start;
+ int newer_left = 0;
unsigned long i;
int ret;
+ int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int extent_thresh = range->extent_thresh;
+ int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ u64 new_align = ~((u64)128 * 1024 - 1);
+ struct page **pages = NULL;
+
+ if (extent_thresh == 0)
+ extent_thresh = 256 * 1024;
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
if (range->compress_type > BTRFS_COMPRESS_TYPES)
if (inode->i_size == 0)
return 0;
+ /*
+ * if we were not given a file, allocate a readahead
+ * context
+ */
+ if (!file) {
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -ENOMEM;
+ file_ra_state_init(ra, inode->i_mapping);
+ } else {
+ ra = &file->f_ra;
+ }
+
+ pages = kmalloc(sizeof(struct page *) * newer_cluster,
+ GFP_NOFS);
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out_ra;
+ }
+
+ /* find the last page to defrag */
if (range->start + range->len > range->start) {
last_index = min_t(u64, inode->i_size - 1,
range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
}
- i = range->start >> PAGE_CACHE_SHIFT;
- while (i <= last_index) {
- if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+ if (newer_than) {
+ ret = find_new_extents(root, inode, newer_than,
+ &newer_off, 64 * 1024);
+ if (!ret) {
+ range->start = newer_off;
+ /*
+ * we always align our defrag to help keep
+ * the extents in the file evenly spaced
+ */
+ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+ newer_left = newer_cluster;
+ } else
+ goto out_ra;
+ } else {
+ i = range->start >> PAGE_CACHE_SHIFT;
+ }
+ if (!max_to_defrag)
+ max_to_defrag = last_index - 1;
+
+ while (i <= last_index && defrag_count < max_to_defrag) {
+ /*
+ * make sure we stop running if someone unmounts
+ * the FS
+ */
+ if (!(inode->i_sb->s_flags & MS_ACTIVE))
+ break;
+
+ if (!newer_than &&
+ !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
PAGE_CACHE_SIZE,
- range->extent_thresh,
+ extent_thresh,
&last_len, &skip,
&defrag_end)) {
unsigned long next;
i = max(i + 1, next);
continue;
}
-
- if (total_read % ra_pages == 0) {
- btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
- min(last_index, i + ra_pages - 1));
- }
- total_read++;
- mutex_lock(&inode->i_mutex);
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
BTRFS_I(inode)->force_compress = compress_type;
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
- if (ret)
- goto err_unlock;
-again:
- if (inode->i_size == 0 ||
- i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
- ret = 0;
- goto err_reservations;
- }
+ btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);
- page = grab_cache_page(inode->i_mapping, i);
- if (!page) {
- ret = -ENOMEM;
- goto err_reservations;
- }
-
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- page_cache_release(page);
- ret = -EIO;
- goto err_reservations;
- }
- }
-
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- page_cache_release(page);
- goto again;
- }
-
- wait_on_page_writeback(page);
+ ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
+ if (ret < 0)
+ goto out_ra;
- if (PageDirty(page)) {
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
- goto loop_unlock;
- }
+ defrag_count += ret;
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+ i += ret;
- page_start = (u64)page->index << PAGE_CACHE_SHIFT;
- page_end = page_start + PAGE_CACHE_SIZE - 1;
- lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ if (newer_than) {
+ if (newer_off == (u64)-1)
+ break;
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
- if (ordered) {
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
- unlock_page(page);
- page_cache_release(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- goto again;
+ newer_off = max(newer_off + 1,
+ (u64)i << PAGE_CACHE_SHIFT);
+
+ ret = find_new_extents(root, inode,
+ newer_than, &newer_off,
+ 64 * 1024);
+ if (!ret) {
+ range->start = newer_off;
+ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+ newer_left = newer_cluster;
+ } else {
+ break;
+ }
+ } else {
+ i++;
}
- set_page_extent_mapped(page);
-
- /*
- * this makes sure page_mkwrite is called on the
- * page if it is dirtied again later
- */
- clear_page_dirty_for_io(page);
- clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
- page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, GFP_NOFS);
-
- btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
- ClearPageChecked(page);
- set_page_dirty(page);
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
-loop_unlock:
- unlock_page(page);
- page_cache_release(page);
- mutex_unlock(&inode->i_mutex);
-
- balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
- i++;
}
if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
btrfs_set_super_incompat_flags(disk_super, features);
}
- return 0;
+ if (!file)
+ kfree(ra);
+ return defrag_count;
-err_reservations:
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
-err_unlock:
- mutex_unlock(&inode->i_mutex);
+out_ra:
+ if (!file)
+ kfree(ra);
+ kfree(pages);
return ret;
}
/* the rest are all set to zero by kzalloc */
range->len = (u64)-1;
}
- ret = btrfs_defrag_file(file, range);
+ ret = btrfs_defrag_file(fdentry(file)->d_inode, file,
+ range, 0, 0);
+ if (ret > 0)
+ ret = 0;
kfree(range);
break;
default: