Btrfs: fix metadata inconsistencies after directory fsync

author Filipe Manana <fdmanana@suse.com>

Fri, 20 Mar 2015 17:19:46 +0000 (17:19 +0000)

committer Chris Mason <clm@fb.com>

Fri, 27 Mar 2015 00:56:23 +0000 (17:56 -0700)
author Filipe Manana <fdmanana@suse.com>
Fri, 20 Mar 2015 17:19:46 +0000 (17:19 +0000)
committer Chris Mason <clm@fb.com>
Fri, 27 Mar 2015 00:56:23 +0000 (17:56 -0700)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h

index de5e4f2adfeac9d07ba2539a8098b44cc9781580..0ef5cc13fae26f8899cad7ed30b59344c79120d4 100644 (file)
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,7 +66,11 @@ struct btrfs_inode {
          */
         struct btrfs_key location;
  
-       /* Lock for counters */
+       /*
+        * Lock for counters and all fields used to determine if the inode is in
+        * the log or not (last_trans, last_sub_trans, last_log_commit,
+        * logged_trans).
+        */
         spinlock_t lock;
  
         /* the extent_tree has caches of all the extent mappings to disk */
@@ -250,6 +254,9 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
  
  static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
  {
+       int ret = 0;
+
+       spin_lock(&BTRFS_I(inode)->lock);
         if (BTRFS_I(inode)->logged_trans == generation &&
             BTRFS_I(inode)->last_sub_trans <=
             BTRFS_I(inode)->last_log_commit &&
@@ -263,9 +270,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
                  */
                 smp_mb();
                 if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
-                       return 1;
+                       ret = 1;
         }
-       return 0;
+       spin_unlock(&BTRFS_I(inode)->lock);
+       return ret;
  }
  
  #define BTRFS_DIO_ORIG_BIO_SUBMITTED   0x1
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c

index 150db5e50c2dd7e0836c513af607fba6da65ef2a..fd105c172c8b42e3b889bd3fdf4e7e6665e81033 100644 (file)
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1811,7 +1811,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
          * otherwise subsequent syncs to a file that's been synced in this
          * transaction will appear to have already occured.
          */
+       spin_lock(&BTRFS_I(inode)->lock);
         BTRFS_I(inode)->last_sub_trans = root->log_transid;
+       spin_unlock(&BTRFS_I(inode)->lock);
         if (num_written > 0) {
                 err = generic_write_sync(file, pos, num_written);
                 if (err < 0)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h

index 937050a2b68edaf6bdd6027f23c6c6d37b8257d5..96b189b8898aa17dc2f9239d71b9964f8b70f58d 100644 (file)
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -136,9 +136,11 @@ struct btrfs_pending_snapshot {
  static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
                                               struct inode *inode)
  {
+       spin_lock(&BTRFS_I(inode)->lock);
         BTRFS_I(inode)->last_trans = trans->transaction->transid;
         BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
         BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+       spin_unlock(&BTRFS_I(inode)->lock);
  }
  
  int btrfs_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c

index 6c95159302dd2c5a45fa335af39f7bff3773ac9b..016c90fc85dbdaad7b7f566cb0b9acf83e78483d 100644 (file)
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -492,11 +492,19 @@ insert:
  
                 if (btrfs_inode_generation(eb, src_item) == 0) {
                         struct extent_buffer *dst_eb = path->nodes[0];
+                       const u64 ino_size = btrfs_inode_size(eb, src_item);
  
+                       /*
+                        * For regular files an ino_size == 0 is used only when
+                        * logging that an inode exists, as part of a directory
+                        * fsync, and the inode wasn't fsynced before. In this
+                        * case don't set the size of the inode in the fs/subvol
+                        * tree, otherwise we would be throwing valid data away.
+                        */
                         if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
-                           S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
+                           S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
+                           ino_size != 0) {
                                 struct btrfs_map_token token;
-                               u64 ino_size = btrfs_inode_size(eb, src_item);
  
                                 btrfs_init_map_token(&token);
                                 btrfs_set_token_inode_size(dst_eb, dst_item,
@@ -3124,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, struct inode *inode,
                           struct btrfs_path *path,
                           struct btrfs_path *dst_path, int key_type,
+                         struct btrfs_log_ctx *ctx,
                           u64 min_offset, u64 *last_offset_ret)
  {
         struct btrfs_key min_key;
@@ -3208,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                 src = path->nodes[0];
                 nritems = btrfs_header_nritems(src);
                 for (i = path->slots[0]; i < nritems; i++) {
+                       struct btrfs_dir_item *di;
+
                         btrfs_item_key_to_cpu(src, &min_key, i);
  
                         if (min_key.objectid != ino || min_key.type != key_type)
@@ -3218,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                                 err = ret;
                                 goto done;
                         }
+
+                       /*
+                        * We must make sure that when we log a directory entry,
+                        * the corresponding inode, after log replay, has a
+                        * matching link count. For example:
+                        *
+                        * touch foo
+                        * mkdir mydir
+                        * sync
+                        * ln foo mydir/bar
+                        * xfs_io -c "fsync" mydir
+                        * <crash>
+                        * <mount fs and log replay>
+                        *
+                        * Would result in a fsync log that, when replayed, left
+                        * our file inode with a link count of 1 while two
+                        * directory entries pointed to the same inode.
+                        * After removing one of the names, it would not be
+                        * possible to remove the other name (this always
+                        * resulted in stale file handle errors), nor to rmdir
+                        * the parent directory, since its i_size could never
+                        * decrement to the value BTRFS_EMPTY_DIR_SIZE,
+                        * resulting in -ENOTEMPTY errors.
+                        */
+                       di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+                       btrfs_dir_item_key_to_cpu(src, di, &tmp);
+                       if (ctx &&
+                           (btrfs_dir_transid(src, di) == trans->transid ||
+                            btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+                           tmp.type != BTRFS_ROOT_ITEM_KEY)
+                               ctx->log_new_dentries = true;
                 }
                 path->slots[0] = nritems;
  
@@ -3279,7 +3321,8 @@ done:
  static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, struct inode *inode,
                           struct btrfs_path *path,
-                         struct btrfs_path *dst_path)
+                         struct btrfs_path *dst_path,
+                         struct btrfs_log_ctx *ctx)
  {
         u64 min_key;
         u64 max_key;
@@ -3291,7 +3334,7 @@ again:
         max_key = 0;
         while (1) {
                 ret = log_dir_items(trans, root, inode, path,
-                                   dst_path, key_type, min_key,
+                                   dst_path, key_type, ctx, min_key,
                                     &max_key);
                 if (ret)
                         return ret;
@@ -4067,7 +4110,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
         if (ret < 0) {
                 return ret;
         } else if (ret > 0) {
-               *size_ret = i_size_read(inode);
+               *size_ret = 0;
         } else {
                 struct btrfs_inode_item *item;
  
@@ -4374,15 +4417,18 @@ log_extents:
         }
  
         if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
-               ret = log_directory_changes(trans, root, inode, path, dst_path);
+               ret = log_directory_changes(trans, root, inode, path, dst_path,
+                                           ctx);
                 if (ret) {
                         err = ret;
                         goto out_unlock;
                 }
         }
  
+       spin_lock(&BTRFS_I(inode)->lock);
         BTRFS_I(inode)->logged_trans = trans->transid;
         BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
+       spin_unlock(&BTRFS_I(inode)->lock);
  out_unlock:
         if (unlikely(err))
                 btrfs_put_logged_extents(&logged_list);
@@ -4469,6 +4515,181 @@ out:
         return ret;
  }
  
+struct btrfs_dir_list {
+       u64 ino;
+       struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory. See log_dir_items() for
+ * details about why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not lock their i_mutex, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ *        CPU0                                        CPU1
+ *        ----                                        ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ *                                            lock(sb_internal#2);
+ *                                            lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not locking i_mutex of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ *    that while logging the inode new references (names) are added or removed
+ *    from the inode, leaving the logged inode item with a link count that does
+ *    not match the number of logged inode reference items. This is fine because
+ *    at log replay time we compute the real number of links and correct the
+ *    link count in the inode item (see replay_one_buffer() and
+ *    link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
+ *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
+ *    has a size that doesn't match the sum of the lengths of all the logged
+ *    names. This does not result in a problem because if a dir_item key is
+ *    logged but its matching dir_index key is not logged, at log replay time we
+ *    don't use it to replay the respective name (see replay_one_name()). On the
+ *    other hand if only the dir_index key ends up being logged, the respective
+ *    name is added to the fs/subvol tree with both the dir_item and dir_index
+ *    keys created (see replay_one_name()).
+ *    The directory's inode item with a wrong i_size is not a problem either,
+ *    since we don't use it at log replay time to set the i_size in the inode
+ *    item of the fs/subvol tree (see overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct inode *start_inode,
+                               struct btrfs_log_ctx *ctx)
+{
+       struct btrfs_root *log = root->log_root;
+       struct btrfs_path *path;
+       LIST_HEAD(dir_list);
+       struct btrfs_dir_list *dir_elem;
+       int ret = 0;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+       if (!dir_elem) {
+               btrfs_free_path(path);
+               return -ENOMEM;
+       }
+       dir_elem->ino = btrfs_ino(start_inode);
+       list_add_tail(&dir_elem->list, &dir_list);
+
+       while (!list_empty(&dir_list)) {
+               struct extent_buffer *leaf;
+               struct btrfs_key min_key;
+               int nritems;
+               int i;
+
+               dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
+                                           list);
+               if (ret)
+                       goto next_dir_inode;
+
+               min_key.objectid = dir_elem->ino;
+               min_key.type = BTRFS_DIR_ITEM_KEY;
+               min_key.offset = 0;
+again:
+               btrfs_release_path(path);
+               ret = btrfs_search_forward(log, &min_key, path, trans->transid);
+               if (ret < 0) {
+                       goto next_dir_inode;
+               } else if (ret > 0) {
+                       ret = 0;
+                       goto next_dir_inode;
+               }
+
+process_leaf:
+               leaf = path->nodes[0];
+               nritems = btrfs_header_nritems(leaf);
+               for (i = path->slots[0]; i < nritems; i++) {
+                       struct btrfs_dir_item *di;
+                       struct btrfs_key di_key;
+                       struct inode *di_inode;
+                       struct btrfs_dir_list *new_dir_elem;
+                       int log_mode = LOG_INODE_EXISTS;
+                       int type;
+
+                       btrfs_item_key_to_cpu(leaf, &min_key, i);
+                       if (min_key.objectid != dir_elem->ino ||
+                           min_key.type != BTRFS_DIR_ITEM_KEY)
+                               goto next_dir_inode;
+
+                       di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+                       type = btrfs_dir_type(leaf, di);
+                       if (btrfs_dir_transid(leaf, di) < trans->transid &&
+                           type != BTRFS_FT_DIR)
+                               continue;
+                       btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+                       if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+                               continue;
+
+                       di_inode = btrfs_iget(root->fs_info->sb, &di_key,
+                                             root, NULL);
+                       if (IS_ERR(di_inode)) {
+                               ret = PTR_ERR(di_inode);
+                               goto next_dir_inode;
+                       }
+
+                       if (btrfs_inode_in_log(di_inode, trans->transid)) {
+                               iput(di_inode);
+                               continue;
+                       }
+
+                       ctx->log_new_dentries = false;
+                       if (type == BTRFS_FT_DIR)
+                               log_mode = LOG_INODE_ALL;
+                       btrfs_release_path(path);
+                       ret = btrfs_log_inode(trans, root, di_inode,
+                                             log_mode, 0, LLONG_MAX, ctx);
+                       iput(di_inode);
+                       if (ret)
+                               goto next_dir_inode;
+                       if (ctx->log_new_dentries) {
+                               new_dir_elem = kmalloc(sizeof(*new_dir_elem),
+                                                      GFP_NOFS);
+                               if (!new_dir_elem) {
+                                       ret = -ENOMEM;
+                                       goto next_dir_inode;
+                               }
+                               new_dir_elem->ino = di_key.objectid;
+                               list_add_tail(&new_dir_elem->list, &dir_list);
+                       }
+                       break;
+               }
+               if (i == nritems) {
+                       ret = btrfs_next_leaf(log, path);
+                       if (ret < 0) {
+                               goto next_dir_inode;
+                       } else if (ret > 0) {
+                               ret = 0;
+                               goto next_dir_inode;
+                       }
+                       goto process_leaf;
+               }
+               if (min_key.offset < (u64)-1) {
+                       min_key.offset++;
+                       goto again;
+               }
+next_dir_inode:
+               list_del(&dir_elem->list);
+               kfree(dir_elem);
+       }
+
+       btrfs_free_path(path);
+       return ret;
+}
+
  /*
   * helper function around btrfs_log_inode to make sure newly created
   * parent directories also end up in the log.  A minimal inode and backref
@@ -4491,6 +4712,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
         const struct dentry * const first_parent = parent;
         const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
                                  last_committed);
+       bool log_dentries = false;
+       struct inode *orig_inode = inode;
  
         sb = inode->i_sb;
  
@@ -4546,6 +4769,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                 goto end_trans;
         }
  
+       if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
+               log_dentries = true;
+
         while (1) {
                 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
                         break;
@@ -4582,7 +4808,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                 dput(old_parent);
                 old_parent = parent;
         }
-       ret = 0;
+       if (log_dentries)
+               ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+       else
+               ret = 0;
  end_trans:
         dput(old_parent);
         if (ret < 0) {
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h

index 154990c26dcbc9d63508228b9125182b7168b80e..6916a781ea02cbdff331b002949369f41113d2d1 100644 (file)
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -29,6 +29,7 @@ struct btrfs_log_ctx {
         int log_ret;
         int log_transid;
         int io_err;
+       bool log_new_dentries;
         struct list_head list;
  };
  
@@ -37,6 +38,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
         ctx->log_ret = 0;
         ctx->log_transid = 0;
         ctx->io_err = 0;
+       ctx->log_new_dentries = false;
         INIT_LIST_HEAD(&ctx->list);
  }
author	Filipe Manana <fdmanana@suse.com>
	Fri, 20 Mar 2015 17:19:46 +0000 (17:19 +0000)
committer	Chris Mason <clm@fb.com>
	Fri, 27 Mar 2015 00:56:23 +0000 (17:56 -0700)
fs/btrfs/btrfs_inode.h		patch \| blob \| history
fs/btrfs/file.c		patch \| blob \| history
fs/btrfs/transaction.h		patch \| blob \| history
fs/btrfs/tree-log.c		patch \| blob \| history
fs/btrfs/tree-log.h		patch \| blob \| history